...

Source file src/github.com/playwright-community/playwright-go/examples/parallel-scraping/main.go

Documentation: github.com/playwright-community/playwright-go/examples/parallel-scraping

     1  //go:build ignore
     2  // +build ignore
     3  
     4  package main
     5  
     6  import (
     7  	"archive/zip"
     8  	"bytes"
     9  	"context"
    10  	"encoding/csv"
    11  	"fmt"
    12  	"io"
    13  	"io/ioutil"
    14  	"log"
    15  	"math"
    16  	"net/http"
    17  	"os"
    18  	"path/filepath"
    19  	"strings"
    20  	"time"
    21  
    22  	"github.com/playwright-community/playwright-go"
    23  )
    24  
    25  func assertErrorToNilf(message string, err error) {
    26  	if err != nil {
    27  		log.Fatalf(message, err)
    28  	}
    29  }
    30  
    31  func worker(id int, jobs chan Job, results chan<- Job, browser playwright.Browser) {
    32  	for job := range jobs {
    33  		fmt.Printf("starting (try: %d): %s\n", job.Try, job.URL)
    34  		if job.Try >= 3 {
    35  			job.Success = false
    36  			job.err = fmt.Errorf("Stopped with domain %s (%w)", job.URL, job.err)
    37  			results <- job
    38  			continue
    39  		}
    40  		jobCtx, cancel := context.WithTimeout(context.Background(), time.Second*12)
    41  		internalJobError := make(chan error, 1)
    42  		go func() {
    43  			internalJobError <- processJob(browser, job, jobCtx)
    44  			cancel()
    45  		}()
    46  		select {
    47  		case <-jobCtx.Done():
    48  			job.err = fmt.Errorf("timeout (try: %d)", job.Try+1)
    49  			job.Success = false
    50  			job.Try++
    51  			jobs <- job
    52  		case err := <-internalJobError:
    53  			if err != nil {
    54  				job.err = err
    55  				job.Success = false
    56  				job.Try++
    57  				jobs <- job
    58  				cancel()
    59  			} else {
    60  				job.Success = true
    61  				job.err = nil
    62  				results <- job
    63  			}
    64  		}
    65  	}
    66  }
    67  
    68  func processJob(browser playwright.Browser, job Job, ctx context.Context) error {
    69  	context, err := browser.NewContext(playwright.BrowserNewContextOptions{
    70  		UserAgent: playwright.String("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"),
    71  	})
    72  	if err != nil {
    73  		return fmt.Errorf("could not create context: %w", err)
    74  	}
    75  	defer context.Close()
    76  	go func() {
    77  		<-ctx.Done()
    78  		context.Close()
    79  	}()
    80  
    81  	page, err := context.NewPage()
    82  	if err != nil {
    83  		return fmt.Errorf("could not create page: %w", err)
    84  	}
    85  
    86  	_, err = page.Goto("http://"+job.URL, playwright.PageGotoOptions{
    87  		WaitUntil: playwright.WaitUntilStateNetworkidle,
    88  	})
    89  	if err != nil {
    90  		return fmt.Errorf("could not goto: %s: %v", job.URL, err)
    91  	}
    92  	cwd, err := os.Getwd()
    93  	if err != nil {
    94  		return fmt.Errorf("could not get cwd %w", err)
    95  	}
    96  	_, err = page.Screenshot(playwright.PageScreenshotOptions{
    97  		Path: playwright.String(filepath.Join(cwd, "out", strings.Replace(job.URL, ".", "-", -1)+".png")),
    98  	})
    99  	if err != nil {
   100  		return fmt.Errorf("could not screenshot: %w", err)
   101  	}
   102  	return nil
   103  }
   104  
   105  type Job struct {
   106  	URL     string
   107  	Try     int
   108  	err     error
   109  	Success bool
   110  }
   111  
   112  func main() {
   113  	log.Println("Downloading Alexa top domains")
   114  	topDomains, err := getAlexaTopDomains()
   115  	assertErrorToNilf("could not get alexa top domains: %w", err)
   116  	log.Println("Downloaded Alexa top domains successfully")
   117  	cwd, err := os.Getwd()
   118  	if err != nil {
   119  		assertErrorToNilf("could not get cwd %w", err)
   120  	}
   121  	if err := os.Mkdir(filepath.Join(cwd, "out"), 0777); err != nil && !os.IsExist(err) {
   122  		assertErrorToNilf("could not create output directory %w", err)
   123  	}
   124  
   125  	pw, err := playwright.Run()
   126  	assertErrorToNilf("could not launch playwright: %w", err)
   127  	browser, err := pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
   128  		Headless: playwright.Bool(false),
   129  	})
   130  	assertErrorToNilf("could not launch Chromium: %w", err)
   131  
   132  	numberOfJobs := int(math.Min(30, float64(len(topDomains))))
   133  
   134  	jobs := make(chan Job, numberOfJobs)
   135  	results := make(chan Job, numberOfJobs)
   136  
   137  	for w := 1; w <= 3; w++ {
   138  		go worker(w, jobs, results, browser)
   139  	}
   140  
   141  	for _, url := range topDomains[:numberOfJobs] {
   142  		jobs <- Job{
   143  			URL: url,
   144  		}
   145  	}
   146  
   147  	for a := 0; a < numberOfJobs; a++ {
   148  		job := <-results
   149  		if job.Success {
   150  			fmt.Println("success:", job.URL)
   151  		} else {
   152  			fmt.Println("error:", job.URL, job.err)
   153  		}
   154  	}
   155  
   156  	close(jobs)
   157  	close(results)
   158  
   159  	assertErrorToNilf("could not close browser: %w", browser.Close())
   160  	assertErrorToNilf("could not stop Playwright: %w", pw.Stop())
   161  }
   162  
   163  func getAlexaTopDomains() ([]string, error) {
   164  	resp, err := http.Get("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip")
   165  	if err != nil {
   166  		return nil, fmt.Errorf("could not get: %w", err)
   167  	}
   168  	body, err := ioutil.ReadAll(resp.Body)
   169  	if err != nil {
   170  		return nil, fmt.Errorf("could not read body: %w", err)
   171  	}
   172  	defer resp.Body.Close()
   173  	zipReader, err := zip.NewReader(bytes.NewReader(body), int64(len(body)))
   174  	if err != nil {
   175  		return nil, fmt.Errorf("could not create zip reader: %w", err)
   176  	}
   177  	alexaFile, err := zipReader.File[0].Open()
   178  	if err != nil {
   179  		return nil, fmt.Errorf("could not read alexa file: %w", err)
   180  	}
   181  	defer alexaFile.Close()
   182  	reader := csv.NewReader(alexaFile)
   183  	out := make([]string, 0)
   184  	for {
   185  		record, err := reader.Read()
   186  		if err == io.EOF {
   187  			return out, nil
   188  		}
   189  		if err != nil {
   190  			return nil, fmt.Errorf("could not read csv: %w", err)
   191  		}
   192  		out = append(out, record[1])
   193  	}
   194  }
   195  

View as plain text