1
2
3
4 package main
5
6 import (
7 "archive/zip"
8 "bytes"
9 "context"
10 "encoding/csv"
11 "fmt"
12 "io"
13 "io/ioutil"
14 "log"
15 "math"
16 "net/http"
17 "os"
18 "path/filepath"
19 "strings"
20 "time"
21
22 "github.com/playwright-community/playwright-go"
23 )
24
// assertErrorToNilf terminates the program via log.Fatal when err is non-nil
// and does nothing otherwise.
//
// message is a format string that receives err as its only argument. Callers
// in this file use the %w verb, which only fmt.Errorf understands (log.Fatalf
// would print "%!w(...)"), so the message is rendered through fmt.Errorf
// before it is logged.
func assertErrorToNilf(message string, err error) {
	if err == nil {
		return
	}
	log.Fatal(fmt.Errorf(message, err))
}
30
31 func worker(id int, jobs chan Job, results chan<- Job, browser playwright.Browser) {
32 for job := range jobs {
33 fmt.Printf("starting (try: %d): %s\n", job.Try, job.URL)
34 if job.Try >= 3 {
35 job.Success = false
36 job.err = fmt.Errorf("Stopped with domain %s (%w)", job.URL, job.err)
37 results <- job
38 continue
39 }
40 jobCtx, cancel := context.WithTimeout(context.Background(), time.Second*12)
41 internalJobError := make(chan error, 1)
42 go func() {
43 internalJobError <- processJob(browser, job, jobCtx)
44 cancel()
45 }()
46 select {
47 case <-jobCtx.Done():
48 job.err = fmt.Errorf("timeout (try: %d)", job.Try+1)
49 job.Success = false
50 job.Try++
51 jobs <- job
52 case err := <-internalJobError:
53 if err != nil {
54 job.err = err
55 job.Success = false
56 job.Try++
57 jobs <- job
58 cancel()
59 } else {
60 job.Success = true
61 job.err = nil
62 results <- job
63 }
64 }
65 }
66 }
67
68 func processJob(browser playwright.Browser, job Job, ctx context.Context) error {
69 context, err := browser.NewContext(playwright.BrowserNewContextOptions{
70 UserAgent: playwright.String("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"),
71 })
72 if err != nil {
73 return fmt.Errorf("could not create context: %w", err)
74 }
75 defer context.Close()
76 go func() {
77 <-ctx.Done()
78 context.Close()
79 }()
80
81 page, err := context.NewPage()
82 if err != nil {
83 return fmt.Errorf("could not create page: %w", err)
84 }
85
86 _, err = page.Goto("http://"+job.URL, playwright.PageGotoOptions{
87 WaitUntil: playwright.WaitUntilStateNetworkidle,
88 })
89 if err != nil {
90 return fmt.Errorf("could not goto: %s: %v", job.URL, err)
91 }
92 cwd, err := os.Getwd()
93 if err != nil {
94 return fmt.Errorf("could not get cwd %w", err)
95 }
96 _, err = page.Screenshot(playwright.PageScreenshotOptions{
97 Path: playwright.String(filepath.Join(cwd, "out", strings.Replace(job.URL, ".", "-", -1)+".png")),
98 })
99 if err != nil {
100 return fmt.Errorf("could not screenshot: %w", err)
101 }
102 return nil
103 }
104
// Job describes one domain to screenshot together with its retry state and
// the outcome of the last attempt.
type Job struct {
	// URL is the domain without a scheme; processJob prepends "http://".
	URL string
	// Try is the number of failed attempts so far; worker stops retrying
	// once it has reached 3.
	Try int
	// err is the error of the most recent failed attempt, nil after success.
	err error
	// Success is set by worker before the job is sent on the results channel.
	Success bool
}
111
112 func main() {
113 log.Println("Downloading Alexa top domains")
114 topDomains, err := getAlexaTopDomains()
115 assertErrorToNilf("could not get alexa top domains: %w", err)
116 log.Println("Downloaded Alexa top domains successfully")
117 cwd, err := os.Getwd()
118 if err != nil {
119 assertErrorToNilf("could not get cwd %w", err)
120 }
121 if err := os.Mkdir(filepath.Join(cwd, "out"), 0777); err != nil && !os.IsExist(err) {
122 assertErrorToNilf("could not create output directory %w", err)
123 }
124
125 pw, err := playwright.Run()
126 assertErrorToNilf("could not launch playwright: %w", err)
127 browser, err := pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
128 Headless: playwright.Bool(false),
129 })
130 assertErrorToNilf("could not launch Chromium: %w", err)
131
132 numberOfJobs := int(math.Min(30, float64(len(topDomains))))
133
134 jobs := make(chan Job, numberOfJobs)
135 results := make(chan Job, numberOfJobs)
136
137 for w := 1; w <= 3; w++ {
138 go worker(w, jobs, results, browser)
139 }
140
141 for _, url := range topDomains[:numberOfJobs] {
142 jobs <- Job{
143 URL: url,
144 }
145 }
146
147 for a := 0; a < numberOfJobs; a++ {
148 job := <-results
149 if job.Success {
150 fmt.Println("success:", job.URL)
151 } else {
152 fmt.Println("error:", job.URL, job.err)
153 }
154 }
155
156 close(jobs)
157 close(results)
158
159 assertErrorToNilf("could not close browser: %w", browser.Close())
160 assertErrorToNilf("could not stop Playwright: %w", pw.Stop())
161 }
162
// getAlexaTopDomains downloads the zipped Alexa top-1m CSV, reads the first
// file of the archive and returns the second column of every row (the domain
// name) in file order.
//
// It returns an error when the download fails or answers with a non-200
// status, when the archive is invalid or empty, or when the CSV cannot be
// parsed or a row has fewer than two columns.
func getAlexaTopDomains() ([]string, error) {
	resp, err := http.Get("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip")
	if err != nil {
		return nil, fmt.Errorf("could not get: %w", err)
	}
	// Registered before reading so the body is released on every path,
	// including a failed read.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("could not get: unexpected status %s", resp.Status)
	}

	// zip.NewReader needs random access, so the archive is buffered in memory.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("could not read body: %w", err)
	}

	zipReader, err := zip.NewReader(bytes.NewReader(body), int64(len(body)))
	if err != nil {
		return nil, fmt.Errorf("could not create zip reader: %w", err)
	}
	if len(zipReader.File) == 0 {
		return nil, fmt.Errorf("could not read alexa file: archive contains no files")
	}

	alexaFile, err := zipReader.File[0].Open()
	if err != nil {
		return nil, fmt.Errorf("could not read alexa file: %w", err)
	}
	defer alexaFile.Close()

	reader := csv.NewReader(alexaFile)
	out := make([]string, 0)
	for {
		record, err := reader.Read()
		if err == io.EOF {
			return out, nil
		}
		if err != nil {
			return nil, fmt.Errorf("could not read csv: %w", err)
		}
		// Guard the column access instead of panicking on a malformed row.
		if len(record) < 2 {
			return nil, fmt.Errorf("could not read csv: expected at least 2 columns, got %d", len(record))
		}
		out = append(out, record[1])
	}
}
195
View as plain text