-
Notifications
You must be signed in to change notification settings - Fork 6
/
content_crawler.go
84 lines (69 loc) · 2.17 KB
/
content_crawler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
package main
import (
"github.com/datatogether/core"
"net/http"
"time"
"github.com/PuerkitoBio/fetchbot"
)
var (
	// contentFetcher is a sideband GET-only fetcher
	// to snatch urls that look like files as they cross the main
	// crawler
	contentFetcher *fetchbot.Fetcher
	// contentQueue is the queue feeding contentFetcher with content GETs
	contentQueue *fetchbot.Queue
	// stopContentCrawler is signaled to shut the content crawler down
	// (a single receive triggers contentQueue.Close — see startCrawlingContent)
	stopContentCrawler chan bool
)
// startCrawlingContent wires up and runs the sideband "B" content crawler:
// it builds the response muxer, constructs the fetcher from config, starts
// the queue, installs a shutdown listener on stopContentCrawler, and then
// blocks until the queue is closed.
func startCrawlingContent() {
	router := fetchbot.NewMux()

	// Every fetch error funnels through one handler, which also clears the
	// url from the shared enqued set so it can be re-queued later.
	router.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		log.Infof("content res error - %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
		mu.Lock()
		defer mu.Unlock()
		delete(enqued, ctx.Cmd.URL().String())
	}))

	// Successful GET responses: look the url up in the store, mark it no
	// longer enqueued, and hand the response body off for processing.
	router.Response().Method("GET").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			target := &core.Url{Url: ctx.Cmd.URL().String()}
			if readErr := target.Read(store); readErr != nil {
				log.Infof("content url read error: %s - %s\n", target.Url, readErr)
				return
			}

			mu.Lock()
			delete(enqued, target.Url)
			mu.Unlock()

			if _, _, err = target.HandleGetResponse(store, res); err != nil {
				log.Info(err.Error())
				return
			}
			// NOTE: link enqueueing for this crawler is currently disabled
			// (dst links were previously enqueued as HEAD requests here).
		}))

	// Logging wraps the muxer so every request is recorded before dispatch.
	contentFetcher = fetchbot.New(logHandler("B", router))
	contentFetcher.DisablePoliteness = !cfg.Polite
	contentFetcher.CrawlDelay = time.Duration(cfg.CrawlDelaySeconds) * time.Second

	log.Info("starting B crawler (content)")
	q := contentFetcher.Start()
	contentQueue = q

	// One receive on stopContentCrawler closes the queue, which in turn
	// unblocks q.Block() below and lets this function return.
	stopContentCrawler = make(chan bool)
	go func() {
		<-stopContentCrawler
		log.Info("stopping B crawler (content)")
		q.Close()
	}()

	q.Block()
}