robots.go

// Copyright 2024 Factorial GmbH. All rights reserved.

package main

import (
	"errors"
	"fmt"
	"log/slog"
	"net/http"
	"net/url"
	"strings"
	"sync"

	"github.com/temoto/robotstxt"
)

// isProbablyRobots reports whether the given URL looks like it points at a
// robots.txt file.
func isProbablyRobots(u string) bool {
	return strings.HasSuffix(u, "/robots.txt")
}

// RobotCheckFn is the signature of a function that reports whether the given
// user agent is allowed to fetch the given URL.
type RobotCheckFn func(agent string, u string) (bool, error)

var (
	// ErrRobotsUnavailable is returned when the robots.txt file is unavailable. The caller
	// should decide themselves whether to allow or disallow the URL in this case.
	ErrRobotsUnavailable = errors.New("robots.txt file is unavailable")
)

// Robots is a simple wrapper around robotstxt.RobotsData that caches the
// robots.txt file for each host in memory. It is meant to be set up once,
// with the instance then passed into each Run.
//
// Robots is a store that provides information on which URLs are allowed to be
// fetched by a given user agent on a per-host basis.
//
// Results from a fetch are not shared across workers, so we might end up
// fetching the same robots.txt file multiple times. This compromise is made in
// order to keep the implementation simple and to avoid the need for dedicated
// worker pools and work queues.
//
// Robots will blindly issue fetch requests for the control file, without
// checking rate limit information first. When a fetch request is denied by the
// rate limit, Robots will retry until it succeeds. It is assumed that such
// fetch requests have a high priority. Other requests, such as regular visit
// requests to the same host, will be delayed until the rate limit is lifted.
//
// It expects the provided HTTP client to perform any necessary authentication
// and caching. In addition to caching at the HTTP layer, Robots caches the
// parsed robots.txt files in memory for a certain time.
//
// FIXME: Need to handle forced expiry of robots.txt files from memory.
type Robots struct {
	sync.RWMutex

	// data is a map of host IDs to the parsed robots.txt data.
	data map[string]*robotstxt.RobotsData
}

// NewRobots creates an empty Robots store.
func NewRobots() *Robots {
	return &Robots{
		data: make(map[string]*robotstxt.RobotsData),
	}
}
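
// The sketch below illustrates the usage described on the Robots type: a
// single store is created once and then consulted before every URL a worker
// wants to visit. It is a minimal, illustrative sketch; the getAuth value and
// the "examplebot" agent string are placeholders (GetAuthFn is defined
// elsewhere in this package), not prescribed values.
func robotsUsageSketch(getAuth GetAuthFn) error {
	robots := NewRobots()

	ok, err := robots.Check("https://example.org/some/page", getAuth, "examplebot")
	if errors.Is(err, ErrRobotsUnavailable) {
		// The robots.txt file could not be fetched; it is up to the caller to
		// decide whether to treat the URL as allowed or disallowed.
		return err
	}
	if err != nil {
		return err
	}
	if !ok {
		// Disallowed by robots.txt, skip the URL.
		return nil
	}
	// Allowed, proceed with the visit.
	return nil
}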

// Check checks whether the given URL is allowed to be fetched by the given user agent.
func (r *Robots) Check(u string, getAuth GetAuthFn, agent string) (bool, error) {
	p, err := url.Parse(u)
	if err != nil {
		return false, err
	}

	robot, err := r.get(NewHostFromURL(p), getAuth)
	if err != nil {
		slog.Info("Robots: Failed to fetch robots.txt file.", "url", u, "error", err)
	}
	if robot == nil {
		// Without parsed robots.txt data we cannot perform the check; the
		// caller decides how to treat the URL.
		return false, err
	}
	return robot.TestAgent(p.Path, agent), err
}

// Sitemaps returns available sitemap URLs for the given host.
func (r *Robots) Sitemaps(u string, getAuth GetAuthFn) ([]string, error) {
	p, err := url.Parse(u)
	if err != nil {
		return nil, err
	}

	robot, err := r.get(NewHostFromURL(p), getAuth)
	if err != nil {
		return nil, err
	}
	return robot.Sitemaps, nil
}
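
// A short sketch of how Sitemaps might be used to seed a crawl; the host URL
// and the handling of each sitemap entry are placeholders, not part of the
// store itself.
func sitemapsUsageSketch(r *Robots, getAuth GetAuthFn) {
	sitemaps, err := r.Sitemaps("https://example.org/", getAuth)
	if err != nil {
		slog.Info("Robots: No sitemaps available.", "error", err)
		return
	}
	for _, s := range sitemaps {
		// Each entry is a sitemap URL announced in robots.txt, e.g. to be
		// enqueued for visiting.
		fmt.Println(s)
	}
}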

// get ensures that the robots.txt file for the given host has been fetched and
// parsed. It will block until a result is available.
func (r *Robots) get(h *Host, getAuth GetAuthFn) (*robotstxt.RobotsData, error) {
	var robot *robotstxt.RobotsData
	var err error
	var res *http.Response

	// Ensure that a robots.txt file that was earlier retrieved from a private
	// host is not served to a request that doesn't provide authentication and
	// treats the host as a public one.
	key := fmt.Sprintf("%x", h.Hash(getAuth))

	r.RLock()
	robot, ok := r.data[key]
	r.RUnlock()

	if ok {
		return robot, nil
	}

	client := CreateRetryingHTTPClient(getAuth)
	rurl := fmt.Sprintf("%s://%s/robots.txt", h.PreferredScheme, h.String())

	hlogger := slog.With("url", rurl, "host.name", h.Name, "host.port", h.Port)
	hlogger.Debug("Robots: Fetching missing robots.txt file...")

	res, err = client.Get(rurl)
	if err != nil {
		return nil, ErrRobotsUnavailable
	}
	defer res.Body.Close()
	hlogger.Debug("Robots: Fetched missing robots.txt file.")

	// An HTTP error status is handled inside robotstxt.FromResponse, so it is
	// safe to not handle it here.
	robot, err = robotstxt.FromResponse(res)

	// Always cache the result, even if it is an error. This is to avoid
	// fetching the same robots.txt file multiple times.
	//
	// FIXME: Errored robots.txt and empty files should be cached for a shorter
	// time, currently they are cached forever.
	r.Lock()
	r.data[key] = robot
	r.Unlock()

	return robot, err
}