187 lines
5.2 KiB
Go
187 lines
5.2 KiB
Go
package fa
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"net/http"
|
|
"net/http/cookiejar"
|
|
"net/url"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/gocolly/colly/v2"
|
|
|
|
farouting "git.anthrove.art/public/go-fa-api/internal/urls"
|
|
)
|
|
|
|
// Client is the entry point of the SDK. It is safe for concurrent use; the
|
|
// internal rate limiter serializes outbound requests regardless of the
|
|
// number of calling goroutines.
|
|
//
|
|
// One Client corresponds to one FA session. Construct anonymous and
|
|
// authenticated clients separately rather than mutating one in-flight.
|
|
type Client struct {
|
|
cfg config
|
|
limiter *rateLimiter
|
|
logger *slog.Logger
|
|
collector *colly.Collector
|
|
http *http.Client
|
|
jar http.CookieJar
|
|
}
|
|
|
|
// New returns a configured Client. Pass options to override defaults.
|
|
//
|
|
// client := fa.New(
|
|
// fa.WithCookies(fa.Cookies{A: aCookie, B: bCookie}),
|
|
// fa.WithUserAgent("myapp/1.0"),
|
|
// )
|
|
func New(opts ...Option) *Client {
|
|
cfg := config{
|
|
userAgent: defaultUserAgent,
|
|
// One request per second steady-state, but allow a small burst so
|
|
// that e.g. avatar enrichment (one fetch per distinct author) can
|
|
// fire a few requests back-to-back before the 1/s pacing kicks in.
|
|
rateInterval: time.Second,
|
|
rateBurst: 3,
|
|
logger: slog.Default(),
|
|
maxRetries: defaultMaxRetries,
|
|
}
|
|
for _, o := range opts {
|
|
o(&cfg)
|
|
}
|
|
|
|
limiter := newRateLimiter(cfg.rateInterval, cfg.rateBurst, cfg.priorityRL)
|
|
|
|
// Build the base RoundTripper. If caller supplied an http.Client, reuse
|
|
// its transport as the "base" so that any TLS customisation (uTLS,
|
|
// chromedp, etc.) still applies. Otherwise wrap the stdlib default.
|
|
var baseRT http.RoundTripper = http.DefaultTransport
|
|
if cfg.httpClient != nil && cfg.httpClient.Transport != nil {
|
|
baseRT = cfg.httpClient.Transport
|
|
}
|
|
rt := &transport{
|
|
base: baseRT,
|
|
limiter: limiter,
|
|
userAgent: cfg.userAgent,
|
|
maxRetries: cfg.maxRetries,
|
|
logger: cfg.logger,
|
|
}
|
|
|
|
jar, _ := cookiejar.New(nil)
|
|
seedJar(jar, cfg.cookies, cfg.cf, cfg.sfw)
|
|
|
|
httpClient := &http.Client{
|
|
Transport: rt,
|
|
Jar: jar,
|
|
}
|
|
if cfg.httpClient != nil {
|
|
httpClient.Timeout = cfg.httpClient.Timeout
|
|
httpClient.CheckRedirect = cfg.httpClient.CheckRedirect
|
|
}
|
|
|
|
base := colly.NewCollector(
|
|
colly.UserAgent(cfg.userAgent),
|
|
colly.AllowURLRevisit(),
|
|
)
|
|
base.SetClient(httpClient)
|
|
base.SetCookieJar(jar)
|
|
// Colly's own LimitRule would compose with our transport limiter and
|
|
// double-throttle requests; instead, leave Colly unthrottled and let the
|
|
// transport be the single source of pacing truth.
|
|
|
|
return &Client{
|
|
cfg: cfg,
|
|
limiter: limiter,
|
|
logger: cfg.logger,
|
|
collector: base,
|
|
http: httpClient,
|
|
jar: jar,
|
|
}
|
|
}
|
|
|
|
// seedJar installs the FA session and Cloudflare clearance cookies onto the
|
|
// cookie jar so every outbound request to the host picks them up. The
|
|
// stdlib jar requires a URL to scope cookies; we use the FA host root.
|
|
//
|
|
// When sfw is [SFWOn] or [SFWOff] the `sfw` cookie is set to "1" or "0"
|
|
// respectively, matching what FA's navbar slider writes client-side.
|
|
// [SFWAuto] leaves the cookie unset so the account default applies.
|
|
func seedJar(jar http.CookieJar, fa Cookies, cf CFCookies, sfw SFWMode) {
|
|
hostURL, err := url.Parse(farouting.Host)
|
|
if err != nil {
|
|
return
|
|
}
|
|
var cookies []*http.Cookie
|
|
if fa.A != "" {
|
|
cookies = append(cookies, &http.Cookie{Name: "a", Value: fa.A, Path: "/"})
|
|
}
|
|
if fa.B != "" {
|
|
cookies = append(cookies, &http.Cookie{Name: "b", Value: fa.B, Path: "/"})
|
|
}
|
|
if cf.Clearance != "" {
|
|
cookies = append(cookies, &http.Cookie{Name: "cf_clearance", Value: cf.Clearance, Path: "/"})
|
|
}
|
|
switch sfw {
|
|
case SFWOn:
|
|
cookies = append(cookies, &http.Cookie{Name: "sfw", Value: "1", Path: "/"})
|
|
case SFWOff:
|
|
cookies = append(cookies, &http.Cookie{Name: "sfw", Value: "0", Path: "/"})
|
|
}
|
|
if len(cookies) > 0 {
|
|
jar.SetCookies(hostURL, cookies)
|
|
}
|
|
}
|
|
|
|
// fetch executes a single GET via the internal Colly collector and hands the
|
|
// parsed goquery document to parse. The collector clone scopes the OnHTML/
|
|
// OnResponse callbacks to this single call, so concurrent calls do not see
|
|
// each other's responses.
|
|
//
|
|
// Context cancellation propagates through the http.Request and the rate
|
|
// limiter a cancelled ctx surfaces from Wait or from the underlying
|
|
// transport, depending on which phase the request is in.
|
|
func (c *Client) fetch(ctx context.Context, rawURL string, parse func(doc *goquery.Document) error, opts ...Option) error {
|
|
ctx = c.applyRequestOptions(ctx, opts)
|
|
clone := c.collector.Clone()
|
|
clone.SetClient(c.http)
|
|
clone.SetCookieJar(c.jar)
|
|
clone.Context = ctx
|
|
|
|
var (
|
|
parseErr error
|
|
respErr error
|
|
)
|
|
|
|
clone.OnResponse(func(r *colly.Response) {
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
|
|
if err != nil {
|
|
parseErr = fmt.Errorf("%w: build document: %v", ErrParse, err)
|
|
return
|
|
}
|
|
if smErr := classifySystemMessage(doc); smErr != nil {
|
|
parseErr = smErr
|
|
return
|
|
}
|
|
if err := parse(doc); err != nil {
|
|
parseErr = err
|
|
}
|
|
})
|
|
|
|
clone.OnError(func(r *colly.Response, err error) {
|
|
respErr = err
|
|
})
|
|
|
|
if err := clone.Visit(rawURL); err != nil {
|
|
if respErr != nil {
|
|
return respErr
|
|
}
|
|
return err
|
|
}
|
|
if respErr != nil {
|
|
return respErr
|
|
}
|
|
return parseErr
|
|
}
|