Files
go-fa-api/inbox.go
2026-05-26 20:21:55 +02:00

169 lines
5.3 KiB
Go

package fa
import (
"context"
"iter"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"git.anthrove.art/public/go-fa-api/internal/urls"
)
// SubmissionInbox returns an iterator over the new-submission inbox at
// /msg/submissions/, i.e. the submissions posted by users you watch since
// the inbox was last cleared. A logged-in client is required; an anonymous
// request lands on the login gate and is reported as [ErrUnauthorized].
//
// Every yielded *Submission carries ID, Title, Author, ThumbURL, Rating,
// and PostedAt (taken from the enclosing date divider's data-date
// timestamp). Items arrive in document order — newest first, grouped by
// day — and a submission that appears on more than one page is yielded
// only once.
//
// Paging follows FA's cursor scheme: the "Next 72" link encodes
// "submissions newer than ID X, 72 per page" in its href. If FA serves a
// full page without that link, the next cursor is synthesized from the
// smallest submission ID on the page so that a large inbox is not cut off
// after its first page. The crawl ends when a page contributes no new
// submissions, when a short page has no cursor link, when the next URL has
// already been crawled, or when a fetch fails (the error is yielded once
// and iteration stops).
//
// Use [ListOptions.MaxPages] to bound the crawl; the inbox can hold
// hundreds of pending items if you watch many active artists.
//
// ListOptions.StartPage is ignored: the inbox is cursor-paginated by FA
// (the "Next 72" link encodes a from-id), not page-numbered, so there is
// nothing meaningful to start from.
func (c *Client) SubmissionInbox(ctx context.Context, opts ListOptions, reqOpts ...Option) iter.Seq2[*Submission, error] {
	return func(yield func(*Submission, error) bool) {
		var (
			crawled = map[string]struct{}{}       // page URLs already fetched
			emitted = map[SubmissionID]struct{}{} // submission IDs already yielded
			fetched int                           // pages fetched so far
		)
		for pageURL := urls.MsgSubmissions(); pageURL != ""; {
			if opts.reachedLimit(fetched) {
				return
			}
			// Cycle guard: a link from FA or a synthesized cursor may lead
			// back to a page we already crawled; bail out instead of looping.
			if _, again := crawled[pageURL]; again {
				return
			}
			crawled[pageURL] = struct{}{}

			var (
				page   []*Submission
				cursor string
			)
			if err := c.fetch(ctx, pageURL, func(doc *goquery.Document) error {
				page, cursor = parseSubmissionInboxPage(doc, c.cfg.jsonListings)
				return nil
			}, reqOpts...); err != nil {
				yield(nil, err)
				return
			}
			fetched++

			var (
				fresh  int          // submissions on this page not yielded before
				oldest SubmissionID // smallest ID on this page, duplicates included
			)
			for _, sub := range page {
				if oldest == 0 || sub.ID < oldest {
					oldest = sub.ID
				}
				if _, dup := emitted[sub.ID]; dup {
					continue
				}
				emitted[sub.ID] = struct{}{}
				fresh++
				if !yield(sub, nil) {
					return
				}
			}
			// Nothing new on this page: the crawl has reached its end.
			if fresh == 0 {
				return
			}
			// FA can leave out the "Next 72" link even though more items
			// exist. A full page is taken as evidence of a successor, so
			// the cursor is rebuilt from the smallest ID seen; a short page
			// without a link really is the last one.
			if cursor == "" {
				if len(page) < urls.InboxPageSize || oldest <= 0 {
					return
				}
				cursor = urls.MsgSubmissionsCursor(int64(oldest))
			}
			pageURL = cursor
		}
	}
}
// parseSubmissionInboxPage walks /msg/submissions/ (or one of its cursor-
// paginated variants) and returns the submissions found on the page along
// with the absolute URL of the "Next 72" cursor page, or "" when there is
// no usable next link.
//
// Inbox items are grouped under <div class="notifications-by-date"
// data-date="UNIXTIME"> wrappers; the group timestamp is copied onto each
// contained submission's PostedAt when the figure parser left it zero, so
// callers don't have to re-derive it. Items recovered through the fallback
// selectors below sit outside any date group and get no such backfill.
//
// useJSON controls the experimental JSON-first merge — see parseGalleryPage.
func parseSubmissionInboxPage(doc *goquery.Document, useJSON bool) (items []*Submission, nextURL string) {
	var jsonData listingJSONMap
	if useJSON {
		jsonData = readListingJSON(doc)
	}

	// Primary layout: figures nested inside per-day groups.
	doc.Find("#messagecenter-submissions div.notifications-by-date").Each(func(_ int, group *goquery.Selection) {
		groupTime := groupDate(group)
		group.Find("figure[id^=sid-]").Each(func(_ int, sel *goquery.Selection) {
			s := parseGalleryFigure(sel, jsonData)
			if s == nil {
				return
			}
			if s.PostedAt.IsZero() && !groupTime.IsZero() {
				s.PostedAt = groupTime
			}
			items = append(items, s)
		})
	})

	// Fallbacks, tried in order and only while nothing has been found:
	//  1. figures anywhere inside the inbox wrapper (no date groups);
	//  2. last resort, figures anywhere in the document — a cursor page may
	//     drop the #messagecenter-submissions wrapper entirely, and
	//     /msg/submissions/ carries no figures other than the inbox
	//     gallery, so a document-wide sweep is safe here.
	for _, selector := range []string{
		"#messagecenter-submissions figure[id^=sid-]",
		"figure[id^=sid-]",
	} {
		if len(items) > 0 {
			break
		}
		doc.Find(selector).Each(func(_ int, sel *goquery.Selection) {
			if s := parseGalleryFigure(sel, jsonData); s != nil {
				items = append(items, s)
			}
		})
	}

	// Attr reports exists=false for an empty selection or an anchor without
	// an href. Treat a missing or blank href as "no next page" instead of
	// resolving an empty reference into a bogus cursor URL.
	link := doc.Find("div.messagecenter-navigation a.button.more").First()
	if href, ok := link.Attr("href"); ok && strings.TrimSpace(href) != "" {
		nextURL = urls.AbsoluteCDN(href)
	}
	return items, nextURL
}
// groupDate extracts the unix timestamp (in seconds) stored in the data-date
// attribute of a notifications-by-date wrapper and returns it as a UTC time.
// The zero time is returned when the attribute is empty, is not a base-10
// integer, or is not strictly positive.
func groupDate(group *goquery.Selection) time.Time {
	raw := strings.TrimSpace(trimAttr(group, "data-date"))
	if raw != "" {
		if epoch, err := strconv.ParseInt(raw, 10, 64); err == nil && epoch > 0 {
			return time.Unix(epoch, 0).UTC()
		}
	}
	return time.Time{}
}