169 lines
5.3 KiB
Go
169 lines
5.3 KiB
Go
package fa
|
|
|
|
import (
|
|
"context"
|
|
"iter"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
"git.anthrove.art/public/go-fa-api/internal/urls"
|
|
)
|
|
|
|
// SubmissionInbox iterates the new-submission inbox at /msg/submissions/ —
|
|
// the feed of submissions posted since you last cleared the inbox by users
|
|
// you watch. Requires a logged-in client; anonymous calls hit the login
|
|
// gate and surface as [ErrUnauthorized].
|
|
//
|
|
// Each yielded *Submission carries ID, Title, Author, ThumbURL, Rating,
|
|
// and PostedAt (derived from the date-divider's data-date timestamp).
|
|
// Items are yielded in document order newest first, grouped by day.
|
|
//
|
|
// Pagination follows FA's cursor scheme (the "Next 72" link encodes
|
|
// "submissions newer than ID X, 72 per page" in its href). When FA serves
|
|
// a full page but omits that link, the iterator synthesizes the next
|
|
// cursor from the oldest submission on the page so a large inbox is not
|
|
// truncated to its first page. Iteration stops once a page yields no new
|
|
// submissions, or returns fewer than a full page with no cursor link.
|
|
//
|
|
// Use [ListOptions.MaxPages] to bound the crawl; the inbox can hold
|
|
// hundreds of pending items if you watch many active artists.
|
|
//
|
|
// ListOptions.StartPage is ignored the inbox is cursor-paginated by
|
|
// FA (the "Next 72" link encodes a from-id), not page-numbered, so there
|
|
// is nothing meaningful to start from.
|
|
func (c *Client) SubmissionInbox(ctx context.Context, opts ListOptions, reqOpts ...Option) iter.Seq2[*Submission, error] {
|
|
return func(yield func(*Submission, error) bool) {
|
|
nextURL := urls.MsgSubmissions()
|
|
pagesFetched := 0
|
|
visited := make(map[string]bool)
|
|
seen := make(map[SubmissionID]bool)
|
|
for nextURL != "" {
|
|
if opts.reachedLimit(pagesFetched) {
|
|
return
|
|
}
|
|
// Loop guard: FA (or a synthesized cursor) can point back at a
|
|
// page already crawled; stop rather than spin forever.
|
|
if visited[nextURL] {
|
|
return
|
|
}
|
|
visited[nextURL] = true
|
|
|
|
var (
|
|
items []*Submission
|
|
next string
|
|
)
|
|
err := c.fetch(ctx, nextURL, func(doc *goquery.Document) error {
|
|
items, next = parseSubmissionInboxPage(doc, c.cfg.jsonListings)
|
|
return nil
|
|
}, reqOpts...)
|
|
if err != nil {
|
|
yield(nil, err)
|
|
return
|
|
}
|
|
pagesFetched++
|
|
|
|
newCount := 0
|
|
minID := SubmissionID(0)
|
|
for _, s := range items {
|
|
if minID == 0 || s.ID < minID {
|
|
minID = s.ID
|
|
}
|
|
if seen[s.ID] {
|
|
continue
|
|
}
|
|
seen[s.ID] = true
|
|
newCount++
|
|
if !yield(s, nil) {
|
|
return
|
|
}
|
|
}
|
|
// A page that adds nothing new is the natural end of the crawl.
|
|
if newCount == 0 {
|
|
return
|
|
}
|
|
// FA renders a "Next 72" cursor link on every page that has a
|
|
// successor but it can omit it even when the inbox holds more.
|
|
// When the page came back full, trust the item count over the
|
|
// missing link and synthesize the cursor from the oldest id.
|
|
if next == "" {
|
|
if len(items) >= urls.InboxPageSize && minID > 0 {
|
|
next = urls.MsgSubmissionsCursor(int64(minID))
|
|
} else {
|
|
return
|
|
}
|
|
}
|
|
nextURL = next
|
|
}
|
|
}
|
|
}
|
|
|
|
// parseSubmissionInboxPage walks /msg/submissions/ (or one of its cursor-
|
|
// paginated variants), returning each yielded submission and the absolute
|
|
// URL of the "Next 72" cursor page, or "" if there's no further page.
|
|
//
|
|
// Inbox items are grouped under <div class="notifications-by-date"
|
|
// data-date="UNIXTIME"> wrappers; the parser lifts the group timestamp
|
|
// onto each contained submission's PostedAt so callers don't have to
|
|
// re-derive it.
|
|
//
|
|
// useJSON controls the experimental JSON-first merge see parseGalleryPage.
|
|
func parseSubmissionInboxPage(doc *goquery.Document, useJSON bool) (items []*Submission, nextURL string) {
|
|
var jsonData listingJSONMap
|
|
if useJSON {
|
|
jsonData = readListingJSON(doc)
|
|
}
|
|
doc.Find("#messagecenter-submissions div.notifications-by-date").Each(func(_ int, group *goquery.Selection) {
|
|
groupTime := groupDate(group)
|
|
group.Find("figure[id^=sid-]").Each(func(_ int, sel *goquery.Selection) {
|
|
s := parseGalleryFigure(sel, jsonData)
|
|
if s == nil {
|
|
return
|
|
}
|
|
if s.PostedAt.IsZero() && !groupTime.IsZero() {
|
|
s.PostedAt = groupTime
|
|
}
|
|
items = append(items, s)
|
|
})
|
|
})
|
|
if len(items) == 0 {
|
|
doc.Find("#messagecenter-submissions figure[id^=sid-]").Each(func(_ int, sel *goquery.Selection) {
|
|
if s := parseGalleryFigure(sel, jsonData); s != nil {
|
|
items = append(items, s)
|
|
}
|
|
})
|
|
}
|
|
// Last resort: a cursor page may drop the #messagecenter-submissions
|
|
// wrapper entirely. /msg/submissions/ carries no figures other than the
|
|
// inbox gallery, so a document-wide sweep is safe here.
|
|
if len(items) == 0 {
|
|
doc.Find("figure[id^=sid-]").Each(func(_ int, sel *goquery.Selection) {
|
|
if s := parseGalleryFigure(sel, jsonData); s != nil {
|
|
items = append(items, s)
|
|
}
|
|
})
|
|
}
|
|
|
|
if next := doc.Find("div.messagecenter-navigation a.button.more").First(); next.Length() > 0 {
|
|
href, _ := next.Attr("href")
|
|
nextURL = urls.AbsoluteCDN(href)
|
|
}
|
|
return items, nextURL
|
|
}
|
|
|
|
// groupDate reads the unix timestamp from a notifications-by-date wrapper's
|
|
// data-date attribute. Returns zero time when missing/unparseable.
|
|
func groupDate(group *goquery.Selection) time.Time {
|
|
v := strings.TrimSpace(trimAttr(group, "data-date"))
|
|
if v == "" {
|
|
return time.Time{}
|
|
}
|
|
secs, err := strconv.ParseInt(v, 10, 64)
|
|
if err != nil || secs <= 0 {
|
|
return time.Time{}
|
|
}
|
|
return time.Unix(secs, 0).UTC()
|
|
}
|