initial commit
This commit is contained in:
168
inbox.go
Normal file
168
inbox.go
Normal file
@@ -0,0 +1,168 @@
|
||||
package fa
|
||||
|
||||
import (
|
||||
"context"
|
||||
"iter"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
|
||||
"git.anthrove.art/public/go-fa-api/internal/urls"
|
||||
)
|
||||
|
||||
// SubmissionInbox iterates the new-submission inbox at /msg/submissions/ —
|
||||
// the feed of submissions posted since you last cleared the inbox by users
|
||||
// you watch. Requires a logged-in client; anonymous calls hit the login
|
||||
// gate and surface as [ErrUnauthorized].
|
||||
//
|
||||
// Each yielded *Submission carries ID, Title, Author, ThumbURL, Rating,
|
||||
// and PostedAt (derived from the date-divider's data-date timestamp).
|
||||
// Items are yielded in document order newest first, grouped by day.
|
||||
//
|
||||
// Pagination follows FA's cursor scheme (the "Next 72" link encodes
|
||||
// "submissions newer than ID X, 72 per page" in its href). When FA serves
|
||||
// a full page but omits that link, the iterator synthesizes the next
|
||||
// cursor from the oldest submission on the page so a large inbox is not
|
||||
// truncated to its first page. Iteration stops once a page yields no new
|
||||
// submissions, or returns fewer than a full page with no cursor link.
|
||||
//
|
||||
// Use [ListOptions.MaxPages] to bound the crawl; the inbox can hold
|
||||
// hundreds of pending items if you watch many active artists.
|
||||
//
|
||||
// ListOptions.StartPage is ignored the inbox is cursor-paginated by
|
||||
// FA (the "Next 72" link encodes a from-id), not page-numbered, so there
|
||||
// is nothing meaningful to start from.
|
||||
func (c *Client) SubmissionInbox(ctx context.Context, opts ListOptions) iter.Seq2[*Submission, error] {
|
||||
return func(yield func(*Submission, error) bool) {
|
||||
nextURL := urls.MsgSubmissions()
|
||||
pagesFetched := 0
|
||||
visited := make(map[string]bool)
|
||||
seen := make(map[SubmissionID]bool)
|
||||
for nextURL != "" {
|
||||
if opts.reachedLimit(pagesFetched) {
|
||||
return
|
||||
}
|
||||
// Loop guard: FA (or a synthesized cursor) can point back at a
|
||||
// page already crawled; stop rather than spin forever.
|
||||
if visited[nextURL] {
|
||||
return
|
||||
}
|
||||
visited[nextURL] = true
|
||||
|
||||
var (
|
||||
items []*Submission
|
||||
next string
|
||||
)
|
||||
err := c.fetch(ctx, nextURL, func(doc *goquery.Document) error {
|
||||
items, next = parseSubmissionInboxPage(doc, c.cfg.jsonListings)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
yield(nil, err)
|
||||
return
|
||||
}
|
||||
pagesFetched++
|
||||
|
||||
newCount := 0
|
||||
minID := SubmissionID(0)
|
||||
for _, s := range items {
|
||||
if minID == 0 || s.ID < minID {
|
||||
minID = s.ID
|
||||
}
|
||||
if seen[s.ID] {
|
||||
continue
|
||||
}
|
||||
seen[s.ID] = true
|
||||
newCount++
|
||||
if !yield(s, nil) {
|
||||
return
|
||||
}
|
||||
}
|
||||
// A page that adds nothing new is the natural end of the crawl.
|
||||
if newCount == 0 {
|
||||
return
|
||||
}
|
||||
// FA renders a "Next 72" cursor link on every page that has a
|
||||
// successor but it can omit it even when the inbox holds more.
|
||||
// When the page came back full, trust the item count over the
|
||||
// missing link and synthesize the cursor from the oldest id.
|
||||
if next == "" {
|
||||
if len(items) >= urls.InboxPageSize && minID > 0 {
|
||||
next = urls.MsgSubmissionsCursor(int64(minID))
|
||||
} else {
|
||||
return
|
||||
}
|
||||
}
|
||||
nextURL = next
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseSubmissionInboxPage walks /msg/submissions/ (or one of its cursor-
|
||||
// paginated variants), returning each yielded submission and the absolute
|
||||
// URL of the "Next 72" cursor page, or "" if there's no further page.
|
||||
//
|
||||
// Inbox items are grouped under <div class="notifications-by-date"
|
||||
// data-date="UNIXTIME"> wrappers; the parser lifts the group timestamp
|
||||
// onto each contained submission's PostedAt so callers don't have to
|
||||
// re-derive it.
|
||||
//
|
||||
// useJSON controls the experimental JSON-first merge see parseGalleryPage.
|
||||
func parseSubmissionInboxPage(doc *goquery.Document, useJSON bool) (items []*Submission, nextURL string) {
|
||||
var jsonData listingJSONMap
|
||||
if useJSON {
|
||||
jsonData = readListingJSON(doc)
|
||||
}
|
||||
doc.Find("#messagecenter-submissions div.notifications-by-date").Each(func(_ int, group *goquery.Selection) {
|
||||
groupTime := groupDate(group)
|
||||
group.Find("figure[id^=sid-]").Each(func(_ int, sel *goquery.Selection) {
|
||||
s := parseGalleryFigure(sel, jsonData)
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
if s.PostedAt.IsZero() && !groupTime.IsZero() {
|
||||
s.PostedAt = groupTime
|
||||
}
|
||||
items = append(items, s)
|
||||
})
|
||||
})
|
||||
if len(items) == 0 {
|
||||
doc.Find("#messagecenter-submissions figure[id^=sid-]").Each(func(_ int, sel *goquery.Selection) {
|
||||
if s := parseGalleryFigure(sel, jsonData); s != nil {
|
||||
items = append(items, s)
|
||||
}
|
||||
})
|
||||
}
|
||||
// Last resort: a cursor page may drop the #messagecenter-submissions
|
||||
// wrapper entirely. /msg/submissions/ carries no figures other than the
|
||||
// inbox gallery, so a document-wide sweep is safe here.
|
||||
if len(items) == 0 {
|
||||
doc.Find("figure[id^=sid-]").Each(func(_ int, sel *goquery.Selection) {
|
||||
if s := parseGalleryFigure(sel, jsonData); s != nil {
|
||||
items = append(items, s)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
if next := doc.Find("div.messagecenter-navigation a.button.more").First(); next.Length() > 0 {
|
||||
href, _ := next.Attr("href")
|
||||
nextURL = urls.AbsoluteCDN(href)
|
||||
}
|
||||
return items, nextURL
|
||||
}
|
||||
|
||||
// groupDate reads the unix timestamp from a notifications-by-date wrapper's
|
||||
// data-date attribute. Returns zero time when missing/unparseable.
|
||||
func groupDate(group *goquery.Selection) time.Time {
|
||||
v := strings.TrimSpace(trimAttr(group, "data-date"))
|
||||
if v == "" {
|
||||
return time.Time{}
|
||||
}
|
||||
secs, err := strconv.ParseInt(v, 10, 64)
|
||||
if err != nil || secs <= 0 {
|
||||
return time.Time{}
|
||||
}
|
||||
return time.Unix(secs, 0).UTC()
|
||||
}
|
||||
Reference in New Issue
Block a user