initial commit

This commit is contained in:
2026-05-25 22:27:18 +02:00
commit 965f9d6ad4
91 changed files with 28963 additions and 0 deletions

266
submission_parser.go Normal file
View File

@@ -0,0 +1,266 @@
package fa
import (
"fmt"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
"git.anthrove.art/public/go-fa-api/internal/urls"
)
// parseSubmission lifts a [Submission] out of a /view/{id}/ document. The
// selectors target FA's beta theme as captured in testdata/html/submission.html.
//
// FA's beta page renders submission metadata as two parallel <span> columns
// inside .submission-content-stats — labels in the first, values in the
// second — so this parser pairs them up positionally rather than scanning
// label-then-value rows.
//
// id may be 0, in which case the ID is recovered from the og:url meta tag
// when one is present. The only error returned wraps ErrParse and is raised
// when no title heading can be found; all other fields are filled on a
// best-effort basis from whatever markup is present.
func parseSubmission(id SubmissionID, doc *goquery.Document) (*Submission, error) {
	s := &Submission{ID: id}

	// Resolve the canonical ID from the og:url meta tag when the caller passed
	// 0 (e.g. the real-fixture test). Lets the parser stand on its own.
	if s.ID == 0 {
		if og := trimAttr(doc.Find(`meta[property="og:url"]`).First(), "content"); og != "" {
			if n := extractIntFromHref(og); n > 0 {
				s.ID = SubmissionID(n)
			}
		}
	}

	// Author scoping: there can be multiple .iconusername references on the
	// page (e.g. inside the description). The submission's true author lives
	// inside .submission-description-artist.
	authorBox := doc.Find("div.submission-description-artist").First()

	// Title: prefer the heading inside the author box, then fall back to the
	// first title heading anywhere in the document.
	s.Title = strings.TrimSpace(authorBox.Find("div.submission-title h2").First().Text())
	if s.Title == "" {
		s.Title = strings.TrimSpace(doc.Find("div.submission-title h2").First().Text())
	}
	if s.Title == "" {
		// Surface what FA actually served so the caller can tell the
		// difference between a CF challenge, an SFW guard, a deleted
		// submission, and a real markup-drift bug.
		pageTitle := strings.TrimSpace(doc.Find("title").First().Text())
		// Report s.ID rather than id so an ID recovered from og:url above
		// shows up in the message; the two are equal whenever id was non-zero.
		return nil, fmt.Errorf("%w: submission %d: missing title (page <title>=%q)",
			ErrParse, s.ID, pageTitle)
	}

	// Author.
	if authorBox.Length() > 0 {
		avatarImg := authorBox.Find("a[href^='/user/'] img").First()
		nameSpan := authorBox.Find(".c-usernameBlockSimple__displayName").First()
		s.Author = UserRef{
			DisplayName: trimText(nameSpan),
			AvatarURL:   urls.AbsoluteCDN(trimAttr(avatarImg, "src")),
		}
		// Prefer the title attr (URL-safe login) when present; fall back to href.
		if t := strings.TrimSpace(trimAttr(nameSpan, "title")); t != "" {
			s.Author.Name = strings.ToLower(t)
		}
		if s.Author.Name == "" {
			// A missing href yields "", which splits into a single part and
			// therefore fails the length check below.
			href, _ := authorBox.Find("a[href^='/user/']").First().Attr("href")
			if parts := strings.Split(strings.Trim(href, "/"), "/"); len(parts) >= 2 {
				s.Author.Name = strings.ToLower(parts[1])
			}
		}
	}

	// Posted date — popup_date carries authoritative data-time.
	s.PostedAt = parsePopupDate(authorBox.Find("span.popup_date").First())
	if s.PostedAt.IsZero() {
		s.PostedAt = parsePopupDate(doc.Find("span.popup_date").First())
	}

	// Rating — div with class c-contentRating--{general,mature,adult} in the
	// page stats panel; fall back to legacy .rating-box for older markup.
	// ParseRating is only invoked when a matching element exists, so a page
	// without either element leaves Rating at its zero value.
	if box := doc.Find("div.submission-page-stats div[class*='c-contentRating--']").First(); box.Length() > 0 {
		s.Rating = ParseRating(trimText(box))
	}
	if s.Rating == "" {
		if box := doc.Find(".rating-box").First(); box.Length() > 0 {
			s.Rating = ParseRating(trimText(box))
		}
	}

	// Stats — .submission-page-stats > div[title=...] each holds <div>N</div>
	// <div class="highlight">Label</div>.
	doc.Find("div.submission-page-stats > div[title]").Each(func(_ int, sel *goquery.Selection) {
		title := strings.ToLower(trimAttr(sel, "title"))
		val := parseStatNumber(trimText(sel.Find("div").First()))
		switch title {
		case "views":
			s.Stats.Views = val
		case "favorites":
			s.Stats.Favorites = val
		case "comments":
			s.Stats.Comments = val
		}
	})

	// Category / Theme / Species / Resolution / File Size are two parallel
	// span columns inside .submission-content-stats. Pair them up by index.
	statsBlock := doc.Find("div.submission-content-stats").First()
	if statsBlock.Length() > 0 {
		var labels []string
		statsBlock.Find("span.highlight > span").Each(func(_ int, sel *goquery.Selection) {
			labels = append(labels, strings.ToLower(strings.TrimSpace(sel.Text())))
		})

		var values []string
		statsBlock.ChildrenFiltered("span").Each(func(_ int, sel *goquery.Selection) {
			// Skip the labels column. HasClass matches the class token, the
			// same way the "span.highlight" selector above does, so both
			// columns agree on what counts as a label span.
			if sel.HasClass("highlight") {
				return
			}
			sel.ChildrenFiltered("span").Each(func(_ int, inner *goquery.Selection) {
				values = append(values, strings.TrimSpace(inner.Text()))
			})
		})

		// Pair only as far as both columns reach; unknown labels are ignored.
		for i := 0; i < len(labels) && i < len(values); i++ {
			switch labels[i] {
			case "category":
				s.Category = Category(values[i])
			case "type", "theme":
				s.Type = Type(values[i])
			case "species":
				s.Species = Species(values[i])
			case "gender":
				s.Gender = Gender(values[i])
			case "resolution":
				if w, h, ok := parseResolution(values[i]); ok {
					s.Width, s.Height = w, h
				}
			}
		}
	}

	// Description — section.submission-description holds the body inside
	// .section-body > .submission-description-text.
	descBody := doc.Find("section.submission-description div.submission-description-text").First()
	if descBody.Length() == 0 {
		descBody = doc.Find("div.submission-description").First()
	}
	s.Description = htmlOf(descBody)
	s.DescriptionText = strings.TrimSpace(descBody.Text())

	// Tags — anchors inside .submission-tags whose href targets the search.
	// Anchors without a /search/ href or with empty text are dropped.
	doc.Find("div.submission-tags span.tags a[href*='/search/']").Each(func(_ int, a *goquery.Selection) {
		if tag := strings.TrimSpace(a.Text()); tag != "" {
			s.Tags = append(s.Tags, tag)
		}
	})

	// File URL — FA renders a "Download" button in #submission-options that
	// links to the canonical file for *every* submission type. For visual
	// art it equals the #submissionImg source; for stories and music it's
	// the only correct source, because FA injects a generated thumbnail
	// (e.g. ".thumbnail.<name>.docx.gif") into #submissionImg there. So the
	// Download button is authoritative; #submissionImg is only a fallback.
	doc.Find("div#submission-options a").EachWithBreak(func(_ int, a *goquery.Selection) bool {
		if strings.EqualFold(trimText(a), "download") {
			s.FileURL = urls.AbsoluteCDN(trimAttr(a, "href"))
			return false // first Download anchor wins
		}
		return true
	})

	// #submissionImg holds the inline image for visual art, or a generated
	// thumbnail for non-image submissions. It always supplies ThumbURL, but
	// only supplies FileURL when no Download button was found.
	img := doc.Find("#submissionImg").First()
	if img.Length() > 0 {
		s.ThumbURL = urls.AbsoluteCDN(firstNonEmpty(
			trimAttr(img, "data-preview-src"),
			trimAttr(img, "src"),
		))
		if s.FileURL == "" {
			s.FileURL = urls.AbsoluteCDN(firstNonEmpty(
				trimAttr(img, "data-fullview-src"),
				trimAttr(img, "src"),
			))
		}
		// Dimensions also live in the data-fullview-width/height attrs on
		// some pages. Only a positive value overrides what the stats block
		// supplied, so a "0" or negative attribute cannot clobber a valid
		// resolution parsed above.
		if w, err := strconv.Atoi(trimAttr(img, "data-fullview-width")); err == nil && w > 0 {
			s.Width = w
		}
		if h, err := strconv.Atoi(trimAttr(img, "data-fullview-height")); err == nil && h > 0 {
			s.Height = h
		}
	}

	// Legacy fallback for older themes that predate #submission-options.
	if s.FileURL == "" {
		dl := doc.Find("div.submission-controls-upper a[href*='/d.furaffinity.net/'], div.download a, a.download-logged-in").First()
		s.FileURL = urls.AbsoluteCDN(trimAttr(dl, "href"))
	}

	// Prev / Next (FA's minigallery-navigation: "Newer" / "Older").
	doc.Find("div.minigallery-navigation a").Each(func(_ int, a *goquery.Selection) {
		href, _ := a.Attr("href")
		text := strings.ToLower(trimText(a))
		// Named target (not id) to avoid shadowing the function parameter.
		target := SubmissionID(extractIntFromHref(href))
		if target == 0 {
			return
		}
		switch {
		case strings.Contains(text, "newer"):
			// "Newer" goes to a more recent submission — surface as Prev so
			// callers walking a gallery can call client.GetSubmission(s.Prev)
			// to step toward the newest.
			s.Prev = target
		case strings.Contains(text, "older"):
			s.Next = target
		}
	})

	// Legacy fallback for older themes that still use favorite-nav. Only
	// consulted when the minigallery navigation produced neither link.
	if s.Prev == 0 && s.Next == 0 {
		doc.Find("div.favorite-nav a, .submission-nav a").Each(func(_ int, a *goquery.Selection) {
			href, _ := a.Attr("href")
			text := strings.ToLower(trimText(a))
			target := SubmissionID(extractIntFromHref(href))
			if target == 0 {
				return
			}
			switch {
			case strings.Contains(text, "prev"):
				s.Prev = target
			case strings.Contains(text, "next"):
				s.Next = target
			}
		})
	}

	// Favorite state — FA renders exactly one of the "+Fav" / "−Fav" anchors
	// for an authenticated viewer; the "−Fav" (/unfav/) link means this
	// submission is currently favorited. findFavLinks (actions.go) already
	// scrapes both. An anonymous fetch shows neither, leaving Favorited false.
	if _, unfav := findFavLinks(doc, int64(s.ID)); unfav != "" {
		s.Favorited = true
	}

	return s, nil
}
// parseResolution splits a "2071 x 1779" string into width and height ints.
//
// The separator may be a lowercase "x", an uppercase "X" or the Unicode
// multiplication sign (U+00D7); whitespace around each number is ignored.
// Returns ok=false on any malformed input — wrong number of parts, a
// non-numeric part, or a negative dimension — so callers can leave
// Width/Height at zero.
func parseResolution(s string) (w, h int, ok bool) {
	// Fold every accepted separator spelling into "x" so one Split suffices.
	normalized := strings.NewReplacer("X", "x", "\u00d7", "x").Replace(s)
	parts := strings.Split(normalized, "x")
	if len(parts) != 2 {
		return 0, 0, false
	}
	// strconv.Atoi accepts a leading sign, so negatives are rejected
	// explicitly: a negative dimension is never a valid resolution.
	width, err := strconv.Atoi(strings.TrimSpace(parts[0]))
	if err != nil || width < 0 {
		return 0, 0, false
	}
	height, err := strconv.Atoi(strings.TrimSpace(parts[1]))
	if err != nil || height < 0 {
		return 0, 0, false
	}
	return width, height, true
}