Files
go-fa-api/submission_parser.go
SoXX 20fcad7fbb feat(submission): parse FA's prefixed system tags into CategorizedTags
FA renders its species/character/artist/type system tags as tag-block
anchors with a data-tag-name carrying a single-letter prefix
(s_/c_/a_-u_/t_) and a sibling tag-invalid span instead of a /search/
link. The existing keyword pass skips them, so they were lost.

Adds a Submission.CategorizedTags field exposing the four buckets with
the prefix stripped, plus an examples/categorized_tags runnable demo.
2026-06-02 21:15:30 +02:00

293 lines
10 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package fa
import (
"fmt"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
"git.anthrove.art/public/go-fa-api/internal/urls"
)
// parseSubmission lifts a [Submission] out of a /view/{id}/ document. The
// selectors target FA's beta theme as captured in testdata/html/submission.html.
//
// id is the submission the caller asked for. When it is 0 the parser tries to
// recover the canonical ID from the page's og:url meta tag, so it can also be
// run against a bare fixture.
//
// FA's beta page renders submission metadata as two parallel <span> columns
// inside .submission-content-stats (labels in the first, values in the
// second), so this parser pairs them up positionally rather than scanning
// label-then-value rows.
//
// The only hard failure is a missing title, which returns an error wrapping
// [ErrParse]. Every other field is best-effort and keeps its zero value when
// the corresponding markup is absent.
func parseSubmission(id SubmissionID, doc *goquery.Document) (*Submission, error) {
	s := &Submission{ID: id}

	// Resolve the canonical ID from the og:url meta tag when the caller
	// passed 0 (e.g. the real-fixture test). Lets the parser stand on its own.
	if s.ID == 0 {
		if og := trimAttr(doc.Find(`meta[property="og:url"]`).First(), "content"); og != "" {
			if n := extractIntFromHref(og); n > 0 {
				s.ID = SubmissionID(n)
			}
		}
	}

	// Author scoping: there can be multiple .iconusername references on the
	// page (e.g. inside the description). The submission's true author lives
	// inside .submission-description-artist.
	authorBox := doc.Find("div.submission-description-artist").First()

	// Title: prefer the one scoped to the author box, then any on the page.
	s.Title = strings.TrimSpace(authorBox.Find("div.submission-title h2").First().Text())
	if s.Title == "" {
		s.Title = strings.TrimSpace(doc.Find("div.submission-title h2").First().Text())
	}
	if s.Title == "" {
		// Surface what FA actually served so the caller can tell the
		// difference between a CF challenge, an SFW guard, a deleted
		// submission, and a real markup-drift bug. Report s.ID rather than
		// the raw argument so an ID recovered from og:url is not lost.
		pageTitle := strings.TrimSpace(doc.Find("title").First().Text())
		return nil, fmt.Errorf("%w: submission %d: missing title (page <title>=%q)",
			ErrParse, s.ID, pageTitle)
	}

	// Author.
	if authorBox.Length() > 0 {
		avatarLink := authorBox.Find("a[href^='/user/'] img").First()
		nameSpan := authorBox.Find(".c-usernameBlockSimple__displayName").First()
		s.Author = UserRef{
			DisplayName: trimText(nameSpan),
			AvatarURL:   urls.AbsoluteCDN(trimAttr(avatarLink, "src")),
		}
		// Prefer the title attr (URL-safe login) when present; fall back to
		// the second path segment of the /user/{name}/ href.
		if t := strings.TrimSpace(trimAttr(nameSpan, "title")); t != "" {
			s.Author.Name = strings.ToLower(t)
		}
		if s.Author.Name == "" {
			href, _ := authorBox.Find("a[href^='/user/']").First().Attr("href")
			if parts := strings.Split(strings.Trim(href, "/"), "/"); len(parts) >= 2 {
				s.Author.Name = strings.ToLower(parts[1])
			}
		}
	}

	// Posted date: popup_date carries the authoritative data-time. Try the
	// author box first, then the first popup_date anywhere on the page.
	s.PostedAt = parsePopupDate(authorBox.Find("span.popup_date").First())
	if s.PostedAt.IsZero() {
		s.PostedAt = parsePopupDate(doc.Find("span.popup_date").First())
	}

	// Rating: a div with class c-contentRating--{general,mature,adult} in the
	// page stats panel; fall back to legacy .rating-box for older markup.
	// Only the first match is consulted (the callback stops the iteration).
	doc.Find("div.submission-page-stats div[class*='c-contentRating--']").EachWithBreak(func(_ int, sel *goquery.Selection) bool {
		s.Rating = ParseRating(trimText(sel))
		return false
	})
	if s.Rating == "" {
		doc.Find(".rating-box").EachWithBreak(func(_ int, sel *goquery.Selection) bool {
			s.Rating = ParseRating(trimText(sel))
			return false
		})
	}

	// Stats: .submission-page-stats > div[title=...] each holds <div>N</div>
	// <div class="highlight">Label</div>. The title attribute names the stat.
	doc.Find("div.submission-page-stats > div[title]").Each(func(_ int, sel *goquery.Selection) {
		title := strings.ToLower(trimAttr(sel, "title"))
		val := parseStatNumber(trimText(sel.Find("div").First()))
		switch title {
		case "views":
			s.Stats.Views = val
		case "favorites":
			s.Stats.Favorites = val
		case "comments":
			s.Stats.Comments = val
		}
	})

	// Category / Theme / Species / Resolution / File Size are two parallel
	// span columns inside .submission-content-stats. Pair them up by index.
	statsBlock := doc.Find("div.submission-content-stats").First()
	if statsBlock.Length() > 0 {
		var labels []string
		statsBlock.Find("span.highlight > span").Each(func(_ int, sel *goquery.Selection) {
			labels = append(labels, strings.ToLower(strings.TrimSpace(sel.Text())))
		})

		var values []string
		statsBlock.ChildrenFiltered("span").Each(func(_ int, sel *goquery.Selection) {
			if class, _ := sel.Attr("class"); strings.Contains(class, "highlight") {
				return // skip the labels column
			}
			sel.ChildrenFiltered("span").Each(func(_ int, inner *goquery.Selection) {
				values = append(values, strings.TrimSpace(inner.Text()))
			})
		})

		// Stop at the shorter column so a ragged layout cannot index out of
		// range; unrecognized labels are ignored.
		for i := 0; i < len(labels) && i < len(values); i++ {
			switch labels[i] {
			case "category":
				s.Category = Category(values[i])
			case "type", "theme":
				s.Type = Type(values[i])
			case "species":
				s.Species = Species(values[i])
			case "gender":
				s.Gender = Gender(values[i])
			case "resolution":
				if w, h, ok := parseResolution(values[i]); ok {
					s.Width, s.Height = w, h
				}
			}
		}
	}

	// Description: section.submission-description holds the body inside
	// .section-body > .submission-description-text.
	descBody := doc.Find("section.submission-description div.submission-description-text").First()
	if descBody.Length() == 0 {
		descBody = doc.Find("div.submission-description").First()
	}
	s.Description = htmlOf(descBody)
	s.DescriptionText = strings.TrimSpace(descBody.Text())

	// Tags: anchors inside .submission-tags whose href targets the search.
	// Tag-block helper anchors and invalid system tags are filtered out by
	// the href constraint.
	doc.Find("div.submission-tags span.tags a[href*='/search/']").Each(func(_ int, a *goquery.Selection) {
		t := strings.TrimSpace(a.Text())
		if t != "" {
			s.Tags = append(s.Tags, t)
		}
	})

	// Prefixed system tags: FA renders these as tag-block anchors with a
	// data-tag-name attribute carrying a leading single-letter prefix:
	//   s_ species, c_ character, a_/u_ artist, t_ type.
	// They are paired with a sibling <span class="tag-invalid"> and have no
	// /search/ href, so they are skipped by the keyword pass above.
	doc.Find("div.submission-tags a.tag-block[data-tag-name]").Each(func(_ int, a *goquery.Selection) {
		raw := strings.TrimSpace(trimAttr(a, "data-tag-name"))
		// Need "<letter>_<name>" with a non-empty name, i.e. at least three
		// bytes; this also guarantees raw[2:] below is never empty.
		if len(raw) < 3 || raw[1] != '_' {
			return
		}
		name := raw[2:]
		switch raw[0] {
		case 's':
			s.CategorizedTags.Species = append(s.CategorizedTags.Species, name)
		case 'c':
			s.CategorizedTags.Characters = append(s.CategorizedTags.Characters, name)
		case 'a', 'u':
			s.CategorizedTags.Artists = append(s.CategorizedTags.Artists, name)
		case 't':
			s.CategorizedTags.Types = append(s.CategorizedTags.Types, name)
		}
	})

	// File URL: FA renders a "Download" button in #submission-options that
	// links to the canonical file for *every* submission type. For visual
	// art it equals the #submissionImg source; for stories and music it's
	// the only correct source, because FA injects a generated thumbnail
	// (e.g. ".thumbnail.<name>.docx.gif") into #submissionImg there. So the
	// Download button is authoritative; #submissionImg is only a fallback.
	doc.Find("div#submission-options a").EachWithBreak(func(_ int, a *goquery.Selection) bool {
		if strings.EqualFold(trimText(a), "download") {
			s.FileURL = urls.AbsoluteCDN(trimAttr(a, "href"))
			return false
		}
		return true
	})

	// #submissionImg holds the inline image for visual art, or a generated
	// thumbnail for non-image submissions. It always supplies ThumbURL, but
	// only supplies FileURL when no Download button was found.
	img := doc.Find("#submissionImg").First()
	if img.Length() > 0 {
		s.ThumbURL = urls.AbsoluteCDN(firstNonEmpty(
			trimAttr(img, "data-preview-src"),
			trimAttr(img, "src"),
		))
		if s.FileURL == "" {
			s.FileURL = urls.AbsoluteCDN(firstNonEmpty(
				trimAttr(img, "data-fullview-src"),
				trimAttr(img, "src"),
			))
		}
		// Dimensions also live in data-fullview-width/height attrs on some
		// pages and take precedence over the stats-block resolution. Only
		// positive values are accepted so a "0" or otherwise bogus attribute
		// cannot clobber a resolution that was already parsed.
		if w, err := strconv.Atoi(trimAttr(img, "data-fullview-width")); err == nil && w > 0 {
			s.Width = w
		}
		if h, err := strconv.Atoi(trimAttr(img, "data-fullview-height")); err == nil && h > 0 {
			s.Height = h
		}
	}

	// Legacy fallback for older themes that predate #submission-options.
	if s.FileURL == "" {
		dl := doc.Find("div.submission-controls-upper a[href*='/d.furaffinity.net/'], div.download a, a.download-logged-in").First()
		s.FileURL = urls.AbsoluteCDN(trimAttr(dl, "href"))
	}

	// Prev / Next (FA's minigallery-navigation: "Newer" / "Older").
	doc.Find("div.minigallery-navigation a").Each(func(_ int, a *goquery.Selection) {
		href, _ := a.Attr("href")
		text := strings.ToLower(trimText(a))
		id := SubmissionID(extractIntFromHref(href))
		if id == 0 {
			return
		}
		switch {
		case strings.Contains(text, "newer"):
			// "Newer" goes to a more recent submission; surface it as Prev so
			// callers walking a gallery can call client.GetSubmission(s.Prev)
			// to step toward the newest.
			s.Prev = id
		case strings.Contains(text, "older"):
			s.Next = id
		}
	})

	// Legacy fallback for older themes that still use favorite-nav. Only
	// consulted when the minigallery pass found neither neighbour.
	if s.Prev == 0 && s.Next == 0 {
		doc.Find("div.favorite-nav a, .submission-nav a").Each(func(_ int, a *goquery.Selection) {
			href, _ := a.Attr("href")
			text := strings.ToLower(trimText(a))
			id := SubmissionID(extractIntFromHref(href))
			if id == 0 {
				return
			}
			switch {
			case strings.Contains(text, "prev"):
				s.Prev = id
			case strings.Contains(text, "next"):
				s.Next = id
			}
		})
	}

	// Favorite state: FA renders exactly one of the "+Fav" / "-Fav" anchors
	// for an authenticated viewer; the "-Fav" (/unfav/) link means this
	// submission is currently favorited. findFavLinks (actions.go) already
	// scrapes both. An anonymous fetch shows neither, leaving Favorited false.
	if _, unfav := findFavLinks(doc, int64(s.ID)); unfav != "" {
		s.Favorited = true
	}

	return s, nil
}
// parseResolution converts a resolution string such as "2071 x 1779" into its
// integer width and height. The input must contain exactly one "x" separating
// two integers (whitespace around each number is ignored). Anything else
// yields ok=false with zero dimensions so callers can leave Width/Height at
// zero.
func parseResolution(s string) (w, h int, ok bool) {
	fields := strings.Split(s, "x")
	if len(fields) != 2 {
		return 0, 0, false
	}

	// Parse both sides with the same rules; bail out on the first bad one.
	var dims [2]int
	for i, field := range fields {
		n, err := strconv.Atoi(strings.TrimSpace(field))
		if err != nil {
			return 0, 0, false
		}
		dims[i] = n
	}
	return dims[0], dims[1], true
}