FA renders its species/character/artist/type system tags as tag-block anchors whose data-tag-name attribute carries a single-letter prefix (s_ for species, c_ for character, a_ or u_ for artist, t_ for type), paired with a sibling tag-invalid span instead of a /search/ link. The existing keyword pass skips them, so they were lost. This change adds a Submission.CategorizedTags field exposing the four buckets with the prefix stripped, plus a runnable demo in examples/categorized_tags.
293 lines
10 KiB
Go
293 lines
10 KiB
Go
package fa
|
||
|
||
import (
|
||
"fmt"
|
||
"strconv"
|
||
"strings"
|
||
|
||
"github.com/PuerkitoBio/goquery"
|
||
|
||
"git.anthrove.art/public/go-fa-api/internal/urls"
|
||
)
|
||
|
||
// parseSubmission lifts a [Submission] out of a /view/{id}/ document. The
|
||
// selectors target FA's beta theme as captured in testdata/html/submission.html.
|
||
//
|
||
// FA's beta page renders submission metadata as two parallel <span> columns
|
||
// inside .submission-content-stats labels in the first, values in the
|
||
// second so this parser pairs them up positionally rather than scanning
|
||
// label-then-value rows.
|
||
func parseSubmission(id SubmissionID, doc *goquery.Document) (*Submission, error) {
|
||
s := &Submission{ID: id}
|
||
|
||
// Resolve the canonical ID from the og:url meta tag when caller passed 0
|
||
// (e.g. the real-fixture test). Lets the parser stand on its own.
|
||
if s.ID == 0 {
|
||
if og := trimAttr(doc.Find(`meta[property="og:url"]`).First(), "content"); og != "" {
|
||
if n := extractIntFromHref(og); n > 0 {
|
||
s.ID = SubmissionID(n)
|
||
}
|
||
}
|
||
}
|
||
|
||
// Author scoping: there can be multiple .iconusername references on the
|
||
// page (e.g. inside the description). The submission's true author lives
|
||
// inside .submission-description-artist.
|
||
authorBox := doc.Find("div.submission-description-artist").First()
|
||
|
||
// Title.
|
||
s.Title = strings.TrimSpace(authorBox.Find("div.submission-title h2").First().Text())
|
||
if s.Title == "" {
|
||
s.Title = strings.TrimSpace(doc.Find("div.submission-title h2").First().Text())
|
||
}
|
||
if s.Title == "" {
|
||
// Surface what FA actually served so the caller can tell the
|
||
// difference between a CF challenge, an SFW guard, a deleted
|
||
// submission, and a real markup-drift bug.
|
||
pageTitle := strings.TrimSpace(doc.Find("title").First().Text())
|
||
return nil, fmt.Errorf("%w: submission %d: missing title (page <title>=%q)",
|
||
ErrParse, id, pageTitle)
|
||
}
|
||
|
||
// Author.
|
||
if authorBox.Length() > 0 {
|
||
avatarLink := authorBox.Find("a[href^='/user/'] img").First()
|
||
nameSpan := authorBox.Find(".c-usernameBlockSimple__displayName").First()
|
||
s.Author = UserRef{
|
||
DisplayName: trimText(nameSpan),
|
||
AvatarURL: urls.AbsoluteCDN(trimAttr(avatarLink, "src")),
|
||
}
|
||
// Prefer the title attr (URL-safe login) when present; fall back to href.
|
||
if t := strings.TrimSpace(trimAttr(nameSpan, "title")); t != "" {
|
||
s.Author.Name = strings.ToLower(t)
|
||
}
|
||
if s.Author.Name == "" {
|
||
href, _ := authorBox.Find("a[href^='/user/']").First().Attr("href")
|
||
if parts := strings.Split(strings.Trim(href, "/"), "/"); len(parts) >= 2 {
|
||
s.Author.Name = strings.ToLower(parts[1])
|
||
}
|
||
}
|
||
}
|
||
|
||
// Posted date popup_date carries authoritative data-time.
|
||
s.PostedAt = parsePopupDate(authorBox.Find("span.popup_date").First())
|
||
if s.PostedAt.IsZero() {
|
||
s.PostedAt = parsePopupDate(doc.Find("span.popup_date").First())
|
||
}
|
||
|
||
// Rating div with class c-contentRating--{general,mature,adult} in the
|
||
// page stats panel; fall back to legacy .rating-box for older markup.
|
||
doc.Find("div.submission-page-stats div[class*='c-contentRating--']").EachWithBreak(func(_ int, sel *goquery.Selection) bool {
|
||
s.Rating = ParseRating(trimText(sel))
|
||
return false
|
||
})
|
||
if s.Rating == "" {
|
||
doc.Find(".rating-box").EachWithBreak(func(_ int, sel *goquery.Selection) bool {
|
||
s.Rating = ParseRating(trimText(sel))
|
||
return false
|
||
})
|
||
}
|
||
|
||
// Stats .submission-page-stats > div[title=...] each holds <div>N</div>
|
||
// <div class="highlight">Label</div>.
|
||
doc.Find("div.submission-page-stats > div[title]").Each(func(_ int, sel *goquery.Selection) {
|
||
title := strings.ToLower(trimAttr(sel, "title"))
|
||
val := parseStatNumber(trimText(sel.Find("div").First()))
|
||
switch title {
|
||
case "views":
|
||
s.Stats.Views = val
|
||
case "favorites":
|
||
s.Stats.Favorites = val
|
||
case "comments":
|
||
s.Stats.Comments = val
|
||
}
|
||
})
|
||
|
||
// Category / Theme / Species / Resolution / File Size are two parallel
|
||
// span columns inside .submission-content-stats. Pair them up by index.
|
||
statsBlock := doc.Find("div.submission-content-stats").First()
|
||
if statsBlock.Length() > 0 {
|
||
var labels []string
|
||
statsBlock.Find("span.highlight > span").Each(func(_ int, sel *goquery.Selection) {
|
||
labels = append(labels, strings.ToLower(strings.TrimSpace(sel.Text())))
|
||
})
|
||
var values []string
|
||
statsBlock.ChildrenFiltered("span").Each(func(_ int, sel *goquery.Selection) {
|
||
if class, _ := sel.Attr("class"); strings.Contains(class, "highlight") {
|
||
return // skip the labels column
|
||
}
|
||
sel.ChildrenFiltered("span").Each(func(_ int, inner *goquery.Selection) {
|
||
values = append(values, strings.TrimSpace(inner.Text()))
|
||
})
|
||
})
|
||
for i := 0; i < len(labels) && i < len(values); i++ {
|
||
switch labels[i] {
|
||
case "category":
|
||
s.Category = Category(values[i])
|
||
case "type", "theme":
|
||
s.Type = Type(values[i])
|
||
case "species":
|
||
s.Species = Species(values[i])
|
||
case "gender":
|
||
s.Gender = Gender(values[i])
|
||
case "resolution":
|
||
if w, h, ok := parseResolution(values[i]); ok {
|
||
s.Width, s.Height = w, h
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Description section.submission-description holds the body inside
|
||
// .section-body > .submission-description-text.
|
||
descBody := doc.Find("section.submission-description div.submission-description-text").First()
|
||
if descBody.Length() == 0 {
|
||
descBody = doc.Find("div.submission-description").First()
|
||
}
|
||
s.Description = htmlOf(descBody)
|
||
s.DescriptionText = strings.TrimSpace(descBody.Text())
|
||
|
||
// Tags anchors inside .submission-tags whose href targets the search.
|
||
// Tag-block helper anchors and invalid system tags are filtered out.
|
||
doc.Find("div.submission-tags span.tags a[href*='/search/']").Each(func(_ int, a *goquery.Selection) {
|
||
t := strings.TrimSpace(a.Text())
|
||
if t != "" {
|
||
s.Tags = append(s.Tags, t)
|
||
}
|
||
})
|
||
|
||
// Prefixed system tags FA renders these as tag-block anchors with a
|
||
// data-tag-name attribute carrying a leading single-letter prefix:
|
||
// s_ species, c_ character, a_/u_ artist, t_ type.
|
||
// They are paired with a sibling <span class="tag-invalid"> and have no
|
||
// /search/ href, so they are skipped by the keyword pass above.
|
||
doc.Find("div.submission-tags a.tag-block[data-tag-name]").Each(func(_ int, a *goquery.Selection) {
|
||
raw := strings.TrimSpace(trimAttr(a, "data-tag-name"))
|
||
if len(raw) < 3 || raw[1] != '_' {
|
||
return
|
||
}
|
||
name := raw[2:]
|
||
if name == "" {
|
||
return
|
||
}
|
||
switch raw[0] {
|
||
case 's':
|
||
s.CategorizedTags.Species = append(s.CategorizedTags.Species, name)
|
||
case 'c':
|
||
s.CategorizedTags.Characters = append(s.CategorizedTags.Characters, name)
|
||
case 'a', 'u':
|
||
s.CategorizedTags.Artists = append(s.CategorizedTags.Artists, name)
|
||
case 't':
|
||
s.CategorizedTags.Types = append(s.CategorizedTags.Types, name)
|
||
}
|
||
})
|
||
|
||
// File URL FA renders a "Download" button in #submission-options that
|
||
// links to the canonical file for *every* submission type. For visual
|
||
// art it equals the #submissionImg source; for stories and music it's
|
||
// the only correct source, because FA injects a generated thumbnail
|
||
// (e.g. ".thumbnail.<name>.docx.gif") into #submissionImg there. So the
|
||
// Download button is authoritative; #submissionImg is only a fallback.
|
||
doc.Find("div#submission-options a").EachWithBreak(func(_ int, a *goquery.Selection) bool {
|
||
if strings.EqualFold(trimText(a), "download") {
|
||
s.FileURL = urls.AbsoluteCDN(trimAttr(a, "href"))
|
||
return false
|
||
}
|
||
return true
|
||
})
|
||
|
||
// #submissionImg holds the inline image for visual art, or a generated
|
||
// thumbnail for non-image submissions. It always supplies ThumbURL, but
|
||
// only supplies FileURL when no Download button was found.
|
||
img := doc.Find("#submissionImg").First()
|
||
if img.Length() > 0 {
|
||
s.ThumbURL = urls.AbsoluteCDN(firstNonEmpty(
|
||
trimAttr(img, "data-preview-src"),
|
||
trimAttr(img, "src"),
|
||
))
|
||
if s.FileURL == "" {
|
||
s.FileURL = urls.AbsoluteCDN(firstNonEmpty(
|
||
trimAttr(img, "data-fullview-src"),
|
||
trimAttr(img, "src"),
|
||
))
|
||
}
|
||
// Dimensions also live in width/height attrs on some pages.
|
||
if w, err := strconv.Atoi(trimAttr(img, "data-fullview-width")); err == nil {
|
||
s.Width = w
|
||
}
|
||
if h, err := strconv.Atoi(trimAttr(img, "data-fullview-height")); err == nil {
|
||
s.Height = h
|
||
}
|
||
}
|
||
// Legacy fallback for older themes that predate #submission-options.
|
||
if s.FileURL == "" {
|
||
dl := doc.Find("div.submission-controls-upper a[href*='/d.furaffinity.net/'], div.download a, a.download-logged-in").First()
|
||
s.FileURL = urls.AbsoluteCDN(trimAttr(dl, "href"))
|
||
}
|
||
|
||
// Prev / Next (FA's minigallery-navigation: "Newer" / "Older").
|
||
doc.Find("div.minigallery-navigation a").Each(func(_ int, a *goquery.Selection) {
|
||
href, _ := a.Attr("href")
|
||
text := strings.ToLower(trimText(a))
|
||
id := SubmissionID(extractIntFromHref(href))
|
||
if id == 0 {
|
||
return
|
||
}
|
||
switch {
|
||
case strings.Contains(text, "newer"):
|
||
// "Newer" goes to a more recent submission surface as Prev so
|
||
// callers walking a gallery can call client.GetSubmission(s.Prev)
|
||
// to step toward the newest.
|
||
s.Prev = id
|
||
case strings.Contains(text, "older"):
|
||
s.Next = id
|
||
}
|
||
})
|
||
// Legacy fallback for older themes that still use favorite-nav.
|
||
if s.Prev == 0 && s.Next == 0 {
|
||
doc.Find("div.favorite-nav a, .submission-nav a").Each(func(_ int, a *goquery.Selection) {
|
||
href, _ := a.Attr("href")
|
||
text := strings.ToLower(trimText(a))
|
||
id := SubmissionID(extractIntFromHref(href))
|
||
if id == 0 {
|
||
return
|
||
}
|
||
switch {
|
||
case strings.Contains(text, "prev"):
|
||
s.Prev = id
|
||
case strings.Contains(text, "next"):
|
||
s.Next = id
|
||
}
|
||
})
|
||
}
|
||
|
||
// Favorite state FA renders exactly one of the "+Fav" / "−Fav" anchors
|
||
// for an authenticated viewer; the "−Fav" (/unfav/) link means this
|
||
// submission is currently favorited. findFavLinks (actions.go) already
|
||
// scrapes both. An anonymous fetch shows neither, leaving Favorited false.
|
||
if _, unfav := findFavLinks(doc, int64(s.ID)); unfav != "" {
|
||
s.Favorited = true
|
||
}
|
||
|
||
return s, nil
|
||
}
|
||
|
||
// parseResolution splits a "2071 x 1779" string into width and height ints.
// Whitespace around either number is ignored, so "800x600" is accepted too.
//
// It returns ok=false (with w and h both 0) on any malformed input: a missing
// or repeated "x" separator, a side that is not an integer, or a negative
// dimension. Callers can then leave Width/Height at zero.
func parseResolution(s string) (w, h int, ok bool) {
	left, right, found := strings.Cut(s, "x")
	if !found {
		return 0, 0, false
	}
	// A second "x" ends up in right and makes Atoi fail below, so inputs like
	// "1 x 2 x 3" are still rejected.
	wn, err := strconv.Atoi(strings.TrimSpace(left))
	if err != nil {
		return 0, 0, false
	}
	hn, err := strconv.Atoi(strings.TrimSpace(right))
	if err != nil {
		return 0, 0, false
	}
	// Atoi accepts a leading minus sign; a negative dimension is malformed.
	if wn < 0 || hn < 0 {
		return 0, 0, false
	}
	return wn, hn, true
}
|