initial commit
This commit is contained in:
266
submission_parser.go
Normal file
266
submission_parser.go
Normal file
@@ -0,0 +1,266 @@
|
||||
package fa
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
|
||||
"git.anthrove.art/public/go-fa-api/internal/urls"
|
||||
)
|
||||
|
||||
// parseSubmission lifts a [Submission] out of a /view/{id}/ document. The
|
||||
// selectors target FA's beta theme as captured in testdata/html/submission.html.
|
||||
//
|
||||
// FA's beta page renders submission metadata as two parallel <span> columns
|
||||
// inside .submission-content-stats labels in the first, values in the
|
||||
// second so this parser pairs them up positionally rather than scanning
|
||||
// label-then-value rows.
|
||||
func parseSubmission(id SubmissionID, doc *goquery.Document) (*Submission, error) {
|
||||
s := &Submission{ID: id}
|
||||
|
||||
// Resolve the canonical ID from the og:url meta tag when caller passed 0
|
||||
// (e.g. the real-fixture test). Lets the parser stand on its own.
|
||||
if s.ID == 0 {
|
||||
if og := trimAttr(doc.Find(`meta[property="og:url"]`).First(), "content"); og != "" {
|
||||
if n := extractIntFromHref(og); n > 0 {
|
||||
s.ID = SubmissionID(n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Author scoping: there can be multiple .iconusername references on the
|
||||
// page (e.g. inside the description). The submission's true author lives
|
||||
// inside .submission-description-artist.
|
||||
authorBox := doc.Find("div.submission-description-artist").First()
|
||||
|
||||
// Title.
|
||||
s.Title = strings.TrimSpace(authorBox.Find("div.submission-title h2").First().Text())
|
||||
if s.Title == "" {
|
||||
s.Title = strings.TrimSpace(doc.Find("div.submission-title h2").First().Text())
|
||||
}
|
||||
if s.Title == "" {
|
||||
// Surface what FA actually served so the caller can tell the
|
||||
// difference between a CF challenge, an SFW guard, a deleted
|
||||
// submission, and a real markup-drift bug.
|
||||
pageTitle := strings.TrimSpace(doc.Find("title").First().Text())
|
||||
return nil, fmt.Errorf("%w: submission %d: missing title (page <title>=%q)",
|
||||
ErrParse, id, pageTitle)
|
||||
}
|
||||
|
||||
// Author.
|
||||
if authorBox.Length() > 0 {
|
||||
avatarLink := authorBox.Find("a[href^='/user/'] img").First()
|
||||
nameSpan := authorBox.Find(".c-usernameBlockSimple__displayName").First()
|
||||
s.Author = UserRef{
|
||||
DisplayName: trimText(nameSpan),
|
||||
AvatarURL: urls.AbsoluteCDN(trimAttr(avatarLink, "src")),
|
||||
}
|
||||
// Prefer the title attr (URL-safe login) when present; fall back to href.
|
||||
if t := strings.TrimSpace(trimAttr(nameSpan, "title")); t != "" {
|
||||
s.Author.Name = strings.ToLower(t)
|
||||
}
|
||||
if s.Author.Name == "" {
|
||||
href, _ := authorBox.Find("a[href^='/user/']").First().Attr("href")
|
||||
if parts := strings.Split(strings.Trim(href, "/"), "/"); len(parts) >= 2 {
|
||||
s.Author.Name = strings.ToLower(parts[1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Posted date popup_date carries authoritative data-time.
|
||||
s.PostedAt = parsePopupDate(authorBox.Find("span.popup_date").First())
|
||||
if s.PostedAt.IsZero() {
|
||||
s.PostedAt = parsePopupDate(doc.Find("span.popup_date").First())
|
||||
}
|
||||
|
||||
// Rating div with class c-contentRating--{general,mature,adult} in the
|
||||
// page stats panel; fall back to legacy .rating-box for older markup.
|
||||
doc.Find("div.submission-page-stats div[class*='c-contentRating--']").EachWithBreak(func(_ int, sel *goquery.Selection) bool {
|
||||
s.Rating = ParseRating(trimText(sel))
|
||||
return false
|
||||
})
|
||||
if s.Rating == "" {
|
||||
doc.Find(".rating-box").EachWithBreak(func(_ int, sel *goquery.Selection) bool {
|
||||
s.Rating = ParseRating(trimText(sel))
|
||||
return false
|
||||
})
|
||||
}
|
||||
|
||||
// Stats .submission-page-stats > div[title=...] each holds <div>N</div>
|
||||
// <div class="highlight">Label</div>.
|
||||
doc.Find("div.submission-page-stats > div[title]").Each(func(_ int, sel *goquery.Selection) {
|
||||
title := strings.ToLower(trimAttr(sel, "title"))
|
||||
val := parseStatNumber(trimText(sel.Find("div").First()))
|
||||
switch title {
|
||||
case "views":
|
||||
s.Stats.Views = val
|
||||
case "favorites":
|
||||
s.Stats.Favorites = val
|
||||
case "comments":
|
||||
s.Stats.Comments = val
|
||||
}
|
||||
})
|
||||
|
||||
// Category / Theme / Species / Resolution / File Size are two parallel
|
||||
// span columns inside .submission-content-stats. Pair them up by index.
|
||||
statsBlock := doc.Find("div.submission-content-stats").First()
|
||||
if statsBlock.Length() > 0 {
|
||||
var labels []string
|
||||
statsBlock.Find("span.highlight > span").Each(func(_ int, sel *goquery.Selection) {
|
||||
labels = append(labels, strings.ToLower(strings.TrimSpace(sel.Text())))
|
||||
})
|
||||
var values []string
|
||||
statsBlock.ChildrenFiltered("span").Each(func(_ int, sel *goquery.Selection) {
|
||||
if class, _ := sel.Attr("class"); strings.Contains(class, "highlight") {
|
||||
return // skip the labels column
|
||||
}
|
||||
sel.ChildrenFiltered("span").Each(func(_ int, inner *goquery.Selection) {
|
||||
values = append(values, strings.TrimSpace(inner.Text()))
|
||||
})
|
||||
})
|
||||
for i := 0; i < len(labels) && i < len(values); i++ {
|
||||
switch labels[i] {
|
||||
case "category":
|
||||
s.Category = Category(values[i])
|
||||
case "type", "theme":
|
||||
s.Type = Type(values[i])
|
||||
case "species":
|
||||
s.Species = Species(values[i])
|
||||
case "gender":
|
||||
s.Gender = Gender(values[i])
|
||||
case "resolution":
|
||||
if w, h, ok := parseResolution(values[i]); ok {
|
||||
s.Width, s.Height = w, h
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Description section.submission-description holds the body inside
|
||||
// .section-body > .submission-description-text.
|
||||
descBody := doc.Find("section.submission-description div.submission-description-text").First()
|
||||
if descBody.Length() == 0 {
|
||||
descBody = doc.Find("div.submission-description").First()
|
||||
}
|
||||
s.Description = htmlOf(descBody)
|
||||
s.DescriptionText = strings.TrimSpace(descBody.Text())
|
||||
|
||||
// Tags anchors inside .submission-tags whose href targets the search.
|
||||
// Tag-block helper anchors and invalid system tags are filtered out.
|
||||
doc.Find("div.submission-tags span.tags a[href*='/search/']").Each(func(_ int, a *goquery.Selection) {
|
||||
t := strings.TrimSpace(a.Text())
|
||||
if t != "" {
|
||||
s.Tags = append(s.Tags, t)
|
||||
}
|
||||
})
|
||||
|
||||
// File URL FA renders a "Download" button in #submission-options that
|
||||
// links to the canonical file for *every* submission type. For visual
|
||||
// art it equals the #submissionImg source; for stories and music it's
|
||||
// the only correct source, because FA injects a generated thumbnail
|
||||
// (e.g. ".thumbnail.<name>.docx.gif") into #submissionImg there. So the
|
||||
// Download button is authoritative; #submissionImg is only a fallback.
|
||||
doc.Find("div#submission-options a").EachWithBreak(func(_ int, a *goquery.Selection) bool {
|
||||
if strings.EqualFold(trimText(a), "download") {
|
||||
s.FileURL = urls.AbsoluteCDN(trimAttr(a, "href"))
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
|
||||
// #submissionImg holds the inline image for visual art, or a generated
|
||||
// thumbnail for non-image submissions. It always supplies ThumbURL, but
|
||||
// only supplies FileURL when no Download button was found.
|
||||
img := doc.Find("#submissionImg").First()
|
||||
if img.Length() > 0 {
|
||||
s.ThumbURL = urls.AbsoluteCDN(firstNonEmpty(
|
||||
trimAttr(img, "data-preview-src"),
|
||||
trimAttr(img, "src"),
|
||||
))
|
||||
if s.FileURL == "" {
|
||||
s.FileURL = urls.AbsoluteCDN(firstNonEmpty(
|
||||
trimAttr(img, "data-fullview-src"),
|
||||
trimAttr(img, "src"),
|
||||
))
|
||||
}
|
||||
// Dimensions also live in width/height attrs on some pages.
|
||||
if w, err := strconv.Atoi(trimAttr(img, "data-fullview-width")); err == nil {
|
||||
s.Width = w
|
||||
}
|
||||
if h, err := strconv.Atoi(trimAttr(img, "data-fullview-height")); err == nil {
|
||||
s.Height = h
|
||||
}
|
||||
}
|
||||
// Legacy fallback for older themes that predate #submission-options.
|
||||
if s.FileURL == "" {
|
||||
dl := doc.Find("div.submission-controls-upper a[href*='/d.furaffinity.net/'], div.download a, a.download-logged-in").First()
|
||||
s.FileURL = urls.AbsoluteCDN(trimAttr(dl, "href"))
|
||||
}
|
||||
|
||||
// Prev / Next (FA's minigallery-navigation: "Newer" / "Older").
|
||||
doc.Find("div.minigallery-navigation a").Each(func(_ int, a *goquery.Selection) {
|
||||
href, _ := a.Attr("href")
|
||||
text := strings.ToLower(trimText(a))
|
||||
id := SubmissionID(extractIntFromHref(href))
|
||||
if id == 0 {
|
||||
return
|
||||
}
|
||||
switch {
|
||||
case strings.Contains(text, "newer"):
|
||||
// "Newer" goes to a more recent submission surface as Prev so
|
||||
// callers walking a gallery can call client.GetSubmission(s.Prev)
|
||||
// to step toward the newest.
|
||||
s.Prev = id
|
||||
case strings.Contains(text, "older"):
|
||||
s.Next = id
|
||||
}
|
||||
})
|
||||
// Legacy fallback for older themes that still use favorite-nav.
|
||||
if s.Prev == 0 && s.Next == 0 {
|
||||
doc.Find("div.favorite-nav a, .submission-nav a").Each(func(_ int, a *goquery.Selection) {
|
||||
href, _ := a.Attr("href")
|
||||
text := strings.ToLower(trimText(a))
|
||||
id := SubmissionID(extractIntFromHref(href))
|
||||
if id == 0 {
|
||||
return
|
||||
}
|
||||
switch {
|
||||
case strings.Contains(text, "prev"):
|
||||
s.Prev = id
|
||||
case strings.Contains(text, "next"):
|
||||
s.Next = id
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Favorite state FA renders exactly one of the "+Fav" / "−Fav" anchors
|
||||
// for an authenticated viewer; the "−Fav" (/unfav/) link means this
|
||||
// submission is currently favorited. findFavLinks (actions.go) already
|
||||
// scrapes both. An anonymous fetch shows neither, leaving Favorited false.
|
||||
if _, unfav := findFavLinks(doc, int64(s.ID)); unfav != "" {
|
||||
s.Favorited = true
|
||||
}
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// parseResolution splits a "2071 x 1779" string into width and height ints.
//
// The separator may be "x", "X" or the multiplication sign "×", with optional
// whitespace around either number. Returns ok=false on any malformed input —
// a missing or repeated separator, a non-numeric side, or a negative
// dimension — so callers can leave Width/Height at zero.
func parseResolution(s string) (w, h int, ok bool) {
	// Fold the accepted separator spellings into a single lowercase "x".
	norm := strings.Map(func(r rune) rune {
		if r == 'X' || r == '\u00d7' { // U+00D7 is the multiplication sign
			return 'x'
		}
		return r
	}, s)

	left, right, found := strings.Cut(norm, "x")
	if !found {
		return 0, 0, false
	}
	// A second separator stays in right and makes Atoi fail below, so
	// "1x2x3" is still rejected.
	wn, err := strconv.Atoi(strings.TrimSpace(left))
	if err != nil || wn < 0 {
		return 0, 0, false
	}
	hn, err := strconv.Atoi(strings.TrimSpace(right))
	if err != nil || hn < 0 {
		return 0, 0, false
	}
	return wn, hn, true
}
|
||||
Reference in New Issue
Block a user