// go-fa-api/gallery_parser.go

package fa

import (
	"strings"

	"github.com/PuerkitoBio/goquery"

	"git.anthrove.art/public/go-fa-api/internal/urls"
)

// parseGalleryPage parses a single page of /gallery/, /scraps/,
// /favorites/, or /browse/. It returns the submission previews found on
// the page and reports whether a further page exists.
//
// useJSON toggles the experimental JSON-first merge. When true, the
// embedded js-submissionData blob is read first and becomes the primary
// source for title/author/avatar, with HTML scraping filling in what the
// JSON doesn't carry (rating, thumb, ID). When false the parser is pure
// HTML — the same behaviour as before [WithExperimentalJSONListings]
// existed.
func parseGalleryPage(doc *goquery.Document, useJSON bool) (items []*Submission, hasNext bool) {
	// The raw next-page URL from parseListingPage is not part of this
	// function's contract, so it is discarded here.
	previews, _, more := parseListingPage(doc, useJSON)
	return previews, more
}

// parseListingPage parses a single page of a listing endpoint. Besides the
// submission previews and the has-next flag it also hands back the raw
// next-page URL FA emits in its "Next" pagination form. Callers chaining
// across cursor-based pages (Favorites) consume that URL; callers that
// don't need it (Gallery / Scraps) can ignore it.
func parseListingPage(doc *goquery.Document, useJSON bool) (items []*Submission, nextURL string, hasNext bool) {
	// The enrichment map stays at its zero value unless JSON merging was
	// requested, in which case it is read from the document.
	var enrichment listingJSONMap
	if useJSON {
		enrichment = readListingJSON(doc)
	}

	// Every <figure id="sid-…"> is a candidate preview; figures that do
	// not parse into a submission are skipped.
	collect := func(_ int, figure *goquery.Selection) {
		preview := parseGalleryFigure(figure, enrichment)
		if preview == nil {
			return
		}
		items = append(items, preview)
	}
	doc.Find("figure[id^=sid-]").Each(collect)

	nextURL, hasNext = nextPageURL(doc)
	return items, nextURL, hasNext
}

// parseGalleryFigure builds a Submission preview from one
// <figure id="sid-…"> element. Shared between gallery, browse, favorites,
// search, and the submission inbox.
//
// It returns nil when the figure's id attribute does not yield a non-zero
// submission ID.
//
// When jsonData holds an entry for this submission's ID, the JSON values
// win for title, author display name, lower-cased author name, and avatar.
// Rating, ThumbURL, and ID always come from the HTML — those aren't
// represented in the JSON blob.
func parseGalleryFigure(sel *goquery.Selection, jsonData listingJSONMap) *Submission {
	// The element id has the form "sid-<number>"; strip the prefix and
	// reject anything that fails to parse or parses to zero.
	rawID, _ := sel.Attr("id")
	id, err := parseID[SubmissionID](strings.TrimPrefix(rawID, "sid-"))
	if err != nil {
		return nil
	}
	if id == 0 {
		return nil
	}
	sub := &Submission{ID: id}

	// Title and thumbnail both hang off the first /view/ link.
	if link := sel.Find("a[href^='/view/']").First(); link.Length() > 0 {
		caption := sel.Find("figcaption p:first-child").First()
		// Title preference: link title attribute, then the first caption
		// paragraph, then the link's own text.
		sub.Title = firstNonEmpty(
			trimAttr(link, "title"),
			trimText(caption),
			trimText(link),
		)

		// Thumbnail preference: data-src before src.
		thumb := link.Find("img").First()
		src := firstNonEmpty(
			trimAttr(thumb, "data-src"),
			trimAttr(thumb, "src"),
		)
		sub.ThumbURL = urls.AbsoluteCDN(src)
	}

	// The rating is encoded as a class on the figure, e.g.
	// figure.t-image.r-general. Markers are checked adult, mature,
	// general; the first one found wins and no match leaves Rating unset.
	classes, _ := sel.Attr("class")
	if strings.Contains(classes, "r-adult") {
		sub.Rating = RatingAdult
	} else if strings.Contains(classes, "r-mature") {
		sub.Rating = RatingMature
	} else if strings.Contains(classes, "r-general") {
		sub.Rating = RatingGeneral
	}

	// Author from figcaption (favorites/browse render an artist link there).
	if link := sel.Find("figcaption a[href^='/user/']").First(); link.Length() > 0 {
		ref := UserRef{DisplayName: trimText(link)}
		href, _ := link.Attr("href")
		// "/user/<name>/" splits into ["user", "<name>", …]; the second
		// segment is the account name.
		segments := strings.Split(strings.Trim(href, "/"), "/")
		if len(segments) > 1 {
			ref.Name = strings.ToLower(segments[1])
		}
		sub.Author = ref
	}

	// data-tags on the figure's <img> carries both the unprefixed keyword
	// list and the prefixed system tags (s_/c_/a_/u_/t_). Splitting it lets
	// callers classify listing items without an extra /view/ fetch. Attr
	// reports false when no such <img> exists, so no separate length check
	// is needed.
	if tags, present := sel.Find("img[data-tags]").First().Attr("data-tags"); present {
		applyListingDataTags(sub, tags)
	}

	// JSON enrichment: the blob is the preferred source for the fields it
	// carries. A lookup on a nil map simply misses, so the HTML-only path
	// returns here as well.
	entry, found := jsonData[id]
	if !found {
		return sub
	}
	if entry.Title != "" {
		sub.Title = entry.Title
	}
	if entry.Username != "" {
		sub.Author.DisplayName = entry.Username
	}
	if entry.Lower != "" {
		sub.Author.Name = entry.Lower
	}
	if avatar := avatarURLFromMtime(entry.Lower, entry.AvatarMtime); avatar != "" {
		sub.Author.AvatarURL = avatar
	}
	return sub
}

// applyListingDataTags breaks the whitespace-separated data-tags attribute
// FA emits on listing-page <img> elements into tokens and files each one
// under CategorizedTags (when it starts with a known single-letter prefix:
// s_/c_/a_/u_/t_) or under Tags (everything else). Results are appended to
// whatever s already holds.
//
// The prefix mapping mirrors the /view/ parser in submission_parser.go so a
// listing-path Submission carries the same categorisation a /view/-path one
// would, modulo tokens FA can't represent in this flat attribute (multi-word
// tags, the a_ vs u_ distinction).
func applyListingDataTags(s *Submission, raw string) {
	for _, token := range strings.Fields(raw) {
		// A token only counts as prefixed when it looks like "x_name" with
		// a non-empty name; otherwise prefix stays 0 and falls to default.
		var prefix byte
		if len(token) > 2 && token[1] == '_' {
			prefix = token[0]
		}

		switch prefix {
		case 's':
			s.CategorizedTags.Species = append(s.CategorizedTags.Species, token[2:])
		case 'c':
			s.CategorizedTags.Characters = append(s.CategorizedTags.Characters, token[2:])
		case 'a', 'u':
			// Both prefixes land in Artists.
			s.CategorizedTags.Artists = append(s.CategorizedTags.Artists, token[2:])
		case 't':
			s.CategorizedTags.Types = append(s.CategorizedTags.Types, token[2:])
		default:
			// Unprefixed tokens and unknown prefixes are kept verbatim.
			s.Tags = append(s.Tags, token)
		}
	}
}