feat(listing): populate Tags and CategorizedTags from figure data-tags
FA's beta listing pages emit each submission's tag list on the
figure's <img data-tags="..."> attribute, mixing prefixed system tags
(s_/c_/a_/u_/t_) with the unprefixed keyword list. Reading it during
gallery-page parse lets callers classify favorites/gallery/scraps/
browse/search/inbox items at scrape time, avoiding a /view/{id}
round-trip per submission.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -85,6 +85,15 @@ func parseGalleryFigure(sel *goquery.Selection, jsonData listingJSONMap) *Submis
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// data-tags on the figure's <img> carries both the unprefixed keyword
|
||||||
|
// list and the prefixed system tags (s_/c_/a_/u_/t_). Splitting it lets
|
||||||
|
// callers classify listing items without an extra /view/ fetch.
|
||||||
|
if img := sel.Find("img[data-tags]").First(); img.Length() > 0 {
|
||||||
|
if raw, ok := img.Attr("data-tags"); ok {
|
||||||
|
applyListingDataTags(s, raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// JSON enrichment preferred sources for the fields it carries.
|
// JSON enrichment preferred sources for the fields it carries.
|
||||||
if jsonData != nil {
|
if jsonData != nil {
|
||||||
if entry, ok := jsonData[id]; ok {
|
if entry, ok := jsonData[id]; ok {
|
||||||
@@ -105,3 +114,35 @@ func parseGalleryFigure(sel *goquery.Selection, jsonData listingJSONMap) *Submis
|
|||||||
|
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// applyListingDataTags splits the whitespace-separated data-tags attribute
|
||||||
|
// FA emits on listing-page <img> elements and routes each token to either
|
||||||
|
// CategorizedTags (when the token has a known single-letter prefix
|
||||||
|
// s_/c_/a_/u_/t_) or Tags (everything else).
|
||||||
|
//
|
||||||
|
// The prefix mapping mirrors the /view/ parser in submission_parser.go so a
|
||||||
|
// listing-path Submission carries the same categorisation a /view/-path one
|
||||||
|
// would, modulo tokens FA can't represent in this flat attribute (multi-word
|
||||||
|
// tags, the a_ vs u_ distinction).
|
||||||
|
func applyListingDataTags(s *Submission, raw string) {
|
||||||
|
for _, tok := range strings.Fields(raw) {
|
||||||
|
if len(tok) >= 3 && tok[1] == '_' {
|
||||||
|
name := tok[2:]
|
||||||
|
switch tok[0] {
|
||||||
|
case 's':
|
||||||
|
s.CategorizedTags.Species = append(s.CategorizedTags.Species, name)
|
||||||
|
continue
|
||||||
|
case 'c':
|
||||||
|
s.CategorizedTags.Characters = append(s.CategorizedTags.Characters, name)
|
||||||
|
continue
|
||||||
|
case 'a', 'u':
|
||||||
|
s.CategorizedTags.Artists = append(s.CategorizedTags.Artists, name)
|
||||||
|
continue
|
||||||
|
case 't':
|
||||||
|
s.CategorizedTags.Types = append(s.CategorizedTags.Types, name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.Tags = append(s.Tags, tok)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -62,6 +62,99 @@ func TestParseGalleryPage_Synthetic(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseGalleryFigure_DataTags(t *testing.T) {
|
||||||
|
const html = `<html><body>
|
||||||
|
<figure id="sid-2001" class="t-image r-general">
|
||||||
|
<a href="/view/2001/" title="Mixed Tags">
|
||||||
|
<img data-tags="u_someartist c_artwork_digital t_all s_wolf wolf solo digital landscape" src="//d.example/thumb/2001.png"/>
|
||||||
|
</a>
|
||||||
|
</figure>
|
||||||
|
<figure id="sid-2002" class="t-image r-general">
|
||||||
|
<a href="/view/2002/" title="No Tags">
|
||||||
|
<img src="//d.example/thumb/2002.png"/>
|
||||||
|
</a>
|
||||||
|
</figure>
|
||||||
|
<figure id="sid-2003" class="t-image r-general">
|
||||||
|
<a href="/view/2003/" title="Only Keywords">
|
||||||
|
<img data-tags="wolf solo" src="//d.example/thumb/2003.png"/>
|
||||||
|
</a>
|
||||||
|
</figure>
|
||||||
|
</body></html>`
|
||||||
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("setup: %v", err)
|
||||||
|
}
|
||||||
|
items, _ := parseGalleryPage(doc, false)
|
||||||
|
if len(items) != 3 {
|
||||||
|
t.Fatalf("items = %d; want 3", len(items))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mixed prefixed + unprefixed.
|
||||||
|
mixed := items[0]
|
||||||
|
wantTags := []string{"wolf", "solo", "digital", "landscape"}
|
||||||
|
if !equalStrings(mixed.Tags, wantTags) {
|
||||||
|
t.Errorf("items[0].Tags = %v; want %v", mixed.Tags, wantTags)
|
||||||
|
}
|
||||||
|
if !equalStrings(mixed.CategorizedTags.Species, []string{"wolf"}) {
|
||||||
|
t.Errorf("items[0].Species = %v", mixed.CategorizedTags.Species)
|
||||||
|
}
|
||||||
|
if !equalStrings(mixed.CategorizedTags.Characters, []string{"artwork_digital"}) {
|
||||||
|
t.Errorf("items[0].Characters = %v", mixed.CategorizedTags.Characters)
|
||||||
|
}
|
||||||
|
if !equalStrings(mixed.CategorizedTags.Types, []string{"all"}) {
|
||||||
|
t.Errorf("items[0].Types = %v", mixed.CategorizedTags.Types)
|
||||||
|
}
|
||||||
|
if !equalStrings(mixed.CategorizedTags.Artists, []string{"someartist"}) {
|
||||||
|
t.Errorf("items[0].Artists = %v", mixed.CategorizedTags.Artists)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Missing data-tags: both slices stay nil.
|
||||||
|
if items[1].Tags != nil {
|
||||||
|
t.Errorf("items[1].Tags = %v; want nil", items[1].Tags)
|
||||||
|
}
|
||||||
|
if items[1].CategorizedTags.Species != nil ||
|
||||||
|
items[1].CategorizedTags.Characters != nil ||
|
||||||
|
items[1].CategorizedTags.Artists != nil ||
|
||||||
|
items[1].CategorizedTags.Types != nil {
|
||||||
|
t.Errorf("items[1].CategorizedTags = %+v; want zero", items[1].CategorizedTags)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unprefixed-only: everything lands in Tags.
|
||||||
|
if !equalStrings(items[2].Tags, []string{"wolf", "solo"}) {
|
||||||
|
t.Errorf("items[2].Tags = %v", items[2].Tags)
|
||||||
|
}
|
||||||
|
if items[2].CategorizedTags.Species != nil {
|
||||||
|
t.Errorf("items[2].Species = %v; want nil", items[2].CategorizedTags.Species)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseGalleryPage_RealFixtureTags(t *testing.T) {
|
||||||
|
raw := loadFixture(t, "gallery_page1.html")
|
||||||
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(raw))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read doc: %v", err)
|
||||||
|
}
|
||||||
|
items, _ := parseGalleryPage(doc, false)
|
||||||
|
if len(items) == 0 {
|
||||||
|
t.Fatal("real fixture: no items parsed")
|
||||||
|
}
|
||||||
|
var withTags, withSpecies int
|
||||||
|
for _, it := range items {
|
||||||
|
if len(it.Tags) > 0 {
|
||||||
|
withTags++
|
||||||
|
}
|
||||||
|
if len(it.CategorizedTags.Species) > 0 {
|
||||||
|
withSpecies++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if withTags == 0 {
|
||||||
|
t.Error("no items got Tags populated from data-tags")
|
||||||
|
}
|
||||||
|
if withSpecies == 0 {
|
||||||
|
t.Error("no items got CategorizedTags.Species populated from data-tags")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseGalleryPage_RealFixture(t *testing.T) {
|
func TestParseGalleryPage_RealFixture(t *testing.T) {
|
||||||
raw := loadFixture(t, "gallery_page1.html")
|
raw := loadFixture(t, "gallery_page1.html")
|
||||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(raw))
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(raw))
|
||||||
|
|||||||
@@ -35,10 +35,19 @@ type Submission struct {
|
|||||||
Gender Gender
|
Gender Gender
|
||||||
Description string // raw HTML; sanitise before rendering to a browser
|
Description string // raw HTML; sanitise before rendering to a browser
|
||||||
DescriptionText string // plaintext convenience
|
DescriptionText string // plaintext convenience
|
||||||
Tags []string
|
// Tags holds the user-supplied keyword tags. On /view/-path Submissions
|
||||||
|
// these come from div.submission-tags anchors. On listing-path
|
||||||
|
// Submissions (Gallery/Scraps/Favorites/Browse/Search/SubmissionInbox)
|
||||||
|
// they come from the figure's data-tags attribute, which carries the
|
||||||
|
// same keywords FA renders on /view/ for that submission.
|
||||||
|
Tags []string
|
||||||
// CategorizedTags groups FA's prefixed system tags by category.
|
// CategorizedTags groups FA's prefixed system tags by category.
|
||||||
// FA emits these as tag-block entries inside div.submission-tags with
|
// On /view/-path Submissions FA emits these as tag-block entries inside
|
||||||
// prefixes s_ (species), c_ (character), a_/u_ (artist), and t_ (type).
|
// div.submission-tags with prefixes s_ (species), c_ (character),
|
||||||
|
// a_/u_ (artist), and t_ (type). On listing-path Submissions the same
|
||||||
|
// prefixed tokens are parsed out of the figure's data-tags attribute;
|
||||||
|
// the a_ vs u_ distinction is lost there because FA collapses both into
|
||||||
|
// u_ in that flat list.
|
||||||
CategorizedTags CategorizedTags
|
CategorizedTags CategorizedTags
|
||||||
FileURL string // absolute CDN URL; pass to Download
|
FileURL string // absolute CDN URL; pass to Download
|
||||||
ThumbURL string
|
ThumbURL string
|
||||||
|
|||||||
Reference in New Issue
Block a user