diff --git a/gallery_parser.go b/gallery_parser.go index 53fed6a..037ceca 100644 --- a/gallery_parser.go +++ b/gallery_parser.go @@ -85,6 +85,15 @@ func parseGalleryFigure(sel *goquery.Selection, jsonData listingJSONMap) *Submis } } + // data-tags on the figure's <img> carries both the unprefixed keyword + // list and the prefixed system tags (s_/c_/a_/u_/t_). Splitting it lets + // callers classify listing items without an extra /view/ fetch. + if img := sel.Find("img[data-tags]").First(); img.Length() > 0 { + if raw, ok := img.Attr("data-tags"); ok { + applyListingDataTags(s, raw) + } + } + // JSON enrichment preferred sources for the fields it carries. if jsonData != nil { if entry, ok := jsonData[id]; ok { @@ -105,3 +114,35 @@ func parseGalleryFigure(sel *goquery.Selection, jsonData listingJSONMap) *Submis return s } + +// applyListingDataTags splits the whitespace-separated data-tags attribute +// FA emits on listing-page <img> elements and routes each token to either +// CategorizedTags (when the token has a known single-letter prefix +// s_/c_/a_/u_/t_) or Tags (everything else). +// +// The prefix mapping mirrors the /view/ parser in submission_parser.go so a +// listing-path Submission carries the same categorisation a /view/-path one +// would, modulo tokens FA can't represent in this flat attribute (multi-word +// tags, the a_ vs u_ distinction). 
+func applyListingDataTags(s *Submission, raw string) { + for _, tok := range strings.Fields(raw) { + if len(tok) >= 3 && tok[1] == '_' { + name := tok[2:] + switch tok[0] { + case 's': + s.CategorizedTags.Species = append(s.CategorizedTags.Species, name) + continue + case 'c': + s.CategorizedTags.Characters = append(s.CategorizedTags.Characters, name) + continue + case 'a', 'u': + s.CategorizedTags.Artists = append(s.CategorizedTags.Artists, name) + continue + case 't': + s.CategorizedTags.Types = append(s.CategorizedTags.Types, name) + continue + } + } + s.Tags = append(s.Tags, tok) + } +} diff --git a/gallery_parser_test.go b/gallery_parser_test.go index ee631da..44db617 100644 --- a/gallery_parser_test.go +++ b/gallery_parser_test.go @@ -62,6 +62,99 @@ func TestParseGalleryPage_Synthetic(t *testing.T) { } } +func TestParseGalleryFigure_DataTags(t *testing.T) { + const html = ` +
+ + + +
+
+ + + +
+
+ + + +
+` + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatalf("setup: %v", err) + } + items, _ := parseGalleryPage(doc, false) + if len(items) != 3 { + t.Fatalf("items = %d; want 3", len(items)) + } + + // Mixed prefixed + unprefixed. + mixed := items[0] + wantTags := []string{"wolf", "solo", "digital", "landscape"} + if !equalStrings(mixed.Tags, wantTags) { + t.Errorf("items[0].Tags = %v; want %v", mixed.Tags, wantTags) + } + if !equalStrings(mixed.CategorizedTags.Species, []string{"wolf"}) { + t.Errorf("items[0].Species = %v", mixed.CategorizedTags.Species) + } + if !equalStrings(mixed.CategorizedTags.Characters, []string{"artwork_digital"}) { + t.Errorf("items[0].Characters = %v", mixed.CategorizedTags.Characters) + } + if !equalStrings(mixed.CategorizedTags.Types, []string{"all"}) { + t.Errorf("items[0].Types = %v", mixed.CategorizedTags.Types) + } + if !equalStrings(mixed.CategorizedTags.Artists, []string{"someartist"}) { + t.Errorf("items[0].Artists = %v", mixed.CategorizedTags.Artists) + } + + // Missing data-tags: both slices stay nil. + if items[1].Tags != nil { + t.Errorf("items[1].Tags = %v; want nil", items[1].Tags) + } + if items[1].CategorizedTags.Species != nil || + items[1].CategorizedTags.Characters != nil || + items[1].CategorizedTags.Artists != nil || + items[1].CategorizedTags.Types != nil { + t.Errorf("items[1].CategorizedTags = %+v; want zero", items[1].CategorizedTags) + } + + // Unprefixed-only: everything lands in Tags. 
+ if !equalStrings(items[2].Tags, []string{"wolf", "solo"}) { + t.Errorf("items[2].Tags = %v", items[2].Tags) + } + if items[2].CategorizedTags.Species != nil { + t.Errorf("items[2].Species = %v; want nil", items[2].CategorizedTags.Species) + } +} + +func TestParseGalleryPage_RealFixtureTags(t *testing.T) { + raw := loadFixture(t, "gallery_page1.html") + doc, err := goquery.NewDocumentFromReader(bytes.NewReader(raw)) + if err != nil { + t.Fatalf("read doc: %v", err) + } + items, _ := parseGalleryPage(doc, false) + if len(items) == 0 { + t.Fatal("real fixture: no items parsed") + } + var withTags, withSpecies int + for _, it := range items { + if len(it.Tags) > 0 { + withTags++ + } + if len(it.CategorizedTags.Species) > 0 { + withSpecies++ + } + } + if withTags == 0 { + t.Error("no items got Tags populated from data-tags") + } + if withSpecies == 0 { + t.Error("no items got CategorizedTags.Species populated from data-tags") + } +} + func TestParseGalleryPage_RealFixture(t *testing.T) { raw := loadFixture(t, "gallery_page1.html") doc, err := goquery.NewDocumentFromReader(bytes.NewReader(raw)) diff --git a/submission.go b/submission.go index 04323e9..89e0333 100644 --- a/submission.go +++ b/submission.go @@ -35,10 +35,19 @@ type Submission struct { Gender Gender Description string // raw HTML; sanitise before rendering to a browser DescriptionText string // plaintext convenience - Tags []string + // Tags holds the user-supplied keyword tags. On /view/-path Submissions + // these come from div.submission-tags anchors. On listing-path + // Submissions (Gallery/Scraps/Favorites/Browse/Search/SubmissionInbox) + // they come from the figure's data-tags attribute, which carries the + // same keywords FA renders on /view/ for that submission. + Tags []string // CategorizedTags groups FA's prefixed system tags by category. - // FA emits these as tag-block entries inside div.submission-tags with - // prefixes s_ (species), c_ (character), a_/u_ (artist), and t_ (type). 
+ // On /view/-path Submissions FA emits these as tag-block entries inside + // div.submission-tags with prefixes s_ (species), c_ (character), + // a_/u_ (artist), and t_ (type). On listing-path Submissions the same + // prefixed tokens are parsed out of the figure's data-tags attribute; + // the a_ vs u_ distinction is lost there because FA collapses both into + // u_ in that flat list. CategorizedTags CategorizedTags FileURL string // absolute CDN URL; pass to Download ThumbURL string