Files
go-fa-api/system_message.go
2026-05-25 22:27:18 +02:00

66 lines
2.2 KiB
Go

package fa
import (
"strings"
"github.com/PuerkitoBio/goquery"
)
// classifySystemMessage decides whether a fetched document is one of FA's
// gate/error templates instead of a genuine content page. When it is, the
// most specific matching sentinel error is returned; for ordinary content
// pages the result is nil so the caller can carry on parsing.
//
// Two gate templates are recognised, selected by the page <title>
// (whitespace-trimmed, compared case-insensitively):
//
//   - A title starting with "Login Required": auth-gated content (some
//     submissions are not viewable anonymously). Always ErrUnauthorized.
//   - A title starting with "System Error": the generic "not found / no
//     permission / rate limited" page, whose human-readable message sits in
//     a section-body. The header and body text are searched for known
//     phrases to choose between ErrNotFound, ErrUnauthorized and
//     ErrRateLimited.
//
// A "System Error" page whose text matches none of the known phrases is
// reported as a *SystemMessageError carrying the extracted header title and
// body. Documents with any other title are left to the downstream parser.
func classifySystemMessage(doc *goquery.Document) error {
	title := strings.ToLower(strings.TrimSpace(doc.Find("title").First().Text()))
	if strings.HasPrefix(title, "login required") {
		return ErrUnauthorized
	}
	// The prefix test also covers a title that is exactly "system error".
	if !strings.HasPrefix(title, "system error") {
		return nil
	}

	// Prefer the header inside the section; fall back to the first <h2>
	// anywhere in the document when that yields nothing.
	heading := trimText(doc.Find("section .section-header h2").First())
	if heading == "" {
		heading = trimText(doc.Find("h2").First())
	}

	// Same strategy for the message: scoped selector first, then any
	// .section-body element.
	message := strings.TrimSpace(doc.Find("section .section-body").First().Text())
	if message == "" {
		message = strings.TrimSpace(doc.Find(".section-body").First().Text())
	}

	// Header and body are searched together, lower-cased, so a phrase may
	// appear in either part and in any letter case.
	haystack := strings.ToLower(heading + " " + message)
	mentions := func(phrases ...string) bool {
		for _, phrase := range phrases {
			if strings.Contains(haystack, phrase) {
				return true
			}
		}
		return false
	}

	// Order matters: the first matching category wins.
	if mentions(
		"not in our database",
		"submission not found",
		"user not found",
		"journal not found",
		"page not found",
		"no such",
	) {
		return ErrNotFound
	}
	if mentions(
		"you must be logged in",
		"log in to view",
		"permission to view",
		"permission to access",
	) {
		return ErrUnauthorized
	}
	if mentions("rate limit", "too many requests") {
		return ErrRateLimited
	}

	return &SystemMessageError{Title: heading, Body: message}
}