66 lines
2.2 KiB
Go
66 lines
2.2 KiB
Go
package fa
|
|
|
|
import (
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// classifySystemMessage looks at a fetched document and, when it is one of
|
|
// FA's gate/error templates rather than a real content page, maps it to the
|
|
// most specific sentinel error. Returns nil for normal content pages so
|
|
// parsing continues.
|
|
//
|
|
// FA emits at least two distinct gate templates:
|
|
//
|
|
// - <title>System Error</title> generic "not found / no permission /
|
|
// rate limited" page with the human message in a section-body.
|
|
// - <title>Login Required …</title> auth-gated content (some submissions
|
|
// are not viewable anonymously). Always maps to ErrUnauthorized.
|
|
//
|
|
// We classify by <title> first to pick a template, then by message text
|
|
// within the section-body when needed. Anything else is left for the
|
|
// downstream parser to handle.
|
|
func classifySystemMessage(doc *goquery.Document) error {
|
|
pageTitle := strings.ToLower(strings.TrimSpace(doc.Find("title").First().Text()))
|
|
|
|
switch {
|
|
case strings.HasPrefix(pageTitle, "login required"):
|
|
return ErrUnauthorized
|
|
case pageTitle == "system error",
|
|
strings.HasPrefix(pageTitle, "system error"):
|
|
// fall through to body classification below
|
|
default:
|
|
return nil
|
|
}
|
|
|
|
headerTitle := trimText(doc.Find("section .section-header h2").First())
|
|
if headerTitle == "" {
|
|
headerTitle = trimText(doc.Find("h2").First())
|
|
}
|
|
body := strings.TrimSpace(doc.Find("section .section-body").First().Text())
|
|
if body == "" {
|
|
body = strings.TrimSpace(doc.Find(".section-body").First().Text())
|
|
}
|
|
|
|
low := strings.ToLower(headerTitle + " " + body)
|
|
switch {
|
|
case strings.Contains(low, "not in our database"),
|
|
strings.Contains(low, "submission not found"),
|
|
strings.Contains(low, "user not found"),
|
|
strings.Contains(low, "journal not found"),
|
|
strings.Contains(low, "page not found"),
|
|
strings.Contains(low, "no such"):
|
|
return ErrNotFound
|
|
case strings.Contains(low, "you must be logged in"),
|
|
strings.Contains(low, "log in to view"),
|
|
strings.Contains(low, "permission to view"),
|
|
strings.Contains(low, "permission to access"):
|
|
return ErrUnauthorized
|
|
case strings.Contains(low, "rate limit"),
|
|
strings.Contains(low, "too many requests"):
|
|
return ErrRateLimited
|
|
}
|
|
return &SystemMessageError{Title: headerTitle, Body: body}
|
|
}
|