initial commit
This commit is contained in:
65
system_message.go
Normal file
65
system_message.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package fa
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// classifySystemMessage looks at a fetched document and, when it is one of
|
||||
// FA's gate/error templates rather than a real content page, maps it to the
|
||||
// most specific sentinel error. Returns nil for normal content pages so
|
||||
// parsing continues.
|
||||
//
|
||||
// FA emits at least two distinct gate templates:
|
||||
//
|
||||
// - <title>System Error</title> generic "not found / no permission /
|
||||
// rate limited" page with the human message in a section-body.
|
||||
// - <title>Login Required …</title> auth-gated content (some submissions
|
||||
// are not viewable anonymously). Always maps to ErrUnauthorized.
|
||||
//
|
||||
// We classify by <title> first to pick a template, then by message text
|
||||
// within the section-body when needed. Anything else is left for the
|
||||
// downstream parser to handle.
|
||||
func classifySystemMessage(doc *goquery.Document) error {
|
||||
pageTitle := strings.ToLower(strings.TrimSpace(doc.Find("title").First().Text()))
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(pageTitle, "login required"):
|
||||
return ErrUnauthorized
|
||||
case pageTitle == "system error",
|
||||
strings.HasPrefix(pageTitle, "system error"):
|
||||
// fall through to body classification below
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
|
||||
headerTitle := trimText(doc.Find("section .section-header h2").First())
|
||||
if headerTitle == "" {
|
||||
headerTitle = trimText(doc.Find("h2").First())
|
||||
}
|
||||
body := strings.TrimSpace(doc.Find("section .section-body").First().Text())
|
||||
if body == "" {
|
||||
body = strings.TrimSpace(doc.Find(".section-body").First().Text())
|
||||
}
|
||||
|
||||
low := strings.ToLower(headerTitle + " " + body)
|
||||
switch {
|
||||
case strings.Contains(low, "not in our database"),
|
||||
strings.Contains(low, "submission not found"),
|
||||
strings.Contains(low, "user not found"),
|
||||
strings.Contains(low, "journal not found"),
|
||||
strings.Contains(low, "page not found"),
|
||||
strings.Contains(low, "no such"):
|
||||
return ErrNotFound
|
||||
case strings.Contains(low, "you must be logged in"),
|
||||
strings.Contains(low, "log in to view"),
|
||||
strings.Contains(low, "permission to view"),
|
||||
strings.Contains(low, "permission to access"):
|
||||
return ErrUnauthorized
|
||||
case strings.Contains(low, "rate limit"),
|
||||
strings.Contains(low, "too many requests"):
|
||||
return ErrRateLimited
|
||||
}
|
||||
return &SystemMessageError{Title: headerTitle, Body: body}
|
||||
}
|
||||
Reference in New Issue
Block a user