New and generic scrape algorithm #11

Merged
SoXX merged 20 commits from dev/algorithm into main 2024-11-01 21:39:21 +00:00
4 changed files with 152 additions and 31 deletions
Showing only changes of commit 41882b9bfb - Show all commits

View File

@ -1,12 +0,0 @@
package otter
import (
"context"
"git.anthrove.art/Anthrove/otter-space-sdk/v4/pkg/database"
"git.anthrove.art/Anthrove/otter-space-sdk/v4/pkg/models"
)
// ConnectToDatabase hands ctx and config to database.Connect of the
// otter-space-sdk and reports whatever error that call produces.
func ConnectToDatabase(ctx context.Context, config models.DatabaseConfig) error {
	if err := database.Connect(ctx, config); err != nil {
		return err
	}
	return nil
}

129
pkg/plug/algorithm.go Normal file
View File

@ -0,0 +1,129 @@
package plug
import (
"context"
"git.anthrove.art/Anthrove/otter-space-sdk/v4/pkg/models"
log "github.com/sirupsen/logrus"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"gorm.io/gorm"
)
// User carries the profile data a Plug returns from GetUserProfile.
type User struct {
	// userFavoriteCount is the favorite count reported by the plug; Algorithm
	// compares it with the number of favorites stored in the database.
	userFavoriteCount int64

	// userName and userID are not read by the code in this file.
	// NOTE(review): presumably the user's name and ID on the remote source — confirm.
	userName, userID string
}
// Favorites is one page of results returned by Plug.GetFavoritePage.
type Favorites struct {
	// posts are the favorites found on this page; Algorithm passes them to
	// BatchPostProcessingWithSummery and stops when the slice is empty.
	posts []models.Post

	// nextPage is used by Algorithm as the pageIdentifier of the following
	// GetFavoritePage call. lastPage is not read by the code in this file.
	nextPage, lastPage string
}
// Plug is the source-specific API that Algorithm drives to scrape favorites.
//
// For both methods apiKey may be the empty string when the user did not supply
// one; implementations should then fall back to their default API key.
type Plug interface {
	// GetFavoritePage returns the page of favorites of userSource that is
	// addressed by pageIdentifier. Algorithm passes an empty pageIdentifier
	// for the first request and Favorites.nextPage for every later one.
	GetFavoritePage(ctx context.Context, apiKey string, userSource models.UserSource, pageIdentifier string) (Favorites, error)

	// GetUserProfile returns the profile information of userSource.
	GetUserProfile(ctx context.Context, apiKey string, userSource models.UserSource) (User, error)
}
// Algorithm runs the generic favorites scrape for userSource through plugInterface.
//
// It reads the number of favorites already stored in db for the user source,
// asks the plug for the user's profile and, while the stored count is lower
// than the count reported by the profile, requests favorite pages and passes
// their posts to BatchPostProcessingWithSummery.
//
// apiKey is forwarded unchanged to the plug calls.
// NOTE(review): deepScrape is not read anywhere in this function body.
//
// It returns the accumulated TaskSummery together with the first error that
// occurred; on an error the summary collected so far is returned alongside it.
func Algorithm(ctx context.Context, plugInterface Plug, db *gorm.DB, userSource models.UserSource, deepScrape bool, apiKey string) (TaskSummery, error) {
// Open a tracing span covering the whole run and tag it with the user source identifiers.
ctx, span := tracer.Start(ctx, "mainScrapeAlgorithm")
defer span.End()
span.SetAttributes(
attribute.String("user_source_id", string(userSource.ID)),
attribute.String("user_source_user_id", string(userSource.UserID)),
attribute.String("user_source_source_id", string(userSource.SourceID)),
)
// The same identifiers are attached to every log entry written by this function.
basicLoggingInfo := log.Fields{
"user_source_id": userSource.ID,
"user_source_user_id": userSource.UserID,
"user_source_source_id": userSource.SourceID,
}
log.WithContext(ctx).WithFields(basicLoggingInfo).Info("Starting mainScrapeAlgorithm")
// Both counters start at zero; only AddedPosts is changed later in this function.
taskSummery := TaskSummery{
AddedPosts: 0,
DeletedPosts: 0,
}
// Number of favorites already stored for this user and user source.
anthroveUserFavCount, err := getUserFavoriteCountFromDatabase(ctx, db, userSource.UserID, userSource.ID)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.WithContext(ctx).WithFields(basicLoggingInfo).WithError(err).Error("Failed to get user favorite count from db")
return taskSummery, err
}
Alphyron marked this conversation as resolved Outdated

Hier muss eine Sonderbedingung gemacht werden, weil UserFavoriteCount ja -1 sein kann (wenn es nicht ausgelesen werden kann).

Hier muss eine Sonderbedingung gemacht werden, weil UserFavoriteCount ja -1 sein kann (wenn es nicht ausgelesen werden kann).

@SoXX, to fix this problem and the issue one comment below, I would recommend changing the algorithm to the following:

@SoXX, to fix this problem and the issue one comment below, I would recommend changing the algorithm to the following:
[https://www.plantuml.com/plantuml/uml/RP1FRzH03CNl_XIlj-mDhea4BIgWIXMAgChTk-D9HyoVoEFija9yTtOIYTBIkOtzUy_Flgp6QakAT34hJucnLFaXQk70ySQZPA8LeVwsiCDz5Hsr-105Nal269VfQhmPwFJGQftf8ZiwFw3_AeOlV1nvsk1LFH2mUSbZg1RoXB5KgnlHs7__rv_-vvkdRFqtFPgcYQwSGoxsch62APOzHypdF-AvERo9RsEUSS_7bKPNr8aYL8Gq5pNETe4i9wa67xJQRa1B43owpo_TFk3T3hy80FOg_9E0tslOQ_4X2xx9esr7i8AxW_8i0qbswtM9-liv5cuvywkr1kg_or6qIfk4spLdBYUKw9vpzNyTjZXTi1Thm1v4fPKODN6CSC5xbNmGxCLE8haX2LtYfxtWFLBzkDji7Pj0nHRDk5jIOdtgYQgLcIubkoN5Fm00](https://plantuml.github.io/plantuml-core/raw.html?VLB1RXCn4BtxArulrUu5qMiMWGejg5A5LbJbxcJ76mlRinhF9ktVOsS3j57epPxtvisRPrzCOiTeCMVwIQ9-OyET0oTZibfHixfdE-0na_J2pWU24uxempnUztUTsMzhhN4ij2DfZG6yGsEJReLtL1k_sjqNdfFY0wP5uzsnpEAL5kpoyWtm8zwtq2qbD2epjMK8i1Qolyg9qk1TdRlwHehIsDlnwB8gTHKvB45FFgjF8thEcezxLEwB-ytdC_oYIKVyd4RjgKtNmu34UifHnXXuRGEcSaSeO7UMlepvHtTsTs2ZOHb8u3Dyd5YqX7k1H7igcFEIjUtm9_ZRrsgdoU4_qTsGyudmdPvqoQJOUDY8dQWpXNLuPMDMZrEeoIHE9rjsPlMZVe1CTJ3k3xOJxy5XU67phyBYhl7wQglgLAYMB7Aq8Q3uSq9fGrYJxgdiKIWHJRJFQy8LOlxKBhA3LOAGpfBmCCTevRchdvYdtm00)
fixed an issue @SoXX https://plantuml.github.io/plantuml-core/raw.html?ZL9DJzmm4BtxLpmkABbKzBefL53Q2WbLKH6zUpVZh5N74uqdk_3lQoTBAGkjUeldVSoRoPoCOll1OahqWqJzneOR1ux69BMYPdNBjiDz8cc5dGy49poW3LD_sTuqPhyjjSgnq8waDWRm3fMDkXNUKH5-iRjFF4N51uoBnxj3cSKhBTZfwJ_02vpLq2r5L2eJrRa9i1QoF_CNnSwxqdVrbHHPsDh-aB8uDK-HvdBBIHFEpuyCyHV7UNXwy4bzx0_YPVvdi_bzVTMyvkSiTC3VyYYQ8hhiEaJMOuuo-i1h6p3cDq86cpVfIvjdkhExcOsAfIE1J_33oABvfvmWqfuovhmahHiyRUVpIxUB_rpGDJaV2T_eGPUTv1Xt6x4ZDIQpvkimPdK_XhAPoF5eoiQilfV3ILGQutQE7NaF3jqVCNyxBEagho-cAac5IiaY3JO4v7nEA6u8Qz8zvJaoPK0rsCSruJs6zB5UR3kh126T9E9k6WOhZqqvVXv_0G00
profile, err := plugInterface.GetUserProfile(ctx, apiKey, userSource)
Alphyron marked this conversation as resolved Outdated

Issue still exists
Anthrove/plug-e621#15

Issue still exists https://git.anthrove.art/Anthrove/plug-e621/issues/15
// Abort when the plug could not deliver the user profile.
// NOTE(review): unlike the other error paths in this function, this one
// neither records the error on the span nor logs it.
if err != nil {
return taskSummery, err
}
// Page identifier handed to the plug; empty for the first request.
nextPage := ""
// anthroveUserFavCount is not updated inside the loop, so once the loop is
// entered it only ends through one of the break or return statements below.
outer:
for anthroveUserFavCount < profile.userFavoriteCount {
select {
case <-ctx.Done():
// Stop scraping once the context is done.
// NOTE(review): this path reaches the final return with a nil error.
break outer
default:
span.AddEvent("Executing getFavorites request")
favorites, err := plugInterface.GetFavoritePage(ctx, apiKey, userSource, nextPage)
span.AddEvent("Finished executing getFavorites request")
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.WithContext(ctx).WithFields(basicLoggingInfo).WithError(err).Error("Failed to execute favorites page")
return taskSummery, err
}
// An empty page ends the scrape.
if len(favorites.posts) <= 0 {
span.AddEvent("No more favorites found")
log.WithContext(ctx).WithFields(basicLoggingInfo).Info("No more favorites found")
break outer
}
summery, err := BatchPostProcessingWithSummery(ctx, userSource, favorites.posts)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.WithContext(ctx).WithFields(basicLoggingInfo).WithError(err).Error("Failed in BatchPostProcessing")
return taskSummery, err
}
// Stop as soon as a page did not add every one of its posts as a favorite.
// NOTE(review): the break happens before AddedPosts is increased, so the
// favorites added from this last page are not part of the returned summary.
if summery.AddedFavorites != int64(len(favorites.posts)) {
span.AddEvent("user has no more favorites to add")
break outer
}
// Continue with the page the plug reported as next and count the added favorites.
nextPage = favorites.nextPage
taskSummery.AddedPosts += int(summery.AddedFavorites)
}
}
span.AddEvent("Completed scraping algorithm")
log.WithContext(ctx).WithFields(basicLoggingInfo).Info("Completed scraping algorithm")
return taskSummery, nil
}
// getUserFavoriteCountFromDatabase counts the models.UserFavorite rows whose
// user_id and user_source_id match the given identifiers.
//
// The count written by the query and the query's error, if any, are returned
// unchanged.
func getUserFavoriteCountFromDatabase(ctx context.Context, gorm *gorm.DB, userID models.UserID, userSourceID models.UserSourceID) (int64, error) {
	var total int64

	query := gorm.WithContext(ctx).
		Model(&models.UserFavorite{}).
		Where("user_id = ? AND user_source_id = ?", userID, userSourceID)

	if result := query.Count(&total); result.Error != nil {
		return total, result.Error
	}
	return total, nil
}

View File

@ -17,20 +17,20 @@ import (
type server struct {
gRPC.UnimplementedPlugConnectorServer
ctx map[string]context.CancelFunc
taskExecutionFunction TaskExecution
sendMessageExecution SendMessageExecution
getMessageExecution GetMessageExecution
source models.Source
ctx map[string]context.CancelFunc
plugInterface Plug
sendMessageExecution SendMessageExecution
getMessageExecution GetMessageExecution
source models.Source
}
func NewGrpcServer(source models.Source, taskExecutionFunction TaskExecution, sendMessageExecution SendMessageExecution, getMessageExecution GetMessageExecution) gRPC.PlugConnectorServer {
func NewGrpcServer(source models.Source, plugAPIInterface Plug, sendMessageExecution SendMessageExecution, getMessageExecution GetMessageExecution) gRPC.PlugConnectorServer {
return &server{
ctx: make(map[string]context.CancelFunc),
taskExecutionFunction: taskExecutionFunction,
sendMessageExecution: sendMessageExecution,
getMessageExecution: getMessageExecution,
source: source,
ctx: make(map[string]context.CancelFunc),
plugInterface: plugAPIInterface,
sendMessageExecution: sendMessageExecution,
getMessageExecution: getMessageExecution,
source: source,
}
}
@ -116,7 +116,12 @@ func (s *server) TaskStart(ctx context.Context, creation *gRPC.PlugTaskCreation)
go func() {
var err error
taskSummery, err := s.taskExecutionFunction(taskCtx, userSource, creation.DeepScrape, creation.ApiKey)
gorm, err := database.GetGorm(taskCtx)
log.WithContext(taskCtx).WithError(err).WithField("task_id", id).Error("Failed to get Gorm client")
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
taskSummery, err := Algorithm(taskCtx, s.plugInterface, gorm, userSource, creation.DeepScrape, creation.ApiKey)
if err != nil {
log.WithContext(taskCtx).WithError(err).WithField("task_id", id).Error("Task execution failed")
span.RecordError(err)

View File

@ -28,14 +28,13 @@ type TaskSummery struct {
DeletedPosts int
}
type TaskExecution func(ctx context.Context, userSource models.UserSource, deepScrape bool, apiKey string) (TaskSummery, error)
type SendMessageExecution func(ctx context.Context, userSource models.UserSource, message string) error
type GetMessageExecution func(ctx context.Context, userSource models.UserSource) ([]Message, error)
var (
taskExecutionFunction TaskExecution
sendMessageExecution SendMessageExecution
getMessageExecution GetMessageExecution
sendMessageExecution SendMessageExecution
getMessageExecution GetMessageExecution
plugAPIInterface Plug
)
func Listen(ctx context.Context, listenAddr string, source models.Source) error {
@ -78,7 +77,7 @@ func Listen(ctx context.Context, listenAddr string, source models.Source) error
grpc.StatsHandler(otelgrpc.NewServerHandler()),
)
pb.RegisterPlugConnectorServer(grpcServer, NewGrpcServer(source, taskExecutionFunction, sendMessageExecution, getMessageExecution))
pb.RegisterPlugConnectorServer(grpcServer, NewGrpcServer(source, plugAPIInterface, sendMessageExecution, getMessageExecution))
go func() {
err = grpcServer.Serve(lis)
@ -98,8 +97,8 @@ func Listen(ctx context.Context, listenAddr string, source models.Source) error
return nil
}
func SetTaskExecutionFunction(function TaskExecution) {
taskExecutionFunction = function
func SetTaskExecutionFunction(plugInterface Plug) {
Alphyron marked this conversation as resolved Outdated

Irgendwie passt der Name der Funktion nicht zu dem, was gemacht wird. Es wird weder eine Funktion gesetzt, noch wird ein Execution-Stuff gesetzt.
Hier ist es ja nur ein Interface als Schnittstelle zu der PlugAPI.

Irgendwie passt der Name der Funktion nicht zu dem, was gemacht wird. Es wird weder eine Funktion gesetzt, noch wird ein Execution-Stuff gesetzt. Hier ist es ja nur ein Interface als Schnittstelle zu der PlugAPI.
plugAPIInterface = plugInterface
}
func SetSendMessageExecutionFunction(function SendMessageExecution) {