feat: basic structure with database

Alphyron 2024-08-17 15:59:49 +02:00
parent 5c11b233f0
commit 822a2b39b7
10 changed files with 3008 additions and 1 deletion

Cargo.lock generated (2772 lines)
File diff suppressed because it is too large

Cargo.toml
@@ -4,3 +4,19 @@ version = "0.1.0"
edition = "2021"
[dependencies]
dotenvy = "0.15"
thiserror = { version = "1.0" }
anyhow = "1.0"
cached = { version = "0.53", features = ["proc_macro", "async"] }
chrono = "0.4"
log = "0.4"
env_logger = "0.11"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
tokio = { version = "1.39", features = ["full"] }
image = "0.25"
img_hash = "3.2"
futures = "0.3"
bk-tree = "0.5"
hamming = "0.1"
sqlx = { version = "0.8", features = [ "runtime-tokio", "migrate", "postgres" ] }

README.md Normal file (14 lines)
@@ -0,0 +1,14 @@
# Fuzzysearch
This project aims to implement a reverse image search that can be used within the Anthrove infrastructure.
## Requirements
- OtterSpace Postgres Connection
- Redis
## Concept
OtterSpace is used to look up which posts the Anthrove project has already collected and aggregated.
Each post is then checked against the database for an existing duplicate so that this project can merge them.
There are multiple scenarios that need to be defined:
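
As a rough illustration of the duplicate check described above, a minimal sketch built on the `Tree` type added in this commit; the function name and the distance threshold of 3 are assumptions, not part of this commit:

// Hypothetical sketch; `is_duplicate` and the threshold are illustrative only.
use crate::fuzzy::tree::{HashDistance, Tree};

async fn is_duplicate(tree: &Tree, hash: i64) -> bool {
    let results = tree.find([HashDistance { hash, distance: 3 }]).await;
    results.iter().any(|matches| !matches.is_empty())
}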

docker-compose.yml Normal file (10 lines)
@@ -0,0 +1,10 @@
services:
  db:
    image: postgres
    ports:
      - 5432:5432
    environment:
      POSTGRES_USER: anthrove
      POSTGRES_DB: anthrove
      POSTGRES_PASSWORD: anthrove
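
Given these credentials, the connection string for sqlx, for example in a .env file read by dotenvy, would look like the following; the variable name DATABASE_URL is the sqlx convention and is an assumption, not part of this commit:

DATABASE_URL=postgres://anthrove:anthrove@localhost:5432/anthrove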

New SQL migration (filename not shown)
@@ -0,0 +1,6 @@
-- Add migration script here
CREATE TABLE FuzzyHashes (
    id SERIAL NOT NULL PRIMARY KEY,
    hash BIGINT NOT NULL,
    post_id CHAR(25) NOT NULL
);
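
For reference, a hedged sketch of how a row could be inserted with sqlx; the helper name `store_hash` is an assumption, while the table and column names match the migration above:

// Hypothetical helper; table and column names match the migration above.
async fn store_hash(pool: &sqlx::PgPool, hash: i64, post_id: &str) -> anyhow::Result<()> {
    sqlx::query("INSERT INTO FuzzyHashes (hash, post_id) VALUES ($1, $2)")
        .bind(hash)
        .bind(post_id)
        .execute(pool)
        .await?;
    Ok(())
}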

src/db.rs Normal file (16 lines)
@@ -0,0 +1,16 @@
use sqlx::migrate::Migrator;
use sqlx::postgres::PgPoolOptions;
use sqlx::{Pool, Postgres};

// Migrations are embedded at compile time; defaults to "./migrations".
static MIGRATOR: Migrator = sqlx::migrate!();

pub async fn connect(uri: &str) -> anyhow::Result<Pool<Postgres>> {
    Ok(PgPoolOptions::new().connect(uri).await?)
}

pub async fn migrate(pool: Pool<Postgres>) -> anyhow::Result<()> {
    Ok(MIGRATOR.run(&pool).await?)
}
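
A possible startup sequence wiring these helpers together, assuming a DATABASE_URL environment variable and the dotenvy dependency from Cargo.toml; this entry point is illustrative and not part of this commit:

// Hypothetical entry point using the helpers above.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    dotenvy::dotenv().ok();
    let pool = db::connect(&std::env::var("DATABASE_URL")?).await?;
    db::migrate(pool.clone()).await?;
    Ok(())
}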

src/fuzzy.rs Normal file (2 lines)
@@ -0,0 +1,2 @@
pub mod image;
pub mod tree;

src/fuzzy/image.rs Normal file (22 lines)
@@ -0,0 +1,22 @@
use std::io::{BufRead, Seek};

/// Decode an image and reduce it to a 64-bit perceptual gradient hash.
pub async fn generate_hash<T>(img_reader: image::ImageReader<T>) -> anyhow::Result<i64>
where
    T: BufRead + Seek + Send + 'static,
{
    use img_hash::{HashAlg::Gradient, HasherConfig};

    let img_hasher = HasherConfig::with_bytes_type::<[u8; 8]>()
        .hash_alg(Gradient)
        .hash_size(8, 8)
        .preproc_dct()
        .to_hasher();

    // Decoding and hashing are CPU-bound, so run them off the async executor.
    let hash = tokio::task::spawn_blocking(move || -> anyhow::Result<i64> {
        let im = img_reader.decode()?;
        // Note: img_hash targets an older `image` release, so this conversion
        // assumes compatible versions of the two crates.
        let image_hash = img_hasher.hash_image(&im.into());
        // An 8x8 hash is exactly 8 bytes; pack it big-endian into an i64 for storage.
        let hash: [u8; 8] = image_hash.as_bytes().try_into()?;
        Ok(i64::from_be_bytes(hash))
    })
    .await??;

    Ok(hash)
}
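
For example, hashing a file from disk could look like this; the helper name and path are placeholders, and `image::ImageReader::open` returns a buffered file reader that satisfies the bounds above:

// Hypothetical usage; the path argument is a placeholder.
async fn hash_file(path: &str) -> anyhow::Result<i64> {
    let reader = image::ImageReader::open(path)?;
    generate_hash(reader).await
}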

src/fuzzy/tree.rs Normal file (144 lines)
@@ -0,0 +1,144 @@
/*
 * Copyright by Syfaro https://github.com/Syfaro/bkapi/blob/8e80419b0ecbde5aa1e0d95124e70081ba0d6119/bkapi/src/tree.rs
 */
use std::sync::Arc;

use bk_tree::BKTree;
use log::{debug, error};
use serde::{Deserialize, Serialize};
use tokio::sync::RwLock;

/// A BKTree wrapper to cover common operations.
#[derive(Clone)]
pub struct Tree {
    pub tree: Arc<RwLock<BKTree<Node, Hamming>>>,
}

/// A hash and distance pair. May be used for searching or in search results.
#[derive(Serialize)]
pub struct HashDistance {
    pub hash: i64,
    pub distance: u32,
}

impl Tree {
    /// Create an empty tree.
    pub fn new() -> Self {
        Self {
            tree: Arc::new(RwLock::new(BKTree::new(Hamming))),
        }
    }

    /// Replace tree contents with the results of a SQL query.
    ///
    /// The tree is only replaced after it finishes loading, leaving stale/empty
    /// data available while running.
    pub(crate) async fn reload(&self, pool: &sqlx::PgPool) -> anyhow::Result<()> {
        use futures::TryStreamExt;
        use sqlx::Row;

        let mut new_tree = BKTree::new(Hamming);
        // Stream every stored hash; the table name matches this commit's migration.
        let mut rows = sqlx::query("SELECT hash FROM FuzzyHashes").fetch(pool);
        let mut count = 0;

        while let Some(row) = rows.try_next().await? {
            let node: Node = row.get::<i64, _>(0).into();

            if new_tree.find_exact(&node).is_none() {
                new_tree.add(node);
            }

            count += 1;
            if count % 250_000 == 0 {
                debug!("loaded {count} rows");
            }
        }

        let mut tree = self.tree.write().await;
        *tree = new_tree;

        Ok(())
    }

    /// Add a hash to the tree, returning whether it was newly added.
    pub async fn add(&self, hash: i64) -> bool {
        let node = Node::from(hash);

        let is_new_hash = {
            let tree = self.tree.read().await;
            tree.find_exact(&node).is_none()
        };

        if is_new_hash {
            let mut tree = self.tree.write().await;
            tree.add(node);
        }

        debug!("added hash");
        is_new_hash
    }

    /// Attempt to find any number of hashes within the tree.
    pub async fn find<H>(&self, hashes: H) -> Vec<Vec<HashDistance>>
    where
        H: IntoIterator<Item = HashDistance>,
    {
        let tree = self.tree.read().await;

        hashes
            .into_iter()
            .map(|HashDistance { hash, distance }| Self::search(&tree, hash, distance))
            .collect()
    }

    /// Search a read-locked tree for a hash with a given distance.
    fn search(tree: &BKTree<Node, Hamming>, hash: i64, distance: u32) -> Vec<HashDistance> {
        debug!("searching tree");

        let results: Vec<_> = tree
            .find(&hash.into(), distance)
            .map(|item| HashDistance {
                distance: item.0,
                hash: (*item.1).into(),
            })
            .collect();

        debug!("found {} results", results.len());
        results
    }
}

/// A hamming distance metric.
#[derive(Serialize, Deserialize)]
pub struct Hamming;

impl bk_tree::Metric<Node> for Hamming {
    fn distance(&self, a: &Node, b: &Node) -> u32 {
        match hamming::distance_fast(&a.0, &b.0) {
            Ok(distance) => distance as u32,
            Err(err) => {
                error!("hashes did not have same byte alignment: {err:?}");
                u32::MAX
            }
        }
    }

    fn threshold_distance(&self, a: &Node, b: &Node, _threshold: u32) -> Option<u32> {
        Some(self.distance(a, b))
    }
}

/// A value of a node in the BK tree.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct Node([u8; 8]);

impl From<i64> for Node {
    fn from(num: i64) -> Self {
        Self(num.to_be_bytes())
    }
}

impl From<Node> for i64 {
    fn from(node: Node) -> Self {
        i64::from_be_bytes(node.0)
    }
}
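
A minimal lifecycle sketch for the tree, assuming a pool from src/db.rs and a hash from src/fuzzy/image.rs; the function name and the distance threshold of 3 are assumptions:

// Hypothetical lifecycle; the threshold of 3 is illustrative only.
async fn check_hash(pool: &sqlx::PgPool, new_hash: i64) -> anyhow::Result<Vec<Vec<HashDistance>>> {
    let tree = Tree::new();
    tree.reload(pool).await?;
    let _is_new = tree.add(new_hash).await;
    Ok(tree.find([HashDistance { hash: new_hash, distance: 3 }]).await)
}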

src/main.rs
@@ -1,3 +1,8 @@
mod fuzzy;
mod db;

fn main() {
    println!("Hello, world!");
}