feat: basic structure with database
This commit is contained in:
parent
5c11b233f0
commit
822a2b39b7
2772
Cargo.lock
generated
2772
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
16
Cargo.toml
16
Cargo.toml
@ -4,3 +4,19 @@ version = "0.1.0"
|
|||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
dotenvy = "0.15"
|
||||||
|
thiserror = { version = "1.0" }
|
||||||
|
anyhow = "1.0"
|
||||||
|
cached = { version = "0.53", features = ["proc_macro", "async"] }
|
||||||
|
chrono = "0.4"
|
||||||
|
log = "0.4"
|
||||||
|
env_logger = "0.11"
|
||||||
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
|
serde_json = "1.0"
|
||||||
|
tokio = { version = "1.39", features = ["full"] }
|
||||||
|
image = "0.25"
|
||||||
|
img_hash = "3.2"
|
||||||
|
futures = "0.3"
|
||||||
|
bk-tree = "0.5"
|
||||||
|
hamming = "0.1"
|
||||||
|
sqlx = { version = "0.8", features = [ "runtime-tokio", "migrate", "postgres" ] }
|
14
README.md
Normal file
14
README.md
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Fuzzysearch
|
||||||
|
|
||||||
|
This project aims to implement a reverse image search that can be used within the Anthrove infrastructure.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
- OtterSpace Postgres Connection
|
||||||
|
- Redis
|
||||||
|
|
||||||
|
|
||||||
|
## Concept
|
||||||
|
OtterSpace is used to obtain information about which posts the Anthrove project has already collected and aggregated.
|
||||||
|
This project checks those posts against the database for existing duplicates so that it can merge them. There are multiple scenarios that need to be defined:
|
||||||
|
|
10
docker-compose.yml
Normal file
10
docker-compose.yml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
|
||||||
|
services:
|
||||||
|
db:
|
||||||
|
image: postgres
|
||||||
|
ports:
|
||||||
|
- 5432:5432
|
||||||
|
environment:
|
||||||
|
POSTGRES_USER: anthrove
|
||||||
|
POSTGRES_DB: anthrove
|
||||||
|
POSTGRES_PASSWORD: anthrove
|
6
migrations/20240815064629_initial-database.sql
Normal file
6
migrations/20240815064629_initial-database.sql
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
-- Add migration script here
-- Stores one perceptual hash per post. The hash is the 8-byte gradient/DCT
-- image hash packed into a BIGINT (see src/fuzzy/image.rs); post_id is the
-- 25-char OtterSpace post identifier.
-- Fixes: PostgreSQL does not accept MySQL-style backtick quoting, the id
-- column was missing its trailing comma, and the statement lacked a
-- terminating semicolon.
CREATE TABLE FuzzyHashes(
    id SERIAL NOT NULL PRIMARY KEY,
    hash BIGINT NOT NULL,
    post_id CHAR(25) NOT NULL
);
|
16
src/db.rs
Normal file
16
src/db.rs
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
use sqlx::{Pool, Postgres};
|
||||||
|
use sqlx::postgres::PgPoolOptions;
|
||||||
|
use sqlx::sqlx_macros::migrate;
|
||||||
|
|
||||||
|
use sqlx::migrate::Migrator;
|
||||||
|
|
||||||
|
static MIGRATOR: Migrator = sqlx::migrate!(); // defaults to "./migrations"
|
||||||
|
|
||||||
|
pub async fn connect(uri: &str) -> anyhow::Result<Pool<Postgres>> {
|
||||||
|
Ok(PgPoolOptions::new()
|
||||||
|
.connect(uri).await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn migrate(pool: Pool<Postgres>) -> anyhow::Result<()> {
|
||||||
|
Ok(MIGRATOR.run(&pool).await?)
|
||||||
|
}
|
2
src/fuzzy.rs
Normal file
2
src/fuzzy.rs
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
mod image;
|
||||||
|
mod tree;
|
22
src/fuzzy/image.rs
Normal file
22
src/fuzzy/image.rs
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
pub async fn generate_hash<T>(img_reader: image::ImageReader<T>) -> anyhow::Result<i64> {
|
||||||
|
use img_hash::{HashAlg::Gradient, HasherConfig};
|
||||||
|
|
||||||
|
let img_hasher = HasherConfig::with_bytes_type::<[u8; 8]>()
|
||||||
|
.hash_alg(Gradient)
|
||||||
|
.hash_size(8, 8)
|
||||||
|
.preproc_dct()
|
||||||
|
.to_hasher();
|
||||||
|
|
||||||
|
let hash = tokio::task::spawn_blocking(move || -> anyhow::Result<i64, anyhow::Error> {
|
||||||
|
let im = img_reader.decode()?;
|
||||||
|
|
||||||
|
let image_hash = img_hasher.hash_image(&im.into());
|
||||||
|
let hash: [u8; 8] = image_hash.as_bytes().try_into()?;
|
||||||
|
let hash = i64::from_be_bytes(hash);
|
||||||
|
|
||||||
|
Ok(hash)
|
||||||
|
}).await??;
|
||||||
|
|
||||||
|
Ok(hash)
|
||||||
|
}
|
||||||
|
|
144
src/fuzzy/tree.rs
Normal file
144
src/fuzzy/tree.rs
Normal file
@ -0,0 +1,144 @@
|
|||||||
|
/*
|
||||||
|
* Copyright by Syfaro https://github.com/Syfaro/bkapi/blob/8e80419b0ecbde5aa1e0d95124e70081ba0d6119/bkapi/src/tree.rs
|
||||||
|
*/
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::RwLock;
|
||||||
|
use bk_tree::BKTree;
|
||||||
|
use log::{debug, error, info};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// A BKTree wrapper to cover common operations.
///
/// Cheap to clone: clones share the same underlying tree through the `Arc`.
#[derive(Clone)]
pub struct Tree {
    // tokio RwLock: many concurrent readers (searches) with exclusive
    // writers (add/reload).
    pub tree: Arc<RwLock<BKTree<Node, Hamming>>>,
}
|
||||||
|
|
||||||
|
/// A hash and distance pair. May be used for searching or in search results.
#[derive(serde::Serialize)]
pub struct HashDistance {
    // The 64-bit perceptual hash (big-endian packing of 8 hash bytes).
    pub hash: i64,
    // Hamming distance: as a query input, the maximum allowed distance;
    // in results, the distance from the queried hash.
    pub distance: u32,
}
|
||||||
|
|
||||||
|
impl Tree {
|
||||||
|
/// Create an empty tree.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
tree: Arc::new(RwLock::new(BKTree::new(Hamming))),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Replace tree contents with the results of a SQL query.
|
||||||
|
///
|
||||||
|
/// The tree is only replaced after it finishes loading, leaving stale/empty
|
||||||
|
/// data available while running.
|
||||||
|
pub(crate) async fn reload(&self) -> anyhow::Result<()> {
|
||||||
|
let mut new_tree = BKTree::new(Hamming);
|
||||||
|
let mut rows; // TODO get all row hashes!
|
||||||
|
|
||||||
|
let mut count = 0;
|
||||||
|
|
||||||
|
while let Some(row) = rows.try_next().await? {
|
||||||
|
let node: Node = row.get::<i64, _>(0).into();
|
||||||
|
|
||||||
|
if new_tree.find_exact(&node).is_none() {
|
||||||
|
new_tree.add(node);
|
||||||
|
}
|
||||||
|
|
||||||
|
count += 1;
|
||||||
|
if count % 250_000 == 0 {
|
||||||
|
debug!("loaded more rows");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut tree = self.tree.write().await;
|
||||||
|
*tree = new_tree;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a hash to the tree, returning if it already existed.
|
||||||
|
pub async fn add(&self, hash: i64) -> bool {
|
||||||
|
let node = Node::from(hash);
|
||||||
|
|
||||||
|
let is_new_hash = {
|
||||||
|
let tree = self.tree.read().await;
|
||||||
|
tree.find_exact(&node).is_none()
|
||||||
|
};
|
||||||
|
|
||||||
|
if is_new_hash {
|
||||||
|
let mut tree = self.tree.write().await;
|
||||||
|
tree.add(node);
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("added hash");
|
||||||
|
is_new_hash
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Attempt to find any number of hashes within the tree.
|
||||||
|
pub async fn find<H>(&self, hashes: H) -> Vec<Vec<HashDistance>>
|
||||||
|
where
|
||||||
|
H: IntoIterator<Item = HashDistance>,
|
||||||
|
{
|
||||||
|
let tree = self.tree.read().await;
|
||||||
|
|
||||||
|
hashes
|
||||||
|
.into_iter()
|
||||||
|
.map(|HashDistance { hash, distance }| Self::search(&tree, hash, distance))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Search a read-locked tree for a hash with a given distance.
|
||||||
|
fn search(tree: &BKTree<Node, Hamming>, hash: i64, distance: u32) -> Vec<HashDistance> {
|
||||||
|
debug!("searching tree");
|
||||||
|
|
||||||
|
let results: Vec<_> = tree
|
||||||
|
.find(&hash.into(), distance)
|
||||||
|
.map(|item| HashDistance {
|
||||||
|
distance: item.0,
|
||||||
|
hash: (*item.1).into(),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
debug!("found results");
|
||||||
|
results
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A hamming distance metric.
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct Hamming;
|
||||||
|
|
||||||
|
impl bk_tree::Metric<Node> for Hamming {
|
||||||
|
fn distance(&self, a: &Node, b: &Node) -> u32 {
|
||||||
|
let distance_result = hamming::distance_fast(&a.0, &b.0);
|
||||||
|
|
||||||
|
match distance_result {
|
||||||
|
Ok(distance) => distance as u32,
|
||||||
|
Err(err) => {
|
||||||
|
error!("hashes did not have same byte alignment: {err:?}");
|
||||||
|
return u32::MAX
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn threshold_distance(&self, a: &Node, b: &Node, _threshold: u32) -> Option<u32> {
|
||||||
|
Some(self.distance(a, b))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A value of a node in the BK tree.
///
/// Stored as the big-endian bytes of the i64 hash so the Hamming metric can
/// operate on raw bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct Node([u8; 8]);

impl From<i64> for Node {
    // Big-endian byte order must match `From<Node> for i64` below for the
    // round-trip to be lossless.
    fn from(num: i64) -> Self {
        Self(num.to_be_bytes())
    }
}

impl From<Node> for i64 {
    fn from(node: Node) -> Self {
        i64::from_be_bytes(node.0)
    }
}
|
@ -1,3 +1,8 @@
|
|||||||
|
mod fuzzy;
|
||||||
|
mod db;
|
||||||
|
|
||||||
|
|
||||||
/// Placeholder entry point; wiring of db/fuzzy modules is not done yet.
/// (The diff view had duplicated `fn main() {` and `}` lines; this is the
/// reconstructed function.)
fn main() {
    println!("Hello, world!");
}
|
||||||
|
Loading…
Reference in New Issue
Block a user