feat: basic structure with database
parent 5c11b233f0
commit 822a2b39b7
Cargo.lock (generated, 2772 changed lines)
File diff suppressed because it is too large.
Cargo.toml (16 changed lines)
@@ -4,3 +4,19 @@ version = "0.1.0"
edition = "2021"

[dependencies]
dotenvy = "0.15"
thiserror = "1.0"
anyhow = "1.0"
cached = { version = "0.53", features = ["proc_macro", "async"] }
chrono = "0.4"
log = "0.4"
env_logger = "0.11"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
tokio = { version = "1.39", features = ["full"] }
image = "0.25"
img_hash = "3.2"
futures = "0.3"
bk-tree = "0.5"
hamming = "0.1"
sqlx = { version = "0.8", features = ["runtime-tokio", "migrate", "postgres"] }
README.md (new file, 14 lines)
@@ -0,0 +1,14 @@
# Fuzzysearch

This project aims to implement a reverse image search that can be used within the Anthrove infrastructure.

## Requirements

- OtterSpace Postgres connection
- Redis

## Concept

OtterSpace provides the information about which posts the Anthrove project has already collected and aggregated. This project checks each post for an existing duplicate in the database so that duplicates can be merged. There are multiple scenarios that still need to be defined:
docker-compose.yml (new file, 10 lines)
@@ -0,0 +1,10 @@
services:
  db:
    image: postgres
    ports:
      - "5432:5432"
    environment:
      POSTGRES_USER: anthrove
      POSTGRES_DB: anthrove
      POSTGRES_PASSWORD: anthrove
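
For local development the connection string follows directly from these credentials. A minimal sketch, assuming a .env file read via the dotenvy crate from Cargo.toml (the DATABASE_URL variable name is an assumption; nothing in this commit defines it):

// Assumed .env entry: DATABASE_URL=postgres://anthrove:anthrove@localhost:5432/anthrove
fn database_url() -> anyhow::Result<String> {
    dotenvy::dotenv().ok(); // load .env if present; continue silently otherwise
    Ok(std::env::var("DATABASE_URL")?)
}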
migrations/20240815064629_initial-database.sql (new file, 6 lines)
@@ -0,0 +1,6 @@
-- Add migration script here
-- Note: backtick-quoted identifiers are MySQL syntax and fail on Postgres;
-- plain identifiers are used here, and the missing comma and terminating
-- semicolon are added so the migration actually runs.
CREATE TABLE FuzzyHashes(
    id SERIAL NOT NULL PRIMARY KEY,
    hash BIGINT NOT NULL,
    post_id CHAR(25) NOT NULL
);
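
Once this migration has run, rows can be written with plain sqlx queries. A minimal sketch; the helper name insert_hash is hypothetical, and it assumes post ids fit the CHAR(25) column:

use sqlx::{Pool, Postgres};

// Hypothetical helper: persist one perceptual hash for a post.
pub async fn insert_hash(pool: &Pool<Postgres>, hash: i64, post_id: &str) -> anyhow::Result<()> {
    sqlx::query("INSERT INTO FuzzyHashes (hash, post_id) VALUES ($1, $2)")
        .bind(hash)
        .bind(post_id)
        .execute(pool)
        .await?;
    Ok(())
}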
src/db.rs (new file, 16 lines)
@@ -0,0 +1,16 @@
use sqlx::migrate::Migrator;
use sqlx::postgres::PgPoolOptions;
use sqlx::{Pool, Postgres};

// Embeds all migration files at compile time; defaults to "./migrations".
// (The stray `use sqlx::sqlx_macros::migrate;` import is dropped: the macro
// is already invoked via its full path, and the import would shadow the
// `migrate` function below.)
static MIGRATOR: Migrator = sqlx::migrate!();

pub async fn connect(uri: &str) -> anyhow::Result<Pool<Postgres>> {
    Ok(PgPoolOptions::new().connect(uri).await?)
}

pub async fn migrate(pool: Pool<Postgres>) -> anyhow::Result<()> {
    Ok(MIGRATOR.run(&pool).await?)
}
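
A minimal sketch of wiring these two functions together at startup; the URI is an assumption matching the docker-compose.yml credentials, not something this commit configures:

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Assumed URI; matches the docker-compose.yml credentials above.
    let pool = db::connect("postgres://anthrove:anthrove@localhost:5432/anthrove").await?;
    db::migrate(pool.clone()).await?; // Pool is cheap to clone (internally Arc-backed)
    Ok(())
}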
src/fuzzy.rs (new file, 2 lines)
@@ -0,0 +1,2 @@
mod image;
mod tree;
src/fuzzy/image.rs (new file, 22 lines)
@@ -0,0 +1,22 @@
use std::io::{BufRead, Seek};

use img_hash::{HashAlg::Gradient, HasherConfig};

/// Decode an image and compute a 64-bit perceptual gradient hash.
///
/// The reader must be Send + 'static so decoding can run on a blocking thread.
/// Note: img_hash 3.2 bundles its own `image` version, so the decoded image
/// may need converting into the type img_hash expects.
pub async fn generate_hash<T>(img_reader: image::ImageReader<T>) -> anyhow::Result<i64>
where
    T: BufRead + Seek + Send + 'static,
{
    let img_hasher = HasherConfig::with_bytes_type::<[u8; 8]>()
        .hash_alg(Gradient)
        .hash_size(8, 8)
        .preproc_dct()
        .to_hasher();

    // Decoding and hashing are CPU-bound; keep them off the async executor.
    let hash = tokio::task::spawn_blocking(move || -> anyhow::Result<i64> {
        let im = img_reader.decode()?;

        let image_hash = img_hasher.hash_image(&im.into());
        // The 8x8 gradient hash is exactly 8 bytes, which packs into an i64.
        let hash: [u8; 8] = image_hash.as_bytes().try_into()?;
        let hash = i64::from_be_bytes(hash);

        Ok(hash)
    })
    .await??;

    Ok(hash)
}
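
A sketch of calling this with a file from disk; the path is illustrative:

// Hypothetical usage: hash an image file from disk.
async fn hash_file() -> anyhow::Result<i64> {
    let reader = image::ImageReader::open("some_post.png")?; // path is illustrative
    generate_hash(reader).await
}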
src/fuzzy/tree.rs (new file, 144 lines)
@@ -0,0 +1,144 @@
/*
 * Copyright by Syfaro https://github.com/Syfaro/bkapi/blob/8e80419b0ecbde5aa1e0d95124e70081ba0d6119/bkapi/src/tree.rs
 */
use std::sync::Arc;

use bk_tree::BKTree;
use futures::TryStreamExt;
use log::{debug, error};
use serde::{Deserialize, Serialize};
use sqlx::Row;
use tokio::sync::RwLock;

/// A BKTree wrapper to cover common operations.
#[derive(Clone)]
pub struct Tree {
    pub tree: Arc<RwLock<BKTree<Node, Hamming>>>,
}

/// A hash and distance pair. May be used for searching or in search results.
#[derive(Serialize)]
pub struct HashDistance {
    pub hash: i64,
    pub distance: u32,
}

impl Tree {
    /// Create an empty tree.
    pub fn new() -> Self {
        Self {
            tree: Arc::new(RwLock::new(BKTree::new(Hamming))),
        }
    }

    /// Replace tree contents with the results of a SQL query.
    ///
    /// The tree is only replaced after it finishes loading, leaving stale/empty
    /// data available while running.
    pub(crate) async fn reload(&self, pool: &sqlx::PgPool) -> anyhow::Result<()> {
        let mut new_tree = BKTree::new(Hamming);
        // TODO get all row hashes! The query below is a sketch assuming the
        // FuzzyHashes table from the initial migration; the final query may differ.
        let mut rows = sqlx::query("SELECT hash FROM FuzzyHashes").fetch(pool);

        let mut count = 0;

        while let Some(row) = rows.try_next().await? {
            let node: Node = row.get::<i64, _>(0).into();

            if new_tree.find_exact(&node).is_none() {
                new_tree.add(node);
            }

            count += 1;
            if count % 250_000 == 0 {
                debug!("loaded more rows");
            }
        }

        let mut tree = self.tree.write().await;
        *tree = new_tree;

        Ok(())
    }

    /// Add a hash to the tree, returning if it already existed.
    pub async fn add(&self, hash: i64) -> bool {
        let node = Node::from(hash);

        // Take the cheap read lock first; only escalate to a write lock
        // when the hash is actually new.
        let is_new_hash = {
            let tree = self.tree.read().await;
            tree.find_exact(&node).is_none()
        };

        if is_new_hash {
            let mut tree = self.tree.write().await;
            tree.add(node);
        }

        debug!("added hash");
        is_new_hash
    }

    /// Attempt to find any number of hashes within the tree.
    pub async fn find<H>(&self, hashes: H) -> Vec<Vec<HashDistance>>
    where
        H: IntoIterator<Item = HashDistance>,
    {
        let tree = self.tree.read().await;

        hashes
            .into_iter()
            .map(|HashDistance { hash, distance }| Self::search(&tree, hash, distance))
            .collect()
    }

    /// Search a read-locked tree for a hash with a given distance.
    fn search(tree: &BKTree<Node, Hamming>, hash: i64, distance: u32) -> Vec<HashDistance> {
        debug!("searching tree");

        let results: Vec<_> = tree
            .find(&hash.into(), distance)
            .map(|item| HashDistance {
                distance: item.0,
                hash: (*item.1).into(),
            })
            .collect();

        debug!("found results");
        results
    }
}

/// A hamming distance metric.
#[derive(Serialize, Deserialize)]
pub struct Hamming;

impl bk_tree::Metric<Node> for Hamming {
    fn distance(&self, a: &Node, b: &Node) -> u32 {
        match hamming::distance_fast(&a.0, &b.0) {
            Ok(distance) => distance as u32,
            Err(err) => {
                error!("hashes did not have same byte alignment: {err:?}");
                u32::MAX
            }
        }
    }

    fn threshold_distance(&self, a: &Node, b: &Node, _threshold: u32) -> Option<u32> {
        Some(self.distance(a, b))
    }
}

/// A value of a node in the BK tree.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct Node([u8; 8]);

impl From<i64> for Node {
    fn from(num: i64) -> Self {
        Self(num.to_be_bytes())
    }
}

impl From<Node> for i64 {
    fn from(node: Node) -> Self {
        i64::from_be_bytes(node.0)
    }
}
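
A sketch tying the pieces together; the function is hypothetical and the distance threshold of 3 is an arbitrary example value:

// Hypothetical usage: load the tree from the database, then look up a hash.
async fn example(pool: &sqlx::PgPool, hash: i64) -> anyhow::Result<()> {
    let tree = Tree::new();
    tree.reload(pool).await?;

    // Find everything within a hamming distance of 3 of the given hash.
    let matches = tree.find([HashDistance { hash, distance: 3 }]).await;
    for m in &matches[0] {
        println!("candidate hash {} at distance {}", m.hash, m.distance);
    }
    Ok(())
}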
src/main.rs
@@ -1,3 +1,8 @@
mod fuzzy;
mod db;

fn main() {
    println!("Hello, world!");
}