feat: compute and store multiple fuzzy hashes per post (gradient, vertical gradient, double gradient, block hash)

This commit is contained in:
Lennard Brinkhaus 2024-12-21 16:56:44 +01:00
parent f9640b3d90
commit 0d4655e50b
5 changed files with 73 additions and 24 deletions

View File

@ -10,6 +10,7 @@ services:
db: db:
image: postgres image: postgres
container_name: fuzzy_search_db
ports: ports:
- 5432:5432 - 5432:5432
environment: environment:

View File

@ -1,6 +1,10 @@
-- Add migration script here -- Add migration script here
CREATE TABLE FuzzyHashes( CREATE TABLE FuzzyHashes(
id SERIAL NOT NULL PRIMARY KEY, id SERIAL NOT NULL PRIMARY KEY,
hash BIGINT NOT NULL, mean_hash BIGINT NOT NULL,
gradient_hash BIGINT NOT NULL,
vert_gradient_hash BIGINT NOT NULL,
double_gradient_hash BIGINT NOT NULL,
block_hash BIGINT NOT NULL,
post_id CHAR(25) NOT NULL post_id CHAR(25) NOT NULL
) )

View File

@ -8,7 +8,11 @@ static MIGRATOR: Migrator = sqlx::migrate!(); // defaults to "./migrations"
pub struct FuzzyHash { pub struct FuzzyHash {
pub id: i32, pub id: i32,
pub hash: i64, pub mean_hash: i64,
pub gradient_hash: i64,
pub vert_gradient_hash: i64,
pub double_gradient_hash: i64,
pub block_hash: i64,
pub post_id: String, pub post_id: String,
} }
@ -22,7 +26,7 @@ pub async fn migrate(pool: &Pool<Postgres>) -> anyhow::Result<()> {
} }
pub async fn get_all_hashes(pool: &Pool<Postgres>) -> anyhow::Result<Vec<FuzzyHash>> { pub async fn get_all_hashes(pool: &Pool<Postgres>) -> anyhow::Result<Vec<FuzzyHash>> {
let rows = sqlx::query!("SELECT id, hash, post_id FROM fuzzyhashes WHERE hash != 0") let rows = sqlx::query!("SELECT id, mean_hash, gradient_hash, vert_gradient_hash, double_gradient_hash, block_hash, post_id FROM fuzzyhashes WHERE mean_hash > 0 AND gradient_hash > 0 AND vert_gradient_hash > 0 AND double_gradient_hash > 0 AND block_hash > 0")
.fetch_all(pool).await?; .fetch_all(pool).await?;
let mut data = vec![]; let mut data = vec![];
@ -30,7 +34,11 @@ pub async fn get_all_hashes(pool: &Pool<Postgres>) -> anyhow::Result<Vec<FuzzyHa
for row in rows { for row in rows {
data.push(FuzzyHash{ data.push(FuzzyHash{
id: row.id, id: row.id,
hash: row.hash, mean_hash: row.mean_hash,
gradient_hash: row.gradient_hash,
vert_gradient_hash: row.vert_gradient_hash,
double_gradient_hash: row.double_gradient_hash,
block_hash: row.block_hash,
post_id: row.post_id, post_id: row.post_id,
}); });
} }
@ -39,7 +47,7 @@ pub async fn get_all_hashes(pool: &Pool<Postgres>) -> anyhow::Result<Vec<FuzzyHa
} }
pub async fn create_hash(pool: &Pool<Postgres>, fuzzy: FuzzyHash) -> anyhow::Result<i32> { pub async fn create_hash(pool: &Pool<Postgres>, fuzzy: FuzzyHash) -> anyhow::Result<i32> {
let rec = sqlx::query!(r#"INSERT INTO "fuzzyhashes" (hash, post_id) VALUES ($1, $2) RETURNING id"#, fuzzy.hash, fuzzy.post_id).fetch_one(pool).await?; let rec = sqlx::query!(r#"INSERT INTO "fuzzyhashes" (mean_hash, gradient_hash, vert_gradient_hash, double_gradient_hash, block_hash, post_id) VALUES ($1, $2, $3, $4, $5, $6) RETURNING id"#, fuzzy.mean_hash, fuzzy.gradient_hash, fuzzy.vert_gradient_hash, fuzzy.double_gradient_hash, fuzzy.block_hash, fuzzy.post_id).fetch_one(pool).await?;
Ok(rec.id) Ok(rec.id)
} }

View File

@ -1,11 +1,12 @@
use image; use image;
use img_hash::HashAlg;
use img_hash::HasherConfig;
pub async fn generate_hash(path: &str) -> anyhow::Result<i64> { pub async fn generate_hash(path: &str, alg: HashAlg) -> anyhow::Result<i64> {
let img = image::open(path)?; let img = image::open(path)?;
use img_hash::{HashAlg::Gradient, HasherConfig};
let img_hasher = HasherConfig::with_bytes_type::<[u8; 8]>() let img_hasher = HasherConfig::with_bytes_type::<[u8; 8]>()
.hash_alg(Gradient) .hash_alg(alg)
.hash_size(8, 8) .hash_size(8, 8)
.preproc_dct() .preproc_dct()
.to_hasher(); .to_hasher();

View File

@ -5,6 +5,7 @@ use anyhow::anyhow;
use dotenvy::dotenv; use dotenvy::dotenv;
use env_logger::TimestampPrecision; use env_logger::TimestampPrecision;
use image::EncodableLayout; use image::EncodableLayout;
use img_hash::HashAlg;
use log::{debug, info, warn}; use log::{debug, info, warn};
use ratelimit::Ratelimiter; use ratelimit::Ratelimiter;
use tokio::fs::File; use tokio::fs::File;
@ -34,20 +35,31 @@ async fn main() -> anyhow::Result<()> {
info!("Loading hashes..."); info!("Loading hashes...");
let fuzzy_hashes = db::get_all_hashes(&pool).await?; let fuzzy_hashes = db::get_all_hashes(&pool).await?;
let fuzzy_tree = fuzzy::tree::Tree::new(); let gradient_tree = fuzzy::tree::Tree::new();
let vert_gradient_tree = fuzzy::tree::Tree::new();
let double_gradient_tree = fuzzy::tree::Tree::new();
let block_tree = fuzzy::tree::Tree::new();
for fuzzy_hash in fuzzy_hashes { for fuzzy_hash in fuzzy_hashes {
let exist = fuzzy_tree.add(fuzzy_hash.hash).await; let exist_gradient = gradient_tree.add(fuzzy_hash.gradient_hash).await;
let exist_vert_gradient = vert_gradient_tree.add(fuzzy_hash.vert_gradient_hash).await;
let exist_double_gradient = double_gradient_tree.add(fuzzy_hash.double_gradient_hash).await;
let exist_block_hash = block_tree.add(fuzzy_hash.block_hash).await;
if !exist { if !exist_gradient {
warn!("found already existing hash: {}", fuzzy_hash.hash); warn!("found already existing hash (gradient): {}", fuzzy_hash.gradient_hash);
let founds = fuzzy_tree.find(vec!(HashDistance{
hash: fuzzy_hash.hash,
distance: 0
})).await;
founds[0].iter().for_each(|has_dist| warn!("Existing: {}", has_dist.hash))
} }
if !exist_vert_gradient {
warn!("found already existing hash (vert_gradient): {}", fuzzy_hash.vert_gradient_hash);
}
if !exist_double_gradient {
warn!("found already existing hash (double_gradient): {}", fuzzy_hash.double_gradient_hash);
}
if !exist_block_hash {
warn!("found already existing hash (block_hash): {}", fuzzy_hash.block_hash);
}
info!("use {:?}", fuzzy_hash.post_id)
} }
info!("Load success"); info!("Load success");
@ -61,7 +73,11 @@ async fn main() -> anyhow::Result<()> {
// Post is deleted or so.... !! // Post is deleted or so.... !!
db::create_hash(&pool, FuzzyHash{ db::create_hash(&pool, FuzzyHash{
id: 0, id: 0,
hash: 0, mean_hash: 0,
gradient_hash: 0,
block_hash: 0,
double_gradient_hash: 0,
vert_gradient_hash: 0,
post_id: id post_id: id
}).await?; }).await?;
continue continue
@ -76,19 +92,38 @@ async fn main() -> anyhow::Result<()> {
let mut out = File::create(file_name).await?; let mut out = File::create(file_name).await?;
io::copy(& mut body.as_bytes(), &mut out).await?; io::copy(& mut body.as_bytes(), &mut out).await?;
let hash = fuzzy::image::generate_hash(file_name).await?; let gradient_hash = fuzzy::image::generate_hash(file_name, HashAlg::Gradient).await?;
let vert_gradient_hash = fuzzy::image::generate_hash(file_name, HashAlg::VertGradient).await?;
let double_gradient_hash = fuzzy::image::generate_hash(file_name, HashAlg::DoubleGradient).await?;
let block_hash = fuzzy::image::generate_hash(file_name, HashAlg::Blockhash).await?;
let _ = remove_file(file_name)?; let _ = remove_file(file_name)?;
let already_exists = fuzzy_tree.add(hash).await; let exist_gradient = gradient_tree.add(gradient_hash).await;
let exist_vert_gradient = vert_gradient_tree.add(vert_gradient_hash).await;
let exist_double_gradient = double_gradient_tree.add(double_gradient_hash).await;
let exist_block_hash = block_tree.add(block_hash).await;
if already_exists { if !exist_gradient {
info!("Post {id} already in tree with hash: {hash}") warn!("found already existing hash (gradient): {}", gradient_hash);
}
if !exist_vert_gradient {
warn!("found already existing hash (vert_gradient): {}", vert_gradient_hash);
}
if !exist_double_gradient {
warn!("found already existing hash (double_gradient): {}", double_gradient_hash);
}
if !exist_block_hash {
warn!("found already existing hash (block_hash): {}", block_hash);
} }
db::create_hash(&pool, FuzzyHash{ db::create_hash(&pool, FuzzyHash{
id: 0, id: 0,
hash, mean_hash: 0,
gradient_hash,
vert_gradient_hash,
double_gradient_hash,
block_hash,
post_id: id post_id: id
}).await?; }).await?;