feat: more fuzzy search?

This commit is contained in:
Lennard Brinkhaus 2024-12-21 16:56:44 +01:00
parent f9640b3d90
commit 0d4655e50b
5 changed files with 73 additions and 24 deletions

View File

@ -10,6 +10,7 @@ services:
db:
image: postgres
container_name: fuzzy_search_db
ports:
- 5432:5432
environment:

View File

@ -1,6 +1,10 @@
-- Creates the FuzzyHashes table: one row of image hashes per post.
CREATE TABLE FuzzyHashes(
-- Auto-incrementing surrogate key, generated by the database on insert.
id SERIAL NOT NULL PRIMARY KEY,
-- NOTE(review): the Rust queries in this change read and write only the
-- per-algorithm columns below; `hash` looks like the pre-change column.
-- Confirm whether it still belongs in this table (it is NOT NULL, so an
-- INSERT that omits it would fail).
hash BIGINT NOT NULL,
-- One 64-bit hash value per hashing algorithm, stored as a signed BIGINT.
-- NOTE(review): the visible Rust code only ever writes 0 to mean_hash — confirm.
mean_hash BIGINT NOT NULL,
gradient_hash BIGINT NOT NULL,
vert_gradient_hash BIGINT NOT NULL,
double_gradient_hash BIGINT NOT NULL,
block_hash BIGINT NOT NULL,
-- Identifier of the post the hashes were computed for (fixed-width, 25 characters).
post_id CHAR(25) NOT NULL
)

View File

@ -8,7 +8,11 @@ static MIGRATOR: Migrator = sqlx::migrate!(); // defaults to "./migrations"
/// One row of the `fuzzyhashes` table: the image hashes stored for a single post.
///
/// NOTE(review): this view lists both `hash` and the per-algorithm fields;
/// `hash` looks like the pre-change field that the new fields replace — confirm
/// which fields the struct really has.
pub struct FuzzyHash {
/// Primary key assigned by the database. `create_hash` does not write this
/// field; it returns the id generated by the INSERT instead.
pub id: i32,
/// Single hash value used before the per-algorithm fields were introduced
/// (presumably removed by this change — TODO confirm).
pub hash: i64,
/// NOTE(review): the visible callers in `main` always store 0 here, while
/// `get_all_hashes` only loads rows with `mean_hash > 0` — as written, rows
/// created by this code would never be loaded again. Confirm this is intended.
pub mean_hash: i64,
/// Hash produced with `HashAlg::Gradient` (see the calls in `main`).
///
/// NOTE(review): `get_all_hashes` filters every hash column with `> 0`; if a
/// hash can have its top bit set, its `i64` value is negative and the row
/// would be skipped as well — confirm against `generate_hash`.
pub gradient_hash: i64,
/// Hash produced with `HashAlg::VertGradient`.
pub vert_gradient_hash: i64,
/// Hash produced with `HashAlg::DoubleGradient`.
pub double_gradient_hash: i64,
/// Hash produced with `HashAlg::Blockhash`.
pub block_hash: i64,
/// Identifier of the post the hashes belong to (stored in a `CHAR(25)` column).
pub post_id: String,
}
@ -22,7 +26,7 @@ pub async fn migrate(pool: &Pool<Postgres>) -> anyhow::Result<()> {
}
pub async fn get_all_hashes(pool: &Pool<Postgres>) -> anyhow::Result<Vec<FuzzyHash>> {
let rows = sqlx::query!("SELECT id, hash, post_id FROM fuzzyhashes WHERE hash != 0")
let rows = sqlx::query!("SELECT id, mean_hash, gradient_hash, vert_gradient_hash, double_gradient_hash, block_hash, post_id FROM fuzzyhashes WHERE mean_hash > 0 AND gradient_hash > 0 AND vert_gradient_hash > 0 AND double_gradient_hash > 0 AND block_hash > 0")
.fetch_all(pool).await?;
let mut data = vec![];
@ -30,7 +34,11 @@ pub async fn get_all_hashes(pool: &Pool<Postgres>) -> anyhow::Result<Vec<FuzzyHa
for row in rows {
data.push(FuzzyHash{
id: row.id,
hash: row.hash,
mean_hash: row.mean_hash,
gradient_hash: row.gradient_hash,
vert_gradient_hash: row.vert_gradient_hash,
double_gradient_hash: row.double_gradient_hash,
block_hash: row.block_hash,
post_id: row.post_id,
});
}
@ -39,7 +47,7 @@ pub async fn get_all_hashes(pool: &Pool<Postgres>) -> anyhow::Result<Vec<FuzzyHa
}
/// Inserts `fuzzy` into the `fuzzyhashes` table and returns the id the database
/// assigned to the new row.
///
/// `fuzzy.id` is not written; the id comes from the `RETURNING id` clause.
///
/// # Errors
///
/// Propagates the error from `fetch_one` if the INSERT fails.
pub async fn create_hash(pool: &Pool<Postgres>, fuzzy: FuzzyHash) -> anyhow::Result<i32> {
// NOTE(review): two INSERT statements are visible here. The first one
// (hash, post_id) looks like the pre-change statement that the second one
// replaces — confirm that only one of them exists in the actual file;
// executing both would try to insert two rows per call.
let rec = sqlx::query!(r#"INSERT INTO "fuzzyhashes" (hash, post_id) VALUES ($1, $2) RETURNING id"#, fuzzy.hash, fuzzy.post_id).fetch_one(pool).await?;
// Stores every per-algorithm hash together with the post id.
let rec = sqlx::query!(r#"INSERT INTO "fuzzyhashes" (mean_hash, gradient_hash, vert_gradient_hash, double_gradient_hash, block_hash, post_id) VALUES ($1, $2, $3, $4, $5, $6) RETURNING id"#, fuzzy.mean_hash, fuzzy.gradient_hash, fuzzy.vert_gradient_hash, fuzzy.double_gradient_hash, fuzzy.block_hash, fuzzy.post_id).fetch_one(pool).await?;
Ok(rec.id)
}

View File

@ -1,11 +1,12 @@
use image;
use img_hash::HashAlg;
use img_hash::HasherConfig;
pub async fn generate_hash(path: &str) -> anyhow::Result<i64> {
pub async fn generate_hash(path: &str, alg: HashAlg) -> anyhow::Result<i64> {
let img = image::open(path)?;
use img_hash::{HashAlg::Gradient, HasherConfig};
let img_hasher = HasherConfig::with_bytes_type::<[u8; 8]>()
.hash_alg(Gradient)
.hash_alg(alg)
.hash_size(8, 8)
.preproc_dct()
.to_hasher();

View File

@ -5,6 +5,7 @@ use anyhow::anyhow;
use dotenvy::dotenv;
use env_logger::TimestampPrecision;
use image::EncodableLayout;
use img_hash::HashAlg;
use log::{debug, info, warn};
use ratelimit::Ratelimiter;
use tokio::fs::File;
@ -34,20 +35,31 @@ async fn main() -> anyhow::Result<()> {
info!("Loading hashes...");
let fuzzy_hashes = db::get_all_hashes(&pool).await?;
let fuzzy_tree = fuzzy::tree::Tree::new();
let gradient_tree = fuzzy::tree::Tree::new();
let vert_gradient_tree = fuzzy::tree::Tree::new();
let double_gradient_tree = fuzzy::tree::Tree::new();
let block_tree = fuzzy::tree::Tree::new();
for fuzzy_hash in fuzzy_hashes {
let exist = fuzzy_tree.add(fuzzy_hash.hash).await;
let exist_gradient = gradient_tree.add(fuzzy_hash.gradient_hash).await;
let exist_vert_gradient = vert_gradient_tree.add(fuzzy_hash.vert_gradient_hash).await;
let exist_double_gradient = double_gradient_tree.add(fuzzy_hash.double_gradient_hash).await;
let exist_block_hash = block_tree.add(fuzzy_hash.block_hash).await;
if !exist {
warn!("found already existing hash: {}", fuzzy_hash.hash);
let founds = fuzzy_tree.find(vec!(HashDistance{
hash: fuzzy_hash.hash,
distance: 0
})).await;
founds[0].iter().for_each(|has_dist| warn!("Existing: {}", has_dist.hash))
if !exist_gradient {
warn!("found already existing hash (gradient): {}", fuzzy_hash.gradient_hash);
}
if !exist_vert_gradient {
warn!("found already existing hash (vert_gradient): {}", fuzzy_hash.vert_gradient_hash);
}
if !exist_double_gradient {
warn!("found already existing hash (double_gradient): {}", fuzzy_hash.double_gradient_hash);
}
if !exist_block_hash {
warn!("found already existing hash (block_hash): {}", fuzzy_hash.block_hash);
}
info!("use {:?}", fuzzy_hash.post_id)
}
info!("Load success");
@ -61,7 +73,11 @@ async fn main() -> anyhow::Result<()> {
// The post was deleted (or is otherwise unavailable): store an all-zero placeholder row and skip it.
db::create_hash(&pool, FuzzyHash{
id: 0,
hash: 0,
mean_hash: 0,
gradient_hash: 0,
block_hash: 0,
double_gradient_hash: 0,
vert_gradient_hash: 0,
post_id: id
}).await?;
continue
@ -76,19 +92,38 @@ async fn main() -> anyhow::Result<()> {
let mut out = File::create(file_name).await?;
io::copy(& mut body.as_bytes(), &mut out).await?;
let hash = fuzzy::image::generate_hash(file_name).await?;
let gradient_hash = fuzzy::image::generate_hash(file_name, HashAlg::Gradient).await?;
let vert_gradient_hash = fuzzy::image::generate_hash(file_name, HashAlg::VertGradient).await?;
let double_gradient_hash = fuzzy::image::generate_hash(file_name, HashAlg::DoubleGradient).await?;
let block_hash = fuzzy::image::generate_hash(file_name, HashAlg::Blockhash).await?;
let _ = remove_file(file_name)?;
let already_exists = fuzzy_tree.add(hash).await;
let exist_gradient = gradient_tree.add(gradient_hash).await;
let exist_vert_gradient = vert_gradient_tree.add(vert_gradient_hash).await;
let exist_double_gradient = double_gradient_tree.add(double_gradient_hash).await;
let exist_block_hash = block_tree.add(block_hash).await;
if already_exists {
info!("Post {id} already in tree with hash: {hash}")
if !exist_gradient {
warn!("found already existing hash (gradient): {}", gradient_hash);
}
if !exist_vert_gradient {
warn!("found already existing hash (vert_gradient): {}", vert_gradient_hash);
}
if !exist_double_gradient {
warn!("found already existing hash (double_gradient): {}", double_gradient_hash);
}
if !exist_block_hash {
warn!("found already existing hash (block_hash): {}", block_hash);
}
db::create_hash(&pool, FuzzyHash{
id: 0,
hash,
mean_hash: 0,
gradient_hash,
vert_gradient_hash,
double_gradient_hash,
block_hash,
post_id: id
}).await?;