diff --git a/Cargo.toml b/Cargo.toml
index b324cf36246dea409ec30ad87557f18a9c87224d..2a08cd07d68d3f8392ccf7e06293602731b31ed8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,4 +32,4 @@ regex = "1.10.2"
 name = "doc_gpt"
 
 [workspace]
-members = [".", "migration"]
\ No newline at end of file
+members = [".", "migration"]
diff --git a/migration/src/m20220101_000001_create_table.rs b/migration/src/m20220101_000001_create_table.rs
index 41287e8d11abe9b9173963ba0b9e2fc8e74e8e52..e6892d7e1383510924f36a462c4a7c034332ef34 100644
--- a/migration/src/m20220101_000001_create_table.rs
+++ b/migration/src/m20220101_000001_create_table.rs
@@ -201,7 +201,7 @@ impl MigrationTrait for Migration {
                    .col(ColumnDef::new(DocInfo::Location).string().not_null())
                    .col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
                    .col(ColumnDef::new(DocInfo::Type).string().not_null())
-                    .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().not_null())
+                    .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().default(""))
                    .comment("doc type|folder")
                    .col(
                        ColumnDef::new(DocInfo::CreatedAt)
@@ -274,28 +274,28 @@ impl MigrationTrait for Migration {
            .values_panic([
                (1).into(),
                "Video".into(),
-                ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)".into(),
+                ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)".into(),
                (1).into(),
                (1).into(),
            ])
            .values_panic([
                (1).into(),
                "Picture".into(),
-                ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)".into(),
+                ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)".into(),
                (2).into(),
                (2).into(),
            ])
            .values_panic([
                (1).into(),
                "Music".into(),
-                ".*\\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)".into(),
+                ".*\\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)".into(),
                (3).into(),
                (3).into(),
            ])
            .values_panic([
                (1).into(),
                "Document".into(),
-                ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)".into(),
+                ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)".into(),
                (3).into(),
                (3).into(),
            ])
diff --git a/python/svr/add_thumbnail2file.py b/python/svr/add_thumbnail2file.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4558ca9a2f601b4cced9fd473815e6653a817e8
--- /dev/null
+++ b/python/svr/add_thumbnail2file.py
@@ -0,0 +1,119 @@
+import sys, datetime, random, re, cv2
+from os.path import dirname, realpath
+sys.path.append(dirname(realpath(__file__)) + "/../")
+from util.db_conn import Postgres
+from util.minio_conn import HuMinio
+from util import findMaxDt
+import base64
+from io import BytesIO
+import pandas as pd
+from PIL import Image
+import pdfplumber
+
+
+PG = Postgres("infiniflow", "docgpt")
+MINIO = HuMinio("infiniflow")
+def set_thumbnail(did, b64):
+    sql = f"""
+    update doc_info set thumbnail_base64='{b64}'
+    where
+    did={did}
+    """
+    PG.update(sql)
+
+
+def collect(comm, mod, tm):
+    sql = f"""
+    select
+    did, uid, doc_name, location, updated_at
+    from doc_info
+    where
+    updated_at >= '{tm}'
+    and MOD(did, {comm}) = {mod}
+    and is_deleted=false
+    and type <> 'folder'
+    and thumbnail_base64=''
+    order by updated_at asc
+    limit 10
+    """
+    docs = PG.select(sql)
+    if len(docs) == 0: return pd.DataFrame()
+
+    mtm = str(docs["updated_at"].max())[:19]
+    print("TOTAL:", len(docs), "To: ", mtm)
+    return docs
+
+
+def build(row):
+    if not re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|apng|icon|ico|webp|mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$",
+                     row["doc_name"].lower().strip()):
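+        # Unsupported file type: write a placeholder so collect(), which only selects rows with thumbnail_base64='', skips this doc next run.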
+        set_thumbnail(row["did"], "_")
+        return
+
+    def thumbnail(img, SIZE=128):
+        w, h = img.size
+        p = SIZE / max(w, h)
+        w, h = int(w * p), int(h * p)
+        img.thumbnail((w, h))
+        buffered = BytesIO()
+        try:
+            img.convert("RGB").save(buffered, format="JPEG")
+        except Exception:
+            try:
+                img.save(buffered, format="PNG")
+            except Exception:
+                pass
+        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+
+    iobytes = BytesIO(MINIO.get("%s-upload" % str(row["uid"]), row["location"]))
+    if re.search(r"\.pdf$", row["doc_name"].lower().strip()):
+        pdf = pdfplumber.open(iobytes)
+        img = pdf.pages[0].to_image().annotated
+        set_thumbnail(row["did"], thumbnail(img))
+
+    if re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", row["doc_name"].lower().strip()):
+        img = Image.open(iobytes)
+        set_thumbnail(row["did"], thumbnail(img))
+
+    if re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", row["doc_name"].lower().strip()):
+        url = MINIO.get_presigned_url("%s-upload" % str(row["uid"]),
+                                      row["location"],
+                                      expires=datetime.timedelta(seconds=60)
+                                      )
+        cap = cv2.VideoCapture(url)
+        succ = cap.isOpened()
+        i = random.randint(1, 11)
+        while succ:
+            ret, frame = cap.read()
+            if not ret: break
+            if i > 0:
+                i -= 1
+                continue
+            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            print(img.size)
+            set_thumbnail(row["did"], thumbnail(img))
+            break  # one frame is enough for the thumbnail
+        cap.release()
+        cv2.destroyAllWindows()
+
+
+def main(comm, mod):
+    tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
+    tm = findMaxDt(tm_fnm)
+    rows = collect(comm, mod, tm)
+    if len(rows) == 0: return
+
+    tmf = open(tm_fnm, "a+")
+    for _, r in rows.iterrows():
+        build(r)
+        tmf.write(str(r["updated_at"]) + "\n")
+    tmf.close()
+
+
+if __name__ == "__main__":
+    from mpi4py import MPI
+    comm = MPI.COMM_WORLD
+    main(comm.Get_size(), comm.Get_rank())
+
diff --git a/python/util/minio_conn.py b/python/util/minio_conn.py
index a8e20bac7f96a8d234a3e417855bb28445504c15..05875ac515aaf2b9f3c4416b202cc4ebcb2361c5 100644
--- a/python/util/minio_conn.py
+++ b/python/util/minio_conn.py
@@ -54,11 +54,24 @@ class HuMinio(object):
                 r = self.conn.get_object(bucket, fnm)
                 return r.read()
             except Exception as e:
-                logging.error(f"Fail get {bucket}/{fnm}: "+str(e))
+                logging.error(f"fail get {bucket}/{fnm}: "+str(e))
                 self.__open__()
                 time.sleep(1)
         return
+
+    def get_presigned_url(self, bucket, fnm, expires):
+        for _ in range(10):
+            try:
+                return self.conn.get_presigned_url("GET", bucket, fnm, expires)
+            except Exception as e:
+                logging.error(f"fail get presigned url {bucket}/{fnm}: "+str(e))
+                self.__open__()
+                time.sleep(1)
+        return
+
+
+
 
 if __name__ == "__main__":
     conn = HuMinio("infiniflow")
     fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
diff --git a/src/api/doc_info.rs b/src/api/doc_info.rs
index 9169bf0b84f3667699c68301ee23d2896f8a2e45..af1ee131679fb3b705def8927d75f57699f0f647 100644
--- a/src/api/doc_info.rs
+++ b/src/api/doc_info.rs
@@ -1,6 +1,7 @@
-use std::collections::HashMap;
+use std::collections::{HashMap};
 use std::io::BufReader;
 use actix_multipart_extract::{ File, Multipart, MultipartForm };
+use actix_web::web::Bytes;
 use actix_web::{ HttpResponse, post, web };
 use chrono::{ Utc, FixedOffset };
 use minio::s3::args::{ BucketExistsArgs, MakeBucketArgs, PutObjectArgs };
@@ -68,7 +69,7 @@ pub struct UploadForm {
 fn file_type(filename: &String) -> String {
     let fnm = filename.to_lowercase();
     if
-        let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)$")
+        let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$")
             .unwrap()
             .captures(&fnm)
     {
@@ -76,7 +77,7 @@ fn file_type(filename: &String) -> String {
     }
     if
         let Some(_) = Regex::new(
-            r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)$"
+            r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|wmf|webp|avif|apng|icon|ico)$"
         )
             .unwrap()
             .captures(&fnm)
@@ -84,14 +85,14 @@ fn file_type(filename: &String) -> String {
         return "Picture".to_owned();
     }
     if
-        let Some(_) = Regex::new(r"\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)$")
+        let Some(_) = Regex::new(r"\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$")
             .unwrap()
             .captures(&fnm)
     {
         return "Music".to_owned();
     }
     if
-        let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)$")
+        let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)$")
             .unwrap()
             .captures(&fnm)
     {
@@ -100,6 +101,7 @@ fn file_type(filename: &String) -> String {
     "Other".to_owned()
 }
 
+
 #[post("/v1.0/upload")]
 async fn upload(
     payload: Multipart<UploadForm>,