Commit cdd95656 authored by KevinHuSh, committed by GitHub

finish add thumbnail to video,image,pdf files (#18)

parent 3fc700a1
@@ -32,4 +32,4 @@ regex = "1.10.2"
 name = "doc_gpt"
 [workspace]
-members = [".", "migration"]
\ No newline at end of file
+members = [".", "migration"]
@@ -201,7 +201,7 @@ impl MigrationTrait for Migration {
                     .col(ColumnDef::new(DocInfo::Location).string().not_null())
                     .col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
                     .col(ColumnDef::new(DocInfo::Type).string().not_null())
-                    .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().not_null())
+                    .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().default(""))
                     .comment("doc type|folder")
                     .col(
                         ColumnDef::new(DocInfo::CreatedAt)
@@ -274,28 +274,28 @@ impl MigrationTrait for Migration {
             .values_panic([
                 (1).into(),
                 "Video".into(),
-                ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)".into(),
+                ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)".into(),
                 (1).into(),
                 (1).into(),
             ])
             .values_panic([
                 (1).into(),
                 "Picture".into(),
-                ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)".into(),
+                ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)".into(),
                 (2).into(),
                 (2).into(),
             ])
             .values_panic([
                 (1).into(),
                 "Music".into(),
-                ".*\\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)".into(),
+                ".*\\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)".into(),
                 (3).into(),
                 (3).into(),
             ])
             .values_panic([
                 (1).into(),
                 "Document".into(),
-                ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)".into(),
+                ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)".into(),
                 (3).into(),
                 (3).into(),
             ])
...
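Two things change in this migration: the ThumbnailBase64 column now defaults to an empty string instead of being a plain NOT NULL column, which is what lets the thumbnail worker below query for rows with thumbnail_base64='' that still need processing, and the seeded file-type patterns gain mp4, icon/ico and the iWork extensions (pages/numbers/key). The seeded patterns are whole-filename regexes; a small illustrative check, not part of the commit, with the pattern copied from the Video row:

import re

# "Video" pattern as seeded above; presumably matched against the file name
VIDEO = r".*\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)"

print(bool(re.fullmatch(VIDEO, "holiday.mp4")))      # True
print(bool(re.fullmatch(VIDEO, "holiday.mp4.bak")))  # False: the extension must come last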
import sys, datetime, random, re, cv2
from os.path import dirname, realpath

sys.path.append(dirname(realpath(__file__)) + "/../")
from util.db_conn import Postgres
from util.minio_conn import HuMinio
from util import findMaxDt
import base64
from io import BytesIO
import pandas as pd
from PIL import Image
import pdfplumber

PG = Postgres("infiniflow", "docgpt")
MINIO = HuMinio("infiniflow")


def set_thumbnail(did, base64):
    # Persist the base64-encoded thumbnail (or the "_" placeholder) for one document.
    sql = f"""
    update doc_info set thumbnail_base64='{base64}'
    where
    did={did}
    """
    PG.update(sql)


def collect(comm, mod, tm):
    # Fetch up to 10 documents of this shard (MOD(did, comm) = mod) that were
    # updated since `tm` and still have an empty thumbnail.
    sql = f"""
    select
    did, uid, doc_name, location, updated_at
    from doc_info
    where
    updated_at >= '{tm}'
    and MOD(did, {comm}) = {mod}
    and is_deleted=false
    and type <> 'folder'
    and thumbnail_base64=''
    order by updated_at asc
    limit 10
    """
    docs = PG.select(sql)
    if len(docs) == 0:
        return pd.DataFrame()
    mtm = str(docs["updated_at"].max())[:19]
    print("TOTAL:", len(docs), "To: ", mtm)
    return docs


def build(row):
    # Documents we cannot thumbnail get the "_" placeholder so they are not picked up again.
    if not re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|apng|icon|ico|webp|mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$",
                     row["doc_name"].lower().strip()):
        set_thumbnail(row["did"], "_")
        return

    def thumbnail(img, SIZE=128):
        # Scale the longest edge down to SIZE pixels and return the image as base64.
        w, h = img.size
        p = SIZE / max(w, h)
        w, h = int(w * p), int(h * p)
        img.thumbnail((w, h))
        buffered = BytesIO()
        try:
            img.save(buffered, format="JPEG")
        except Exception:
            try:
                img.save(buffered, format="PNG")
            except Exception:
                pass
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    iobytes = BytesIO(MINIO.get("%s-upload" % str(row["uid"]), row["location"]))

    if re.search(r"\.pdf$", row["doc_name"].lower().strip()):
        # PDF: render the first page and use it as the thumbnail.
        pdf = pdfplumber.open(iobytes)
        img = pdf.pages[0].to_image().annotated
        set_thumbnail(row["did"], thumbnail(img))

    if re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", row["doc_name"].lower().strip()):
        # Image: thumbnail the picture itself.
        img = Image.open(iobytes)
        set_thumbnail(row["did"], thumbnail(img))

    if re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", row["doc_name"].lower().strip()):
        # Video: stream it through a short-lived presigned URL and grab a random early frame.
        url = MINIO.get_presigned_url("%s-upload" % str(row["uid"]),
                                      row["location"],
                                      expires=datetime.timedelta(seconds=60))
        cap = cv2.VideoCapture(url)
        succ = cap.isOpened()
        i = random.randint(1, 11)
        while succ:
            ret, frame = cap.read()
            if not ret:
                break
            if i > 0:
                i -= 1
                continue
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            print(img.size)
            set_thumbnail(row["did"], thumbnail(img))
            break  # one frame is enough
        cap.release()
        cv2.destroyAllWindows()


def main(comm, mod):
    # Process one batch for this shard and append the handled timestamps to the
    # per-shard checkpoint file consumed by findMaxDt().
    tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
    tm = findMaxDt(tm_fnm)
    rows = collect(comm, mod, tm)
    if len(rows) == 0:
        return
    tmf = open(tm_fnm, "a+")
    for _, r in rows.iterrows():
        build(r)
        tmf.write(str(r["updated_at"]) + "\n")
    tmf.close()


if __name__ == "__main__":
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    main(comm.Get_size(), comm.Get_rank())
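The script shards work by MPI rank: comm is the world size and mod the rank, so collect() only claims rows where MOD(did, comm) = mod and no two ranks touch the same document. It is presumably launched under something like mpirun -n 4 python <script>.py. A tiny sketch of that sharding, illustrative only:

# mirrors "MOD(did, comm) = mod" from collect(); numbers are made up
world_size = 4                  # assumed MPI world size (comm)
dids = range(1, 11)             # hypothetical document ids
for rank in range(world_size):  # each rank plays the role of `mod`
    claimed = [d for d in dids if d % world_size == rank]
    print(rank, claimed)        # every did lands on exactly one rank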
@@ -54,11 +54,24 @@ class HuMinio(object):
                 r = self.conn.get_object(bucket, fnm)
                 return r.read()
             except Exception as e:
-                logging.error(f"Fail get {bucket}/{fnm}: "+str(e))
+                logging.error(f"fail get {bucket}/{fnm}: "+str(e))
                 self.__open__()
                 time.sleep(1)
         return
 
+    def get_presigned_url(self, bucket, fnm, expires):
+        for _ in range(10):
+            try:
+                return self.conn.get_presigned_url("GET", bucket, fnm, expires)
+            except Exception as e:
+                logging.error(f"fail get {bucket}/{fnm}: "+str(e))
+                self.__open__()
+                time.sleep(1)
+        return
+
 
 if __name__ == "__main__":
     conn = HuMinio("infiniflow")
     fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
...
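get_presigned_url() follows the same retry pattern as get(): up to ten attempts, reopening the MinIO connection and sleeping a second between failures, and returning None if every attempt fails. The presigned URL is a time-limited HTTP link, which is what lets the thumbnail script above hand a 60-second URL straight to cv2.VideoCapture instead of downloading the whole video first. A minimal standalone call, with the bucket and object names assumed:

import datetime
from util.minio_conn import HuMinio

conn = HuMinio("infiniflow")
# hypothetical bucket/object; real callers pass "<uid>-upload" and the stored location
url = conn.get_presigned_url("13-upload", "11-408.jpg",
                             expires=datetime.timedelta(seconds=60))
print(url)  # presigned GET URL, or None if all ten attempts failed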
-use std::collections::HashMap;
+use std::collections::{HashMap};
 use std::io::BufReader;
 use actix_multipart_extract::{ File, Multipart, MultipartForm };
+use actix_web::web::Bytes;
 use actix_web::{ HttpResponse, post, web };
 use chrono::{ Utc, FixedOffset };
 use minio::s3::args::{ BucketExistsArgs, MakeBucketArgs, PutObjectArgs };
@@ -68,7 +69,7 @@ pub struct UploadForm {
 fn file_type(filename: &String) -> String {
     let fnm = filename.to_lowercase();
     if
-        let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)$")
+        let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$")
             .unwrap()
             .captures(&fnm)
     {
@@ -76,7 +77,7 @@ fn file_type(filename: &String) -> String {
     }
     if
         let Some(_) = Regex::new(
-            r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)$"
+            r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$"
         )
             .unwrap()
             .captures(&fnm)
@@ -84,14 +85,14 @@ fn file_type(filename: &String) -> String {
         return "Picture".to_owned();
     }
     if
-        let Some(_) = Regex::new(r"\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)$")
+        let Some(_) = Regex::new(r"\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$")
             .unwrap()
             .captures(&fnm)
     {
         return "Music".to_owned();
     }
     if
-        let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)$")
+        let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)$")
             .unwrap()
             .captures(&fnm)
     {
@@ -100,6 +101,7 @@ fn file_type(filename: &String) -> String {
     "Other".to_owned()
 }
 
 #[post("/v1.0/upload")]
 async fn upload(
     payload: Multipart<UploadForm>,
...
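file_type() lower-cases the name and tests the suffix regexes in order (Video, Picture, Music, Document), falling back to "Other". Two small quirks in the updated patterns: mp3 now appears twice in the Music alternation, and because fnm is lower-cased while Rust's Regex is case-sensitive by default, the upper-case WMF alternative in the Picture pattern can never match. A rough Python mirror of the classification, for reference only and not part of the commit:

import re

# assumption-laden Python mirror of the Rust file_type() above (WMF lower-cased so it can match)
def file_type(filename: str) -> str:
    fnm = filename.lower()
    rules = [
        ("Video", r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$"),
        ("Picture", r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|wmf|webp|avif|apng|icon|ico)$"),
        ("Music", r"\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$"),
        ("Document", r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)$"),
    ]
    for label, pat in rules:
        if re.search(pat, fnm):
            return label
    return "Other"

print(file_type("demo.MP4"))  # Video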