diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py
index c8cdd70ec3c81544e467a76985afff604912791c..360ab5bf8391edc04a84ce768947446fd3b86ec1 100644
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@@ -51,6 +51,7 @@ class TaskService(CommonService):
             .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
             .where(
                 Document.status == StatusEnum.VALID.value,
+                Document.run == TaskStatus.RUNNING.value,
                 ~(Document.type == FileType.VIRTUAL.value),
                 cls.model.progress == 0,
                 cls.model.update_time >= tm,
diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
index 80c6df34d1dcab7787b036523ec13ed05750fdfb..222899de175c684c36fc77d6486869e1a39e999c 100644
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -42,7 +42,9 @@ class HuPptParser(object):
             BytesIO(fnm))
         txts = []
         self.total_page = len(ppt.slides)
-        for i, slide in enumerate(ppt.slides[from_page: to_page]):
+        for i, slide in enumerate(ppt.slides):
+            if i < from_page: continue
+            if i >= to_page: break
             texts = []
             for shape in slide.shapes:
                 txt = self.__extract(shape)
diff --git a/rag/app/presentation.py b/rag/app/presentation.py
index 7525946a3c617a74abe5bbe3f611932df8f547d7..98622b1753c8af1456879a668fd510564bec49e8 100644
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -13,6 +13,9 @@
 import copy
 import re
 from io import BytesIO
+
+from PIL import Image
+
 from rag.nlp import tokenize, is_english
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, PptParser
@@ -30,7 +33,7 @@ class Ppt(PptParser):
             for i, slide in enumerate(presentation.slides[from_page: to_page]):
                 buffered = BytesIO()
                 slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
-                imgs.append(buffered.getvalue())
+                imgs.append(Image.open(buffered))
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback(0.9, "Image extraction finished")
         self.is_english = is_english(txts)
diff --git a/rag/app/table.py b/rag/app/table.py
index 4cf1c1c36c072e8d4a3bcdf31598dc640d67c481..3a69fe66533245690c6d9f38846dfd4a7e69e0ff 100644
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -58,12 +58,9 @@ class Excel(ExcelParser):
                     continue
                 data.append(row)
                 done += 1
-                if done % 999 == 0:
-                    callback(done * 0.6 / total, ("Extract records: {}".format(len(res)) + (
-                        f"{len(fails)} failure({sheetname}), line: %s..." % (",".join(fails[:3])) if fails else "")))
             res.append(pd.DataFrame(np.array(data), columns=headers))
-        callback(0.6, ("Extract records: {}. ".format(done) + (
+        callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + (
             f"{len(fails)} failure, line: %s..."
             % (",".join(fails[:3])) if fails else "")))
         return res
 
@@ -151,7 +148,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
         headers = lines[0].split(kwargs.get("delimiter", "\t"))
         rows = []
         for i, line in enumerate(lines[1:]):
-            if from_page < from_page:continue
+            if i < from_page: continue
             if i >= to_page: break
             row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
             if len(row) != len(headers):
@@ -191,12 +188,15 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
         df[clmns[j]] = cln
         if ty == "text":
             txts.extend([str(c) for c in cln if c])
-    clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j])
+    clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i])
                  for i in range(len(clmns))]
     eng = lang.lower() == "english"#is_english(txts)
     for ii, row in df.iterrows():
-        d = {}
+        d = {
+            "docnm_kwd": filename,
+            "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        }
         row_txt = []
         for j in range(len(clmns)):
             if row[clmns[j]] is None:
diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py
index e79ac383127ebc6018134f05365147218a6d59c1..618e2ea699606d94a4e01a7dcfc6a0fca4ee6683 100644
--- a/rag/svr/task_broker.py
+++ b/rag/svr/task_broker.py
@@ -91,10 +91,10 @@ def dispatch():
                     tsks.append(task)
             elif r["parser_id"] == "table":
                 rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
-                for i in range(0, rn, 1000):
+                for i in range(0, rn, 3000):
                     task = new_task()
                     task["from_page"] = i
-                    task["to_page"] = min(i + 1000, rn)
+                    task["to_page"] = min(i + 3000, rn)
                     tsks.append(task)
             else:
                 tsks.append(new_task())
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 9765957dd44e63c1e8fcac3bb026c39b009e8341..4f0c28a9483c232f8d4e1f417912774d26a9c028 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -128,8 +128,6 @@ def build(row):
         return
 
-    callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
-
     docs = []
     doc = {
         "doc_id": row["doc_id"],
@@ -179,8 +177,8 @@ def embedding(docs, mdl, parser_config={}, callback=None):
         tk_count += c
 
     cnts_ = np.array([])
-    for i in range(0, len(cnts), 32):
-        vts, c = mdl.encode(cnts[i: i+32])
+    for i in range(0, len(cnts), 8):
+        vts, c = mdl.encode(cnts[i: i+8])
         if len(cnts_) == 0: cnts_ = vts
         else: cnts_ = np.concatenate((cnts_, vts), axis=0)
         tk_count += c
@@ -226,6 +224,7 @@ def main(comm, mod):
             continue
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
+        callback(msg="Finished slicing files (%d). Start to embed the content." % len(cks))
        try:
            tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
        except Exception as e: