diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 4a75922a70a91f515f8ccf6581addd2df060695d..686365cf325ec605e0a56ea887bc5e649bfffa81 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -236,13 +236,16 @@ def run():
     try:
         for id in req["doc_ids"]:
             info = {"run": str(req["run"]), "progress": 0}
-            if str(req["run"]) == TaskStatus.RUNNING.value:info["progress_msg"] = ""
+            if str(req["run"]) == TaskStatus.RUNNING.value:
+                info["progress_msg"] = ""
+                info["chunk_num"] = 0
+                info["token_num"] = 0
             DocumentService.update_by_id(id, info)
-            if str(req["run"]) == TaskStatus.CANCEL.value:
-                tenant_id = DocumentService.get_tenant_id(id)
-                if not tenant_id:
-                    return get_data_error_result(retmsg="Tenant not found!")
-                ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+            #if str(req["run"]) == TaskStatus.CANCEL.value:
+            tenant_id = DocumentService.get_tenant_id(id)
+            if not tenant_id:
+                return get_data_error_result(retmsg="Tenant not found!")
+            ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
 
         return get_json_result(data=True)
     except Exception as e:
@@ -311,13 +314,17 @@ def change_parser():
         if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
             return get_data_error_result(retmsg="Not supported yet!")
 
-        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress":0, "progress_msg": ""})
+        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress":0, "progress_msg": "", "run": "0"})
         if not e:
             return get_data_error_result(retmsg="Document not found!")
         if doc.token_num>0:
             e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, doc.process_duation*-1)
             if not e:
                 return get_data_error_result(retmsg="Document not found!")
+            tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+            if not tenant_id:
+                return get_data_error_result(retmsg="Tenant not found!")
+            ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
 
         return get_json_result(data=True)
     except Exception as e:
diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py
index a63130150d4362347a1e1bc5006b7ef746aa3933..a083fac24b256f419619517fec23085e126f3b0c 100644
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@@ -65,7 +65,7 @@ class TaskService(CommonService):
        try:
            task = cls.model.get_by_id(id)
            _, doc = DocumentService.get_by_id(task.doc_id)
-           return doc.run == TaskStatus.CANCEL.value
+           return doc.run == TaskStatus.CANCEL.value or doc.progress < 0
        except Exception as e:
            pass
        return True
diff --git a/api/settings.py b/api/settings.py
index 98863dbd39c42625d79254030c0673d1217f3518..4fa73707863a1b0e338d19dc47f88937f8fab0f5 100644
--- a/api/settings.py
+++ b/api/settings.py
@@ -98,15 +98,6 @@ PROXY_PROTOCOL = get_base_config(RAG_FLOW_SERVICE_NAME, {}).get("protocol")
 
 DATABASE = decrypt_database_config(name="mysql")
 
-# Logger
-LoggerFactory.set_directory(os.path.join(get_project_base_directory(), "logs", "api"))
-# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
-LoggerFactory.LEVEL = 10
-
-stat_logger = getLogger("stat")
-access_logger = getLogger("access")
-database_logger = getLogger("database")
-
 # Switch
 # upload
 UPLOAD_DATA_FROM_CLIENT = True
@@ -144,6 +135,15 @@ CHECK_NODES_IDENTITY = False
 
 retrievaler = search.Dealer(ELASTICSEARCH)
 
+# Logger
+LoggerFactory.set_directory(os.path.join(get_project_base_directory(), "logs", "api"))
+# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
+LoggerFactory.LEVEL = 10
+
+stat_logger = getLogger("stat")
+access_logger = getLogger("access")
+database_logger = getLogger("database")
+
 class CustomEnum(Enum):
     @classmethod
     def valid(cls, value):
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 7f7c91929c9095dc73a8fb5df8eb807f479f8810..5ed59b545883ddde55c73c12f52e6e8868643c68 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -8,7 +8,7 @@ import torch
 import re
 import pdfplumber
 import logging
-from PIL import Image
+from PIL import Image, ImageDraw
 import numpy as np
 
 from api.db import ParserType
@@ -930,13 +930,25 @@ class HuParser:
 
     def crop(self, text, ZM=3):
         imgs = []
+        poss = []
         for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
             pn, left, right, top, bottom = tag.strip(
                 "#").strip("@").split("\t")
             left, right, top, bottom = float(left), float(
                 right), float(top), float(bottom)
+            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
+        if not poss: return
+
+        max_width = np.max([right-left for (_, left, right, _, _) in poss])
+        GAP = 6
+        pos = poss[0]
+        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3]-120), max(pos[3]-GAP, 0)))
+        pos = poss[-1]
+        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+GAP), min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+120)))
+
+        for ii, (pns, left, right, top, bottom) in enumerate(poss):
+            right = left + max_width
             bottom *= ZM
-            pns = [int(p) - 1 for p in pn.split("-")]
             for pn in pns[1:]:
                 bottom += self.page_images[pn - 1].size[1]
             imgs.append(
@@ -959,16 +971,21 @@
 
         if not imgs:
             return
-        GAP = 2
         height = 0
         for img in imgs:
             height += img.size[1] + GAP
         height = int(height)
+        width = int(np.max([i.size[0] for i in imgs]))
         pic = Image.new("RGB",
-                        (int(np.max([i.size[0] for i in imgs])), height),
+                        (width, height),
                         (245, 245, 245))
         height = 0
-        for img in imgs:
+        for ii, img in enumerate(imgs):
+            if ii == 0 or ii + 1 == len(imgs):
+                img = img.convert('RGBA')
+                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
+                overlay.putalpha(128)
+                img = Image.alpha_composite(img, overlay).convert("RGB")
             pic.paste(img, (0, int(height)))
             height += img.size[1] + GAP
         return pic
diff --git a/deepdoc/vision/layout_recognizer.py b/deepdoc/vision/layout_recognizer.py
index 52feabad18cbea4c96077297041690f12ef97402..39e6e92b526e791404886699bdc62e383603b51c 100644
--- a/deepdoc/vision/layout_recognizer.py
+++ b/deepdoc/vision/layout_recognizer.py
@@ -34,7 +34,7 @@ class LayoutRecognizer(Recognizer):
         "Equation",
     ]
     def __init__(self, domain):
-        super().__init__(self.labels, domain) #, os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
+        super().__init__(self.labels, domain, os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
 
     def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16):
         def __is_garbage(b):
diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py
index 8e149a5787f852d838d990f681acfc8f3b534f58..bfcf37e58ba25111b6bf8792a945b45eda8285ae 100644
--- a/deepdoc/vision/table_structure_recognizer.py
+++ b/deepdoc/vision/table_structure_recognizer.py
@@ -33,7 +33,7 @@ class TableStructureRecognizer(Recognizer):
     ]
 
     def __init__(self):
-        super().__init__(self.labels, "tsr")#,os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
+        super().__init__(self.labels, "tsr",os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
 
     def __call__(self, images, thr=0.2):
         tbls = super().__call__(images, thr)
diff --git a/rag/app/book.py b/rag/app/book.py
index 2f0066889d4834613af46b18ea5db82cc246036a..e79ee5f709e3f6600b71c2cde75bef2ced5c258a 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -13,7 +13,7 @@
 import copy
 import re
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser
 
@@ -90,25 +90,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     make_colon_as_title(sections)
     bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
     if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
-    else: cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;！？"))
+    else:
+        sections = [s.split("@") for s in sections]
+        sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
+        cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;！？"))
 
-    sections = [t for t, _ in sections]
     # is it English
-    eng = lang.lower() == "english"#is_english(random_choices(sections, k=218))
+    eng = lang.lower() == "english"#is_english(random_choices([t for t, _ in sections], k=218))
+
+    res = tokenize_table(tbls, doc, eng)
 
-    res = []
-    # add tables
-    for img, rows in tbls:
-        bs = 10
-        de = ";" if eng else ";"
-        for i in range(0, len(rows), bs):
-            d = copy.deepcopy(doc)
-            r = de.join(rows[i:i + bs])
-            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
-            tokenize(d, r, eng)
-            d["image"] = img
-            res.append(d)
-            print("TABLE", d["content_with_weight"])
     # wrap up to es documents
     for ck in cks:
         d = copy.deepcopy(doc)
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 0ef3195782aefe45ebf15bff83dd20b4ef94f32f..91a102c60c4788c81769b174ec0d6b200fcc997b 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -2,7 +2,7 @@ import copy
 import re
 
 from api.db import ParserType
-from rag.nlp import huqie, tokenize
+from rag.nlp import huqie, tokenize, tokenize_table
 from deepdoc.parser import PdfParser
 from rag.utils import num_tokens_from_string
 
@@ -81,18 +81,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     # is it English
     eng = lang.lower() == "english"#pdf_parser.is_english
 
-    res = []
-    # add tables
-    for img, rows in tbls:
-        bs = 10
-        de = ";" if eng else ";"
-        for i in range(0, len(rows), bs):
-            d = copy.deepcopy(doc)
-            r = de.join(rows[i:i + bs])
-            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
-            tokenize(d, r, eng)
-            d["image"] = img
-            res.append(d)
+    res = tokenize_table(tbls, doc, eng)
 
     i = 0
     chunk = []
diff --git a/rag/app/naive.py b/rag/app/naive.py
index c40d8542b3d4a1dfa9e7084cd132629ef48dc928..71d3d350d14218d96e45bb6e992298abd3880e74 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -13,7 +13,7 @@
 import copy
 import re
 from rag.app import laws
-from rag.nlp import huqie, is_english, tokenize, naive_merge
+from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table
 from deepdoc.parser import PdfParser
 from rag.settings import cron_logger
 
@@ -72,17 +72,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         pdf_parser = Pdf()
         sections, tbls = pdf_parser(filename if not binary else binary,
                                     from_page=from_page, to_page=to_page, callback=callback)
-        # add tables
-        for img, rows in tbls:
-            bs = 10
-            de = ";" if eng else ";"
-            for i in range(0, len(rows), bs):
-                d = copy.deepcopy(doc)
-                r = de.join(rows[i:i + bs])
-                r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
-                tokenize(d, r, eng)
-                d["image"] = img
-                res.append(d)
+        res = tokenize_table(tbls, doc, eng)
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -106,6 +96,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     # wrap up to es documents
     for ck in cks:
         print("--", ck)
+        if not ck:continue
         d = copy.deepcopy(doc)
         if pdf_parser:
             d["image"] = pdf_parser.crop(ck)
diff --git a/rag/app/paper.py b/rag/app/paper.py
index a050c1105a196ce06d8bb01b724473ac5694b624..df7a28fb9dc3dda71520d527035d11b775f1867e 100644
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -15,7 +15,7 @@ import re
 from collections import Counter
 
 from api.db import ParserType
-from rag.nlp import huqie, tokenize
+from rag.nlp import huqie, tokenize, tokenize_table
 from deepdoc.parser import PdfParser
 import numpy as np
 from rag.utils import num_tokens_from_string
@@ -158,18 +158,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     eng = lang.lower() == "english"#pdf_parser.is_english
     print("It's English.....", eng)
 
-    res = []
-    # add tables
-    for img, rows in paper["tables"]:
-        bs = 10
-        de = ";" if eng else ";"
-        for i in range(0, len(rows), bs):
-            d = copy.deepcopy(doc)
-            r = de.join(rows[i:i + bs])
-            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
-            tokenize(d, r)
-            d["image"] = img
-            res.append(d)
+    res = tokenize_table(paper["tables"], doc, eng)
 
     if paper["abstract"]:
         d = copy.deepcopy(doc)
diff --git a/rag/app/presentation.py b/rag/app/presentation.py
index a82e514a0fa0063c9a8c497abc4b940e7eb441d3..0a041ee0445d6e47556105759e37ca94a54a4b25 100644
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -20,7 +20,7 @@ from deepdoc.parser import PdfParser, PptParser
 
 class Ppt(PptParser):
     def __call__(self, fnm, from_page, to_page, callback=None):
-        txts = super.__call__(fnm, from_page, to_page)
+        txts = super().__call__(fnm, from_page, to_page)
         callback(0.5, "Text extraction finished.")
 
         import aspose.slides as slides
diff --git a/rag/app/resume.py b/rag/app/resume.py
index 8b4ca01333a0e581fd782bb600bccb660d60ebd0..044c44a86de8703a862176f74d5fbdc9c9f39e71 100644
--- a/rag/app/resume.py
+++ b/rag/app/resume.py
@@ -79,7 +79,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
     resume = remote_call(filename, binary)
     if len(resume.keys()) < 7:
         callback(-1, "Resume is not successfully parsed.")
-        return []
+        raise Exception("Resume parser remote call fail!")
     callback(0.6, "Done parsing. Chunking...")
     print(json.dumps(resume, ensure_ascii=False, indent=2))
 
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index fcb306e9849baceabe0a180c0d4fa8428910efa0..a3c8dc7dd82b31892992644ccfd170339a956152 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -1,4 +1,4 @@
-
+import copy
 from nltk.stem import PorterStemmer
 stemmer = PorterStemmer()
 
@@ -80,6 +80,20 @@ def tokenize(d, t, eng):
     d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
 
 
+def tokenize_table(tbls, doc, eng, batch_size=10):
+    res = []
+    # add tables
+    for img, rows in tbls:
+        de = "; " if eng else "; "
+        for i in range(0, len(rows), batch_size):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + batch_size])
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+    return res
+
+
 def remove_contents_table(sections, eng=False):
     i = 0
     while i < len(sections):
@@ -201,10 +215,12 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;！？"):
         tnum = num_tokens_from_string(t)
         if tnum < 8: pos = ""
         if tk_nums[-1] > chunk_token_num:
-            cks.append(t + pos)
+            if t.find(pos) < 0: t += pos
+            cks.append(t)
             tk_nums.append(tnum)
         else:
-            cks[-1] += t + pos
+            if cks[-1].find(pos) < 0: t += pos
+            cks[-1] += t
             tk_nums[-1] += tnum
 
     for sec, pos in sections:
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index e03188810f8cab7dad53a86d69b953d52f596557..7e9a24a1777129a4968b4e5e407eec588434734b 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 import json
 import re
+from copy import deepcopy
+
 from elasticsearch_dsl import Q, Search
 from typing import List, Optional, Dict, Union
 from dataclasses import dataclass
@@ -98,7 +100,7 @@ class Dealer:
                 del s["highlight"]
             q_vec = s["knn"]["query_vector"]
         es_logger.info("【Q】: {}".format(json.dumps(s)))
-        res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)
+        res = self.es.search(deepcopy(s), idxnm=idxnm, timeout="600s", src=src)
         es_logger.info("TOTAL: {}".format(self.es.getTotal(res)))
         if self.es.getTotal(res) == 0 and "knn" in s:
             bqry, _ = self.qryr.question(qst, min_match="10%")
diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py
index cd08b9f455f5bf58d7ab7f28d01300424a779ffc..29f05f9d19d651083bdbb88a5d2301e0a56fcaa8 100644
--- a/rag/svr/task_broker.py
+++ b/rag/svr/task_broker.py
@@ -90,7 +90,7 @@ def dispatch():
                 tsks.append(task)
         else:
             tsks.append(new_task())
-        print(tsks)
+
         bulk_insert_into_db(Task, tsks, True)
         set_dispatching(r["id"])
         tmf.write(str(r["update_time"]) + "\n")
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 285ce961879798f2a79258220ece01792b83390a..8d39aaa9bf2360173fe8237adbbb7806fba0bd3e 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -114,7 +114,7 @@ def build(row):
                             kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
     except Exception as e:
         if re.search("(No such file|not found)", str(e)):
-            callback(-1, "Can not find file <%s>" % row["doc_name"])
+            callback(-1, "Can not find file <%s>" % row["name"])
         else:
             callback(-1, f"Internal server error: %s" % str(e).replace("'", ""))