diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py
index c85e0b3cd634f3ea8582e183fe5e716717efef92..ba6d60024cb97998c4aea532500a4969b3717c73 100644
--- a/api/apps/conversation_app.py
+++ b/api/apps/conversation_app.py
@@ -183,9 +183,7 @@ def chat(dialog, messages, **kwargs):
     ## try to use sql if field mapping is good to go
     if field_map:
         chat_logger.info("Use SQL to retrieval:{}".format(questions[-1]))
-        markdown_tbl, chunks = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
-        if markdown_tbl:
-            return {"answer": markdown_tbl, "reference": {"chunks": chunks, "doc_aggs": []}}
+        return use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
 
     prompt_config = dialog.prompt_config
     for p in prompt_config["parameters"]:
@@ -311,7 +309,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)]
 
     # compose markdown table
-    clmns = "|"+"|".join([re.sub(r"(/.*|（[^（）]+）)", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|")
+    clmns = "|"+"|".join([re.sub(r"(/.*|（[^（）]+）)", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|Source|" if docid_idx and docid_idx else "|")
     line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
     rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
     if not docid_idx or not docnm_idx:
@@ -322,4 +320,8 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
     docid_idx = list(docid_idx)[0]
     docnm_idx = list(docnm_idx)[0]
-    return "\n".join([clmns, line, rows]), [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]]
+    return {
+        "answer": "\n".join([clmns, line, rows]),
+        "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
+                      "doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
+    }
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 079376df6f40d4780f98a609877436aac62e1b96..767cfcfd0a2f1867388034fd099841d4150081a2 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -996,7 +996,7 @@ class HuParser:
             if need_position:
                 return None, None
             return
-        max_width = np.max([right - left for (_, left, right, _, _) in poss])
+        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
         GAP = 6
         pos = poss[0]
         poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
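For context on the conversation_app.py change: `use_sql` now builds the complete chat response itself instead of handing a markdown table back to `chat` for wrapping, and it populates `doc_aggs` rather than leaving it empty. A minimal sketch of the payload shape it now returns; the field names come from the diff, while the sample rows and answer string are made up:

```python
# Sketch of the response shape use_sql now returns; sample data is hypothetical.
rows = [("doc_1", "contract_a.pdf"), ("doc_2", "contract_b.pdf")]

response = {
    "answer": "|Name|Source|\n|------|------|\n|A|contract_a.pdf|",  # markdown table
    "reference": {
        # one chunk entry per SQL result row, keyed as the diff composes them
        "chunks": [{"doc_id": i, "docnm_kwd": n} for i, n in rows],
        # doc_aggs is new: a per-document entry with a count the UI can aggregate
        "doc_aggs": [{"doc_id": i, "doc_name": n, "count": 1} for i, n in rows],
    },
}
print(response["reference"]["doc_aggs"])
```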
"ocr.res"), + "character_dict_path": os.path.join(model_dir, "ocr.res"), "use_space_char": True } self.postprocess_op = build_post_process(postprocess_params) diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py index bfcf37e58ba25111b6bf8792a945b45eda8285ae..be2430a03c414c5588882f9fe71794054db3e8cc 100644 --- a/deepdoc/vision/table_structure_recognizer.py +++ b/deepdoc/vision/table_structure_recognizer.py @@ -16,6 +16,7 @@ import re from collections import Counter import numpy as np +from huggingface_hub import snapshot_download from api.utils.file_utils import get_project_base_directory from rag.nlp import huqie @@ -33,7 +34,8 @@ class TableStructureRecognizer(Recognizer): ] def __init__(self): - super().__init__(self.labels, "tsr",os.path.join(get_project_base_directory(), "rag/res/deepdoc/")) + model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") + super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/")) def __call__(self, images, thr=0.2): tbls = super().__call__(images, thr) diff --git a/rag/app/laws.py b/rag/app/laws.py index d5b29e5ec7c8be84ceadcfef1c5e9d695afd0eb4..94a1e7a4565fa85957ec8dcd7420b40bddbd774a 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -68,7 +68,7 @@ class Pdf(PdfParser): callback(0.8, "Text extraction finished") - return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes] + return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): @@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca elif re.search(r"\.pdf$", filename, re.IGNORECASE): pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser() for txt, poss in pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback): + from_page=from_page, to_page=to_page, callback=callback)[0]: sections.append(txt + poss) elif re.search(r"\.txt$", filename, re.IGNORECASE): diff --git a/rag/app/paper.py b/rag/app/paper.py index 37b4df990e2fe4c28e6d101f20163d9e0549fee7..c3cb2980597df71bea67c86b5789d3a6f78b4fdf 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca "title": filename, "authors": " ", "abstract": "", - "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page), + "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0], "tables": [] } else: diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 356414542e5a353308332e2e8a28586111baff69..1d1c38eed9113e7abd0bed04a1aa1bf0b261bf27 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -66,7 +66,7 @@ class Pdf(PdfParser): class PlainPdf(PlainParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): - self.pdf = pdf2_read(filename if not binary else BytesIO(filename)) + self.pdf = pdf2_read(filename if not binary else BytesIO(binary)) page_txt = [] for page in self.pdf.pages[from_page: to_page]: page_txt.append(page.extract_text()) diff --git a/rag/app/resume.py b/rag/app/resume.py index 044c44a86de8703a862176f74d5fbdc9c9f39e71..cced85689e248efd87535ae63e561efc35ed7198 100644 --- a/rag/app/resume.py +++ b/rag/app/resume.py @@ -40,7 +40,7 @@ def 
diff --git a/rag/app/resume.py b/rag/app/resume.py
index 044c44a86de8703a862176f74d5fbdc9c9f39e71..cced85689e248efd87535ae63e561efc35ed7198 100644
--- a/rag/app/resume.py
+++ b/rag/app/resume.py
@@ -40,7 +40,7 @@ def remote_call(filename, binary):
             "encrypt_type": "base64",
             "filename": filename,
             "langtype": '',
-            "fileori": base64.b64encode(binary.stream.read()).decode('utf-8')
+            "fileori": base64.b64encode(binary).decode('utf-8')
         },
         "c": "resume_parse_module",
         "m": "resume_parse"
diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py
index 7ce0d8771293e5f4b292c250badc3f81f044f4fd..f446a2a5e793d6721d5289358d3e381fa4f17c79 100644
--- a/rag/llm/embedding_model.py
+++ b/rag/llm/embedding_model.py
@@ -20,10 +20,10 @@ from openai import OpenAI
 from FlagEmbedding import FlagModel
 import torch
 import numpy as np
-
+from huggingface_hub import snapshot_download
 from rag.utils import num_tokens_from_string
 
-flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
+flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
                        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章：",
                        use_fp16=torch.cuda.is_available())
 
diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index 8359cef467e7ad3f86e106a8b7769bc44c2d40e0..aac5d2aa78b5f02e250a2c4776ed5e5e3621cca7 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -53,7 +53,7 @@ class EsQueryer:
 
         if not self.isChinese(txt):
             tks = huqie.qie(txt).split(" ")
-            q = tks
+            q = copy.deepcopy(tks)
             for i in range(1, len(tks)):
                 q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
             if not q:
@@ -138,7 +138,7 @@ class EsQueryer:
 
         def toDict(tks):
             d = {}
-            if isinstance(tks, type("")):
+            if isinstance(tks, str):
                 tks = tks.split(" ")
             for t, c in self.tw.weights(tks):
                 if t not in d:
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index 03be167f791dd4e8d4de382f4447e0e6530935b4..94fbe8e217fa6687095bb62833c58362d25c1a28 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -234,13 +234,13 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))
 
-        chunks_tks = [huqie.qie(ck).split(" ") for ck in chunks]
+        chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
         cites = {}
         for i, a in enumerate(pieces_):
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
                                                             huqie.qie(
-                                                                pieces_[i]).split(" "),
+                                                                self.qryr.rmWWW(pieces_[i])).split(" "),
                                                             chunks_tks,
                                                             tkweight, vtweight)
             mx = np.max(sim) * 0.99
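The `q = copy.deepcopy(tks)` fix in query.py repairs an aliasing bug: with plain assignment, `q` and `tks` name the same list object, so appending the quoted bigrams to `q` silently grows `tks` too, polluting the token list for any later use. A self-contained demonstration of the difference:

```python
import copy

tks = ["machine", "learning"]
q = tks                              # aliasing: one list, two names
q.append('"machine learning"^2')
print(tks)   # ['machine', 'learning', '"machine learning"^2'] -- tks mutated

tks = ["machine", "learning"]
q = copy.deepcopy(tks)               # the fix: an independent copy
q.append('"machine learning"^2')
print(tks)   # ['machine', 'learning'] -- original tokens intact
```

For a flat list of strings a shallow copy such as `tks[:]` would behave the same; `deepcopy` is simply the conservative choice.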
-]+$", t): + return 300 elif len(t) >= 4: s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1] if len(s) > 1: diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py index 2e7216f9ed060469824d1ba7ce464d9d9dc91518..87a296570bf380d0d1433e4d1b4e1ea8a542952a 100644 --- a/rag/svr/task_broker.py +++ b/rag/svr/task_broker.py @@ -87,7 +87,9 @@ def dispatch(): if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22) if r["parser_id"] == "one": page_size = 1000000000 if not do_layout: page_size = 1000000000 - for s,e in r["parser_config"].get("pages", [(1, 100000)]): + page_ranges = r["parser_config"].get("pages") + if not page_ranges: page_ranges = [(1, 100000)] + for s,e in page_ranges: s -= 1 s = max(0, s) e = min(e-1, pages)