diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index 2d8aeb006e03ec86562c1ea3f7cbdd9dc3b1529e..886a02f9a8460595eb4b8a99a0b41251c9db7166 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -60,7 +60,8 @@ def list(): for id in sres.ids: d = { "chunk_id": id, - "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get("content_with_weight", ""), + "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get( + "content_with_weight", ""), "doc_id": sres.field[id]["doc_id"], "docnm_kwd": sres.field[id]["docnm_kwd"], "important_kwd": sres.field[id].get("important_kwd", []), @@ -68,10 +69,12 @@ def list(): "available_int": sres.field[id].get("available_int", 1), "positions": sres.field[id].get("position_int", "").split("\t") } - poss = [] - for i in range(0, len(d["positions"]), 5): - poss.append([float(d["positions"][i]), float(d["positions"][i+1]), float(d["positions"][i+2]), float(d["positions"][i+3]), float(d["positions"][i+4])]) - d["positions"] = poss + if len(d["positions"]) % 5 == 0: + poss = [] + for i in range(0, len(d["positions"]), 5): + poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]), + float(d["positions"][i + 3]), float(d["positions"][i + 4])]) + d["positions"] = poss res["chunks"].append(d) return get_json_result(data=res) except Exception as e: @@ -137,10 +140,10 @@ def set(): return get_data_error_result(retmsg="Document not found!") if doc.parser_id == ParserType.QA: - arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t)>1] + arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1] if len(arr) != 2: return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.") q, a = rmPrefix(arr[0]), rmPrefix[arr[1]] - d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q+a])) + d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a])) v, c = 
embd_mdl.encode([doc.name, req["content_with_weight"]]) v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] @@ -189,7 +192,8 @@ def create(): md5 = hashlib.md5() md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8")) chunck_id = md5.hexdigest() - d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]), "content_with_weight": req["content_with_weight"]} + d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]), + "content_with_weight": req["content_with_weight"]} d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) d["important_kwd"] = req.get("important_kwd", []) d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", []))) diff --git a/api/db/db_models.py b/api/db/db_models.py index 4fb66d58780327a66ba83aa65d147c40d7a8c3c2..1057f9bf9ccfadb120bd092f3d88aaa0c1a5a46b 100644 --- a/api/db/db_models.py +++ b/api/db/db_models.py @@ -527,7 +527,7 @@ class Dialog(DataBaseModel): tenant_id = CharField(max_length=32, null=False) name = CharField(max_length=255, null=True, help_text="dialog application name") description = TextField(null=True, help_text="Dialog description") - icon = CharField(max_length=16, null=False, help_text="dialog icon") + icon = TextField(null=True, help_text="icon base64 string") language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese") llm_id = CharField(max_length=32, null=False, help_text="default llm ID") llm_setting = JSONField(null=False, default={"temperature": 0.1, "top_p": 0.3, "frequency_penalty": 0.7, diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 92928c47450b4f6d6ce937b57d5ccec0e8d09d83..8e03d7d9e751e164790267d658b33abb05eebbbf 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -35,6 +35,7 @@ class HuParser: self.updown_cnt_mdl.set_param({"device": "cuda"}) self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0", 
filename="updown_concat_xgb.model")) + self.page_from = 0 """ If you have trouble downloading HuggingFace models, -_^ this might help!! @@ -683,7 +684,7 @@ class HuParser: "layoutno", ""))) left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"] - poss.append((pn, left, right, top, bott)) + poss.append((pn+self.page_from, left, right, top, bott)) return self.page_images[pn] \ .crop((left * ZM, top * ZM, right * ZM, bott * ZM)) @@ -863,6 +864,7 @@ class HuParser: self.garbages = {} self.page_cum_height = [0] self.page_layout = [] + self.page_from = page_from try: self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm)) self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in @@ -947,7 +949,9 @@ class HuParser: left, right, top, bottom = float(left), float( right), float(top), float(bottom) poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) - if not poss: return + if not poss: + if need_position: return None, None + return max_width = np.max([right-left for (_, left, right, _, _) in poss]) GAP = 6 @@ -969,7 +973,8 @@ class HuParser: bottom, self.page_images[pns[0]].size[1]) )) ) - positions.append((pns[0], left, right, top, min( + if 0 < ii < len(poss)-1: + positions.append((pns[0]+self.page_from, left, right, top, min( bottom, self.page_images[pns[0]].size[1])/ZM)) bottom -= self.page_images[pns[0]].size[1] for pn in pns[1:]: @@ -980,8 +985,9 @@ class HuParser: self.page_images[pn].size[1]) )) ) - positions.append((pn, left, right, 0, min( - bottom, self.page_images[pn].size[1]) / ZM)) + if 0 < ii < len(poss) - 1: + positions.append((pn+self.page_from, left, right, 0, min( + bottom, self.page_images[pn].size[1]) / ZM)) bottom -= self.page_images[pn].size[1] if not imgs: diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index c8b39d1887cef997eecc5c4b7e90f93d4cef745c..f1e7c1ec666280fa5f774c14cc4c3362258ca284 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ 
-10,7 +10,7 @@ PY=/root/miniconda3/envs/py11/bin/python function task_exe(){ sleep 60; - while [ 1 -eq 1 ];do mpirun -n 2 --allow-run-as-root $PY rag/svr/task_executor.py ; done + while [ 1 -eq 1 ];do mpirun -n 4 --allow-run-as-root $PY rag/svr/task_executor.py ; done } function watch_broker(){ diff --git a/rag/app/book.py b/rag/app/book.py index 4a5ce626bf2354985839e2d375ccd881536ed874..df962cbc3295207067eada99ceb126a934fcc0ee 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -41,7 +41,7 @@ class Pdf(PdfParser): self._filter_forpages() self._merge_with_same_bullet() callback(0.75, "Text merging finished.") - tbls = self._extract_table_figure(True, zoomin, False, True) + tbls = self._extract_table_figure(True, zoomin, True, True) callback(0.8, "Text extraction finished") diff --git a/rag/app/manual.py b/rag/app/manual.py index 98f52af3c9a4680af96a803e759bcffad9c63160..68b3faf63d478546b35614a85928e16d53c4b0f5 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -33,7 +33,7 @@ class Pdf(PdfParser): self._concat_downward(concat_between_pages=False) self._filter_forpages() callback(0.77, "Text merging finished") - tbls = self._extract_table_figure(True, zoomin, False, True) + tbls = self._extract_table_figure(True, zoomin, True, True) # clean mess for b in self.boxes: diff --git a/rag/app/naive.py b/rag/app/naive.py index b80d600b3db2a45f009d2534aeaaeadd00f44895..72f53a9d6862dcb98aa63e2fbd5384fd97ad8cc0 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -40,7 +40,7 @@ class Pdf(PdfParser): self._concat_downward(concat_between_pages=False) self._filter_forpages() callback(0.77, "Text merging finished") - tbls = self._extract_table_figure(True, zoomin, False, True) + tbls = self._extract_table_figure(True, zoomin, True, True) cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1))) #self._naive_vertical_merge() diff --git a/rag/app/paper.py b/rag/app/paper.py index 
a34afd235d8dac6947a7b54938092719cfe7b90a..8738ddb2098023242c45f6575a1ea62e4d60b344 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -48,7 +48,7 @@ class Pdf(PdfParser): self._concat_downward(concat_between_pages=False) self._filter_forpages() callback(0.75, "Text merging finished.") - tbls = self._extract_table_figure(True, zoomin, False, True) + tbls = self._extract_table_figure(True, zoomin, True, True) # clean mess if column_width < self.page_images[0].size[0] / zoomin / 2: diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 6a571df7e796c5f03b5400b722f7eefdc1ba2150..f2c9f436ff802adc59bfb1eb6ce5cbbbc1f12956 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -246,6 +246,8 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"): tk_nums[-1] += tnum for sec, pos in sections: + add_chunk(sec, pos) + continue s, e = 0, 1 while e < len(sec): if sec[e] in delimiter: diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 8be50c1397a31bed45dc198e67fdd22ce18a0f62..f229055ddf9ff5cbe9015851c9e9073344da5452 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -83,7 +83,7 @@ class Dealer: else: s = s.sort( {"page_num_int": {"order": "asc", "unmapped_type": "float"}}, - {"top_int": {"order": "asc", "unmapped_type": "float"}}, + {"top_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}}, {"create_time": {"order": "desc", "unmapped_type": "date"}}, {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}} ) diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py index 29f05f9d19d651083bdbb88a5d2301e0a56fcaa8..8b2648e79334aa9b086365e534d4dd526612d029 100644 --- a/rag/svr/task_broker.py +++ b/rag/svr/task_broker.py @@ -83,10 +83,10 @@ def dispatch(): pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"])) for s,e in r["parser_config"].get("pages", [(0,100000)]): e = min(e, pages) - for p in range(s, e, 10): + for p in range(s, e, 5): task = new_task() task["from_page"] = p
- task["to_page"] = min(p + 10, e) + task["to_page"] = min(p + 5, e) tsks.append(task) else: tsks.append(new_task())