diff --git a/api/db/db_models.py b/api/db/db_models.py index 1057f9bf9ccfadb120bd092f3d88aaa0c1a5a46b..f28d37b7f7461b81ac8ef92c4bf3d60bfbd40773 100644 --- a/api/db/db_models.py +++ b/api/db/db_models.py @@ -354,6 +354,7 @@ class User(DataBaseModel, UserMixin): avatar = TextField(null=True, help_text="avatar base64 string") language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese") color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark") + timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai") last_login_time = DateTimeField(null=True) is_authenticated = CharField(max_length=1, null=False, default="1") is_active = CharField(max_length=1, null=False, default="1") diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 2c379fc1d7c32596fbc38bdf91c911a874538f1f..9ac5ecc0c8c824426cb695bb5bb10d8f8a7f1645 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -313,9 +313,19 @@ class HuParser: while i < len(bxs) - 1: b = bxs[i] b_ = bxs[i + 1] - if b.get("layoutno", "0") != b_.get("layoutno", "1"): + if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]: i += 1 continue + if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3: + # merge + bxs[i]["x1"] = b_["x1"] + bxs[i]["top"] = (b["top"] + b_["top"]) / 2 + bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2 + bxs[i]["text"] += b_["text"] + bxs.pop(i + 1) + continue + i += 1 + continue dis_thr = 1 dis = b["x1"] - b_["x0"] @@ -642,9 +652,9 @@ class HuParser: tk, tv = nearest(tables) fk, fv = nearest(figures) - if min(tv, fv) > 2000: - i += 1 - continue + #if min(tv, fv) > 2000: + # i += 1 + # continue if tv < fv: tables[tk].insert(0, c) logging.debug( @@ -711,12 +721,7 @@ class HuParser: # crop figure out and add caption for k, bxs in figures.items(): - txt = "\n".join( - [b["text"] for b in bxs - if not re.match(r"[0-9a-z.\+%-]", b["text"].strip()) - and len(b["text"].strip()) >= 4 - ] - ) + txt = "\n".join([b["text"] for b in bxs]) if not txt: continue diff --git a/deepdoc/vision/layout_recognizer.py b/deepdoc/vision/layout_recognizer.py index e7ad76105acb5d3c42f8a00f88712b5e50f95c24..4851b8ebefa3cec2c4d57f746188d144b6cafd9f 100644 --- a/deepdoc/vision/layout_recognizer.py +++ b/deepdoc/vision/layout_recognizer.py @@ -96,7 +96,7 @@ class LayoutRecognizer(Recognizer): continue bxs[i]["layoutno"] = f"{ty}-{ii}" - bxs[i]["layout_type"] = lts_[ii]["type"] + bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure" i += 1 for lt in ["footer", "header", "reference", "figure caption", @@ -105,7 +105,7 @@ class LayoutRecognizer(Recognizer): # add box to figure layouts which has not text box for i, lt in enumerate( - [lt for lt in lts if lt["type"] == "figure"]): + [lt for lt in lts if lt["type"] in ["figure","equation"]]): if lt.get("visited"): continue lt = deepcopy(lt) diff --git a/deepdoc/vision/ocr.py b/deepdoc/vision/ocr.py index 6e08d7c22270de5781926cc99e50aa334731a48b..13043e4591941a8835fba92516fb11719129ab88 100644 --- a/deepdoc/vision/ocr.py +++ b/deepdoc/vision/ocr.py @@ -21,7 +21,6 @@ from .operators import * import numpy as np import onnxruntime as ort -from api.utils.file_utils import get_project_base_directory from .postprocess import build_post_process from rag.settings import cron_logger diff --git a/deepdoc/vision/recognizer.py b/deepdoc/vision/recognizer.py index 3aac0086faddd636bf32591577872e9fa6d14f44..2f9123b99214a2ae79b5e9c1f3d19e54dd3a6bd8 100644 --- a/deepdoc/vision/recognizer.py +++ b/deepdoc/vision/recognizer.py @@ -276,18 +276,18 @@ class Recognizer(object): def find_overlapped_with_threashold(box, boxes, thr=0.3): if not boxes: return - max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0 + max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0 s, e = 0, len(boxes) for i in range(s, e): ov = Recognizer.overlapped_area(box, boxes[i]) _ov = Recognizer.overlapped_area(boxes[i], box) - if (ov, _ov) < (max_overlaped, _max_overlaped): + if (ov, _ov) < (max_overlapped, _max_overlapped): continue - max_overlaped_i = i - max_overlaped = ov - _max_overlaped = _ov + max_overlapped_i = i + max_overlapped = ov + _max_overlapped = _ov - return max_overlaped_i + return max_overlapped_i def preprocess(self, image_list): inputs = [] diff --git a/rag/app/naive.py b/rag/app/naive.py index 4d5ec8cbd1775e299174e15a2155db028adecf04..7b7aa4a07668ff0f8273e474e145c427897c220b 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca d = copy.deepcopy(doc) if pdf_parser: d["image"], poss = pdf_parser.crop(ck, need_position=True) - add_positions(d, poss) + add_positions(d, poss, from_page) ck = pdf_parser.remove_tag(ck) tokenize(d, ck, eng) res.append(d) @@ -112,7 +112,7 @@ if __name__ == "__main__": import sys - def dummy(a, b): + def dummy(prog=None, msg=""): pass diff --git a/rag/nlp/search.py b/rag/nlp/search.py index b87bb27a815ee9d2a1b95d557f521098029d3336..3603970125a6a318a6d56fd91101c33bc9690541 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -82,8 +82,8 @@ class Dealer: ) else: s = s.sort( - {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}}, - {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}}, + {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}}, + {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}}, {"create_time": {"order": "desc", "unmapped_type": "date"}}, {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}} )