From 8f86ab9f7ff71f222139e01ea2586b4dbbefb193 Mon Sep 17 00:00:00 2001
From: KevinHuSh <kevinhu.sh@gmail.com>
Date: Fri, 8 Mar 2024 11:24:24 +0800
Subject: [PATCH] Refine PDF parser, add time zone to user info (#112)

---
 api/db/db_models.py                 |  1 +
 deepdoc/parser/pdf_parser.py        | 25 +++++++++++++++----------
 deepdoc/vision/layout_recognizer.py |  4 ++--
 deepdoc/vision/ocr.py               |  1 -
 deepdoc/vision/recognizer.py        | 12 ++++++------
 rag/app/naive.py                    |  4 ++--
 rag/nlp/search.py                   |  4 ++--
 7 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/api/db/db_models.py b/api/db/db_models.py
index 1057f9b..f28d37b 100644
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@@ -354,6 +354,7 @@ class User(DataBaseModel, UserMixin):
     avatar = TextField(null=True, help_text="avatar base64 string")
     language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
     color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
+    timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai")
     last_login_time = DateTimeField(null=True)
     is_authenticated = CharField(max_length=1, null=False, default="1")
     is_active = CharField(max_length=1, null=False, default="1")
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 2c379fc..9ac5ecc 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -313,9 +313,19 @@ class HuParser:
         while i < len(bxs) - 1:
             b = bxs[i]
             b_ = bxs[i + 1]
-            if b.get("layoutno", "0") != b_.get("layoutno", "1"):
+            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
                 i += 1
                 continue
+            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
+                # merge
+                bxs[i]["x1"] = b_["x1"]
+                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
+                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
+                bxs[i]["text"] += b_["text"]
+                bxs.pop(i + 1)
+                continue
+            i += 1
+            continue
 
             dis_thr = 1
             dis = b["x1"] - b_["x0"]
@@ -642,9 +652,9 @@ class HuParser:
 
             tk, tv = nearest(tables)
             fk, fv = nearest(figures)
-            if min(tv, fv) > 2000:
-                i += 1
-                continue
+            #if min(tv, fv) > 2000:
+            #    i += 1
+            #    continue
             if tv < fv:
                 tables[tk].insert(0, c)
                 logging.debug(
@@ -711,12 +721,7 @@ class HuParser:
 
         # crop figure out and add caption
         for k, bxs in figures.items():
-            txt = "\n".join(
-                [b["text"] for b in bxs
-                 if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
-                 and len(b["text"].strip()) >= 4
-                 ]
-            )
+            txt = "\n".join([b["text"] for b in bxs])
             if not txt:
                 continue
 
diff --git a/deepdoc/vision/layout_recognizer.py b/deepdoc/vision/layout_recognizer.py
index e7ad761..4851b8e 100644
--- a/deepdoc/vision/layout_recognizer.py
+++ b/deepdoc/vision/layout_recognizer.py
@@ -96,7 +96,7 @@ class LayoutRecognizer(Recognizer):
                         continue
 
                     bxs[i]["layoutno"] = f"{ty}-{ii}"
-                    bxs[i]["layout_type"] = lts_[ii]["type"]
+                    bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure"
                     i += 1
 
             for lt in ["footer", "header", "reference", "figure caption",
@@ -105,7 +105,7 @@ class LayoutRecognizer(Recognizer):
 
             # add box to figure layouts which has not text box
             for i, lt in enumerate(
-                    [lt for lt in lts if lt["type"] == "figure"]):
+                    [lt for lt in lts if lt["type"] in ["figure","equation"]]):
                 if lt.get("visited"):
                     continue
                 lt = deepcopy(lt)
diff --git a/deepdoc/vision/ocr.py b/deepdoc/vision/ocr.py
index 6e08d7c..13043e4 100644
--- a/deepdoc/vision/ocr.py
+++ b/deepdoc/vision/ocr.py
@@ -21,7 +21,6 @@ from .operators import *
 import numpy as np
 import onnxruntime as ort
 
-from api.utils.file_utils import get_project_base_directory
 from .postprocess import build_post_process
 from rag.settings import cron_logger
 
diff --git a/deepdoc/vision/recognizer.py b/deepdoc/vision/recognizer.py
index 3aac008..2f9123b 100644
--- a/deepdoc/vision/recognizer.py
+++ b/deepdoc/vision/recognizer.py
@@ -276,18 +276,18 @@ class Recognizer(object):
     def find_overlapped_with_threashold(box, boxes, thr=0.3):
         if not boxes:
             return
-        max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
+        max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
         s, e = 0, len(boxes)
         for i in range(s, e):
             ov = Recognizer.overlapped_area(box, boxes[i])
             _ov = Recognizer.overlapped_area(boxes[i], box)
-            if (ov, _ov) < (max_overlaped, _max_overlaped):
+            if (ov, _ov) < (max_overlapped, _max_overlapped):
                 continue
-            max_overlaped_i = i
-            max_overlaped = ov
-            _max_overlaped = _ov
+            max_overlapped_i = i
+            max_overlapped = ov
+            _max_overlapped = _ov
 
-        return max_overlaped_i
+        return max_overlapped_i
 
     def preprocess(self, image_list):
         inputs = []
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 4d5ec8c..7b7aa4a 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         d = copy.deepcopy(doc)
         if pdf_parser:
             d["image"], poss = pdf_parser.crop(ck, need_position=True)
-            add_positions(d, poss)
+            add_positions(d, poss, from_page)
             ck = pdf_parser.remove_tag(ck)
         tokenize(d, ck, eng)
         res.append(d)
@@ -112,7 +112,7 @@ if __name__ == "__main__":
     import sys
 
 
-    def dummy(a, b):
+    def dummy(prog=None, msg=""):
         pass
 
 
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index b87bb27..3603970 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -82,8 +82,8 @@ class Dealer:
                 )
             else:
                 s = s.sort(
-                    {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
-                    {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
+                    {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
+                    {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
                     {"create_time": {"order": "desc", "unmapped_type": "date"}},
                     {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
                 )
-- 
GitLab