From 6999598101aad554da03df753a27e60ccc1b99f0 Mon Sep 17 00:00:00 2001 From: KevinHuSh <kevinhu.sh@gmail.com> Date: Wed, 20 Mar 2024 16:56:16 +0800 Subject: [PATCH] refine for English corpus (#135) --- deepdoc/parser/excel_parser.py | 21 ++++++++ deepdoc/parser/pdf_parser.py | 95 ++++++++++++++++++++++------------ rag/app/manual.py | 17 +----- rag/app/naive.py | 6 ++- rag/app/paper.py | 88 +++++++++++++++++++++---------- rag/app/qa.py | 10 ++-- rag/app/table.py | 6 +-- rag/nlp/__init__.py | 17 ++---- rag/nlp/huqie.py | 16 +++++- rag/nlp/query.py | 34 ++++++------ rag/nlp/search.py | 25 +++++++-- rag/svr/task_broker.py | 6 ++- 12 files changed, 216 insertions(+), 125 deletions(-) diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index 79c45e8..4b436ec 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -5,6 +5,27 @@ from io import BytesIO class HuExcelParser: + def html(self, fnm): + if isinstance(fnm, str): + wb = load_workbook(fnm) + else: + wb = load_workbook(BytesIO(fnm)) + tb = "" + for sheetname in wb.sheetnames: + ws = wb[sheetname] + rows = list(ws.rows) + tb += f"<table><caption>{sheetname}</caption><tr>" + for t in list(rows[0]): tb += f"<th>{t.value}</th>" + tb += "</tr>" + for r in list(rows[1:]): + tb += "<tr>" + for i,c in enumerate(r): + if c.value is None: tb += "<td></td>" + else: tb += f"<td>{c.value}</td>" + tb += "</tr>" + tb += "</table>\n" + return tb + def __call__(self, fnm): if isinstance(fnm, str): wb = load_workbook(fnm) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 2f6bec0..4d8e025 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -17,7 +17,6 @@ from rag.nlp import huqie from copy import deepcopy from huggingface_hub import hf_hub_download - logging.getLogger("pdfminer").setLevel(logging.WARNING) @@ -25,7 +24,7 @@ class HuParser: def __init__(self): self.ocr = OCR() if hasattr(self, "model_speciess"): - self.layouter = LayoutRecognizer("layout."+self.model_speciess) + self.layouter = LayoutRecognizer("layout." 
+ self.model_speciess) else: self.layouter = LayoutRecognizer("layout") self.tbl_det = TableStructureRecognizer() @@ -141,7 +140,7 @@ class HuParser: for j in range(i, -1, -1): # restore the order using th if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \ - and arr[j + 1]["top"] < arr[j]["top"]\ + and arr[j + 1]["top"] < arr[j]["top"] \ and arr[j + 1]["page_number"] == arr[j]["page_number"]: tmp = arr[j] arr[j] = arr[j + 1] @@ -278,8 +277,10 @@ class HuParser: for b in bxs: if not b["text"]: - left, right, top, bott = b["x0"]*ZM, b["x1"]*ZM, b["top"]*ZM, b["bottom"]*ZM - b["text"] = self.ocr.recognize(np.array(img), np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32)) + left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM + b["text"] = self.ocr.recognize(np.array(img), + np.array([[left, top], [right, top], [right, bott], [left, bott]], + dtype=np.float32)) del b["txt"] bxs = [b for b in bxs if b["text"]] if self.mean_height[-1] == 0: @@ -315,7 +316,8 @@ class HuParser: while i < len(bxs) - 1: b = bxs[i] b_ = bxs[i + 1] - if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]: + if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", + "equation"]: i += 1 continue if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3: @@ -376,9 +378,13 @@ class HuParser: b["page_number"] == b_["page_number"] and b_["top"] - \ b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5, b["page_number"] < b_["page_number"] and abs( - b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4 + b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4, ] - if any(feats) and not any(concatting_feats): + # split features + detach_feats = [b["x1"] < b_["x0"], + b["x0"] > b_["x1"]] + if (any(feats) and not any(concatting_feats)) or any(detach_feats): + print(b["text"], b_["text"], any(feats), any(concatting_feats), any(detach_feats)) i += 1 continue # merge up and down @@ -503,18 +509,21 @@ class HuParser: findit = False i = 0 while i < len(self.boxes): - if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())): + if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", + re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())): i += 1 continue findit = True eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip()) self.boxes.pop(i) if i >= len(self.boxes): break - prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2]) + prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join( + self.boxes[i]["text"].strip().split(" ")[:2]) while not prefix: self.boxes.pop(i) if i >= len(self.boxes): break - prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2]) + prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join( + self.boxes[i]["text"].strip().split(" ")[:2]) self.boxes.pop(i) if i >= len(self.boxes) or not prefix: break for j in range(i, min(i + 128, len(self.boxes))): @@ -522,13 +531,13 @@ class HuParser: continue for k in range(i, j): self.boxes.pop(i) break - if findit:return + if findit: return page_dirty = [0] * len(self.page_images) for b in self.boxes: if re.search(r"(··|··|··)", b["text"]): - page_dirty[b["page_number"]-1] += 1 - page_dirty = set([i+1 for i, t 
in enumerate(page_dirty) if t > 3]) + page_dirty[b["page_number"] - 1] += 1 + page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3]) if not page_dirty: return i = 0 while i < len(self.boxes): @@ -546,7 +555,7 @@ class HuParser: self.boxes.pop(i) continue if not b_["text"].strip(): - self.boxes.pop(i+1) + self.boxes.pop(i + 1) continue if b["text"].strip()[0] != b_["text"].strip()[0] \ @@ -574,8 +583,10 @@ class HuParser: continue lout_no = str(self.boxes[i]["page_number"]) + \ "-" + str(self.boxes[i]["layoutno"]) - if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title", - "figure caption", "reference"]: + if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", + "title", + "figure caption", + "reference"]: nomerge_lout_no.append(lst_lout_no) if self.boxes[i]["layout_type"] == "table": if re.match(r"(数据|资料|图表)*来ćş[:: ]", self.boxes[i]["text"]): @@ -654,7 +665,7 @@ class HuParser: tk, tv = nearest(tables) fk, fv = nearest(figures) - #if min(tv, fv) > 2000: + # if min(tv, fv) > 2000: # i += 1 # continue if tv < fv and tk: @@ -699,7 +710,7 @@ class HuParser: "layoutno", ""))) left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"] - poss.append((pn+self.page_from, left, right, top, bott)) + poss.append((pn + self.page_from, left, right, top, bott)) return self.page_images[pn] \ .crop((left * ZM, top * ZM, right * ZM, bott * ZM)) @@ -738,7 +749,7 @@ class HuParser: for k, bxs in tables.items(): if not bxs: continue - bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"]-b["top"])/2 for b in bxs])) + bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs])) poss = [] res.append((cropout(bxs, "table", poss), self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english))) @@ -879,7 +890,8 @@ class HuParser: self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm)) self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in enumerate(self.pdf.pages[page_from:page_to])] - self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]] + self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in + self.pdf.pages[page_from:page_to]] self.total_page = len(self.pdf.pages) except Exception as e: self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf") @@ -888,8 +900,8 @@ class HuParser: mat = fitz.Matrix(zoomin, zoomin) self.total_page = len(self.pdf) for i, page in enumerate(self.pdf): - if i < page_from:continue - if i >= page_to:break + if i < page_from: continue + if i >= page_to: break pix = page.get_pixmap(matrix=mat) img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) @@ -897,7 +909,9 @@ class HuParser: self.page_chars.append([]) logging.info("Images converted.") - self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))] + self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join( + random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in + range(len(self.page_chars))] if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2: self.is_english = True else: @@ -927,11 +941,12 @@ class HuParser: 
# self.page_cum_height.append( # np.max([c["bottom"] for c in chars])) self.__ocr(i + 1, img, chars, zoomin) - if callback: callback(prog=(i+1)*0.6/len(self.page_images), msg="") + if callback: callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") if not self.is_english and not any([c for c in self.page_chars]) and self.boxes: bxes = [b for bxs in self.boxes for b in bxs] - self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])) + self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", + "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])) logging.info("Is it English:", self.is_english) @@ -964,12 +979,13 @@ class HuParser: if need_position: return None, None return - max_width = np.max([right-left for (_, left, right, _, _) in poss]) + max_width = np.max([right - left for (_, left, right, _, _) in poss]) GAP = 6 pos = poss[0] - poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3]-120), max(pos[3]-GAP, 0))) + poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0))) pos = poss[-1] - poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+GAP), min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+120))) + poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP), + min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120))) positions = [] for ii, (pns, left, right, top, bottom) in enumerate(poss): @@ -984,9 +1000,9 @@ class HuParser: bottom, self.page_images[pns[0]].size[1]) )) ) - if 0 < ii < len(poss)-1: - positions.append((pns[0]+self.page_from, left, right, top, min( - bottom, self.page_images[pns[0]].size[1])/ZM)) + if 0 < ii < len(poss) - 1: + positions.append((pns[0] + self.page_from, left, right, top, min( + bottom, self.page_images[pns[0]].size[1]) / ZM)) bottom -= self.page_images[pns[0]].size[1] for pn in pns[1:]: imgs.append( @@ -997,7 +1013,7 @@ class HuParser: )) ) if 0 < ii < len(poss) - 1: - positions.append((pn+self.page_from, left, right, 0, min( + positions.append((pn + self.page_from, left, right, 0, min( bottom, self.page_images[pn].size[1]) / ZM)) bottom -= self.page_images[pn].size[1] @@ -1026,6 +1042,19 @@ class HuParser: return pic, positions return pic + def get_position(self, bx, ZM): + poss = [] + pn = bx["page_number"] + top = bx["top"] - self.page_cum_height[pn - 1] + bott = bx["bottom"] - self.page_cum_height[pn - 1] + poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM))) + while bott * ZM > self.page_images[pn - 1].size[1]: + bott -= self.page_images[pn - 1].size[1] / ZM + top = 0 + pn += 1 + poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM))) + return poss + if __name__ == "__main__": pass diff --git a/rag/app/manual.py b/rag/app/manual.py index b3dc688..b8b4d7a 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -30,19 +30,6 @@ class Pdf(PdfParser): # print(b) print("OCR:", timer()-start) - def get_position(bx): - poss = [] - pn = bx["page_number"] - top = bx["top"] - self.page_cum_height[pn - 1] - bott = bx["bottom"] - self.page_cum_height[pn - 1] - poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn-1].size[1]/zoomin))) - while bott * zoomin > self.page_images[pn - 1].size[1]: - bott -= self.page_images[pn- 1].size[1] / zoomin - top = 0 - pn += 1 - poss.append((pn, bx["x0"], 
bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / zoomin))) - return poss - def tag(pn, left, right, top, bottom): return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ .format(pn, left, right, top, bottom) @@ -54,7 +41,7 @@ class Pdf(PdfParser): callback(0.67, "Table analysis finished.") self._text_merge() tbls = self._extract_table_figure(True, zoomin, True, True) - self._naive_vertical_merge() + self._concat_downward() self._filter_forpages() callback(0.68, "Text merging finished") @@ -74,7 +61,7 @@ class Pdf(PdfParser): sec_ids.append(sid) #print(lvl, self.boxes[i]["text"], most_level) - sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)] + sections = [(b["text"], sec_ids[i], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)] for (img, rows), poss in tbls: sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss])) diff --git a/rag/app/naive.py b/rag/app/naive.py index cc48f5f..4c82e56 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -14,7 +14,7 @@ import copy import re from rag.app import laws from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions -from deepdoc.parser import PdfParser +from deepdoc.parser import PdfParser, ExcelParser from rag.settings import cron_logger @@ -74,6 +74,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca sections, tbls = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) res = tokenize_table(tbls, doc, eng) + elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + excel_parser = ExcelParser() + sections = [(excel_parser.html(binary), "")] elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = "" diff --git a/rag/app/paper.py b/rag/app/paper.py index 19efa22..c993502 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -15,7 +15,7 @@ import re from collections import Counter from api.db import ParserType -from rag.nlp import huqie, tokenize, tokenize_table, add_positions +from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency from deepdoc.parser import PdfParser import numpy as np from rag.utils import num_tokens_from_string @@ -46,11 +46,11 @@ class Pdf(PdfParser): self._table_transformer_job(zoomin) callback(0.68, "Table analysis finished") self._text_merge() + tbls = self._extract_table_figure(True, zoomin, True, True) column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) - self._concat_downward(concat_between_pages=False) + self._concat_downward() self._filter_forpages() callback(0.75, "Text merging finished.") - tbls = self._extract_table_figure(True, zoomin, True, True) # clean mess if column_width < self.page_images[0].size[0] / zoomin / 2: @@ -59,24 +59,24 @@ class Pdf(PdfParser): self.boxes = self.sort_X_by_page(self.boxes, column_width / 2) for b in self.boxes: b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip()) - freq = Counter([b["text"] for b in self.boxes]) - garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6]) - i = 0 - while i < len(self.boxes): - if self.boxes[i]["text"] in garbage \ - or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \ - or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]): - self.boxes.pop(i) - elif i + 1 < len(self.boxes) and 
self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", - '1'): - # merge within same layouts - self.boxes[i + 1]["top"] = self.boxes[i]["top"] - self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"]) - self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"]) - self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"] - self.boxes.pop(i) - else: - i += 1 + # freq = Counter([b["text"] for b in self.boxes]) + # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6]) + # i = 0 + # while i < len(self.boxes): + # if self.boxes[i]["text"] in garbage \ + # or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \ + # or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]): + # self.boxes.pop(i) + # elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", + # '1'): + # # merge within same layouts + # self.boxes[i + 1]["top"] = self.boxes[i]["top"] + # self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"]) + # self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"]) + # self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"] + # self.boxes.pop(i) + # else: + # i += 1 def _begin(txt): return re.match( @@ -88,7 +88,7 @@ class Pdf(PdfParser): "title":"", "authors": "", "abstract": "", - "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if + "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if re.match(r"(text|title)", b.get("layoutno", "text"))], "tables": tbls } @@ -119,11 +119,10 @@ class Pdf(PdfParser): if re.match("(abstract|ć‘č¦)", txt): if len(txt.split(" ")) > 32 or len(txt) > 64: abstr = txt + self._line_tag(b, zoomin) - i += 1 break - txt = self.boxes[i + 1]["text"].lower().strip() + txt = self.boxes[i]["text"].lower().strip() if len(txt.split(" ")) > 32 or len(txt) > 64: - abstr = txt + self._line_tag(self.boxes[i + 1], zoomin) + abstr = txt + self._line_tag(self.boxes[i], zoomin) i += 1 break if not abstr: i = 0 @@ -136,7 +135,7 @@ class Pdf(PdfParser): "title": title if title else filename, "authors": " ".join(authors), "abstract": abstr, - "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if + "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if re.match(r"(text|title)", b.get("layoutno", "text"))], "tables": tbls } @@ -153,7 +152,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca paper = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) else: raise NotImplementedError("file type not supported yet(pdf supported)") - doc = {"docnm_kwd": filename, "authors_tks": paper["authors"], + + doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]), "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)} doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"]) @@ -173,6 +173,38 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca tokenize(d, txt, eng) res.append(d) + sorted_sections = paper["sections"] + # set pivot using the most frequent type of title, + # then merge between 2 pivot + bull = bullets_category([txt for txt, _ in 
sorted_sections]) + most_level, levels = title_frequency(bull, sorted_sections) + assert len(sorted_sections) == len(levels) + sec_ids = [] + sid = 0 + for i, lvl in enumerate(levels): + if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1 + sec_ids.append(sid) + print(lvl, sorted_sections[i][0], most_level, sid) + + chunks = [] + last_sid = -2 + for (txt, _), sec_id in zip(sorted_sections, sec_ids): + if sec_id == last_sid: + if chunks: + chunks[-1] += "\n" + txt + continue + chunks.append(txt) + last_sid = sec_id + for txt in chunks: + d = copy.deepcopy(doc) + d["image"], poss = pdf_parser.crop(txt, need_position=True) + add_positions(d, poss) + tokenize(d, pdf_parser.remove_tag(txt), eng) + res.append(d) + print("----------------------\n", pdf_parser.remove_tag(txt)) + + return res + readed = [0] * len(paper["lines"]) # find colon firstly i = 0 @@ -252,6 +284,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca if __name__ == "__main__": import sys - def dummy(a, b): + def dummy(prog=None, msg=""): pass chunk(sys.argv[1], callback=dummy) diff --git a/rag/app/qa.py b/rag/app/qa.py index 0a54692..1b42f1a 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -16,7 +16,7 @@ from io import BytesIO from nltk import word_tokenize from openpyxl import load_workbook from rag.nlp import is_english, random_choices -from rag.nlp import huqie, stemmer +from rag.nlp import huqie from deepdoc.parser import ExcelParser @@ -73,12 +73,8 @@ def beAdoc(d, q, a, eng): aprefix = "Answer: " if eng else "回ç”:" d["content_with_weight"] = "\t".join( [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) - if eng: - d["content_ltks"] = " ".join([stemmer.stem(w) - for w in word_tokenize(q)]) - else: - d["content_ltks"] = huqie.qie(q) - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + d["content_ltks"] = huqie.qie(q) + d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) return d diff --git a/rag/app/table.py b/rag/app/table.py index 6f40da4..9512e9f 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -74,9 +74,9 @@ def trans_datatime(s): def trans_bool(s): if re.match(r"(true|yes|ćŻ|\*|âś“|âś”|â‘|âś…|âš)$", str(s).strip(), flags=re.IGNORECASE): - return ["yes", "ćŻ"] + return "yes" if re.match(r"(false|no|ĺ¦|⍻|Ă—)$", str(s).strip(), flags=re.IGNORECASE): - return ["no", "ĺ¦"] + return "no" def column_data_type(arr): @@ -92,7 +92,7 @@ def column_data_type(arr): counts["int"] += 1 elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")): counts["float"] += 1 - elif re.match(r"(true|false|yes|no|ćŻ|ĺ¦)$", str(a), flags=re.IGNORECASE): + elif re.match(r"(true|yes|ćŻ|\*|âś“|âś”|â‘|âś…|âš|false|no|ĺ¦|⍻|Ă—)$", str(a), flags=re.IGNORECASE): counts["bool"] += 1 elif trans_datatime(str(a)): counts["datetime"] += 1 diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 4afd376..92fbcdc 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -3,14 +3,9 @@ from collections import Counter from rag.utils import num_tokens_from_string from . 
import huqie -from nltk import word_tokenize import re import copy -from nltk.stem import PorterStemmer - -stemmer = PorterStemmer() - BULLET_PATTERN = [[ r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ0-9]+(ĺ†?编|é¨ĺ†)", @@ -77,13 +72,8 @@ def is_english(texts): def tokenize(d, t, eng): d["content_with_weight"] = t t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t) - if eng: - t = re.sub(r"([a-z])-([a-z])", r"\1\2", t) - d["content_ltks"] = " ".join([stemmer.stem(w) - for w in word_tokenize(t)]) - else: - d["content_ltks"] = huqie.qie(t) - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + d["content_ltks"] = huqie.qie(t) + d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) def tokenize_table(tbls, doc, eng, batch_size=10): @@ -94,8 +84,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10): continue if isinstance(rows, str): d = copy.deepcopy(doc) - r = re.sub(r"<[^<>]{,12}>", "", rows) - tokenize(d, r, eng) + tokenize(d, rows, eng) d["content_with_weight"] = rows d["image"] = img add_positions(d, poss) diff --git a/rag/nlp/huqie.py b/rag/nlp/huqie.py index 2bdcaf9..548c2d2 100644 --- a/rag/nlp/huqie.py +++ b/rag/nlp/huqie.py @@ -8,7 +8,8 @@ import re import string import sys from hanziconv import HanziConv - +from nltk import word_tokenize +from nltk.stem import PorterStemmer, WordNetLemmatizer from api.utils.file_utils import get_project_base_directory @@ -45,6 +46,9 @@ class Huqie: self.trie_ = datrie.Trie(string.printable) self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie") + self.stemmer = PorterStemmer() + self.lemmatizer = WordNetLemmatizer() + self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?ă€ďĽ›â€â€™ďĽšâ€śâ€ťă€ă€‘~ďĽďżĄ%……ďĽďĽ‰â€”—-]+|[a-z\.-]+|[0-9,\.-]+)" try: self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie") @@ -239,6 +243,10 @@ class Huqie: def qie(self, line): line = self._strQ2B(line).lower() line = self._tradi2simp(line) + zh_num = len([1 for c in line if is_chinese(c)]) + if zh_num < len(line) * 0.2: + return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)]) + arr = re.split(self.SPLIT_CHAR, line) res = [] for L in arr: @@ -290,8 +298,12 @@ class Huqie: return self.merge_(res) def qieqie(self, tks): + tks = tks.split(" ") + zh_num = len([1 for c in tks if c and is_chinese(c[0])]) + if zh_num < len(tks) * 0.2:return " ".join(tks) + res = [] - for tk in tks.split(" "): + for tk in tks: if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk): res.append(tk) continue diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 608f18f..8359cef 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -4,8 +4,8 @@ import json import re import logging import copy -import math -from elasticsearch_dsl import Q, Search +from elasticsearch_dsl import Q + from rag.nlp import huqie, term_weight, synonym @@ -33,12 +33,14 @@ class EsQueryer: @staticmethod def rmWWW(txt): - txt = re.sub( - r"ćŻ*(什äąć ·çš„|哪家|那家|ĺ•Ąć ·|ĺ’‹ć ·äş†|什äąć—¶ĺ€™|何时|何地|何人|ćŻĺ¦|ćŻä¸ŤćŻ|多少|哪里|怎äą|ĺ“Şĺ„ż|怎äąć ·|如何|ĺ“Şäş›|ćŻĺ•Ą|ĺ•ĄćŻ|ĺ•Š|ĺ—|呢|ĺ§|ĺ’‹|什äą|有没有|ĺ‘€)ćŻ*", - "", - txt) - return re.sub( - r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*", "", txt, re.IGNORECASE) + patts = [ + (r"ćŻ*(什äąć ·çš„|哪家|那家|ĺ•Ąć ·|ĺ’‹ć ·äş†|什äąć—¶ĺ€™|何时|何地|何人|ćŻĺ¦|ćŻä¸ŤćŻ|多少|哪里|怎äą|ĺ“Şĺ„ż|怎äąć ·|如何|ĺ“Şäş›|ćŻĺ•Ą|ĺ•ĄćŻ|ĺ•Š|ĺ—|呢|ĺ§|ĺ’‹|什äą|有没有|ĺ‘€)ćŻ*", ""), + (r"(^| )(what|who|how|which|where|why)('re|'s)? 
", " "), + (r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down)", " ") + ] + for r, p in patts: + txt = re.sub(r, p, txt, flags=re.IGNORECASE) + return txt def question(self, txt, tbl="qa", min_match="60%"): txt = re.sub( @@ -50,7 +52,7 @@ class EsQueryer: txt = EsQueryer.rmWWW(txt) if not self.isChinese(txt): - tks = [t for t in txt.split(" ") if t.strip()] + tks = huqie.qie(txt).split(" ") q = tks for i in range(1, len(tks)): q.append("\"%s %s\"^2" % (tks[i - 1], tks[i])) @@ -58,9 +60,9 @@ class EsQueryer: q.append(txt) return Q("bool", must=Q("query_string", fields=self.flds, - type="best_fields", query=" OR ".join(q), + type="best_fields", query=" ".join(q), boost=1, minimum_should_match=min_match) - ), txt.split(" ") + ), tks def needQieqie(tk): if len(tk) < 4: @@ -160,8 +162,8 @@ class EsQueryer: s += v# * dtwt[k] q = 1e-9 for k, v in qtwt.items(): - q += v * v - d = 1e-9 - for k, v in dtwt.items(): - d += v * v - return s / q#math.sqrt(q) / math.sqrt(d) + q += v #* v + #d = 1e-9 + #for k, v in dtwt.items(): + # d += v * v + return s / q #math.sqrt(q) / math.sqrt(d) diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 6b99165..f9fbcf2 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -196,7 +196,24 @@ class Dealer: def insert_citations(self, answer, chunks, chunk_v, embd_mdl, tkweight=0.7, vtweight=0.3): assert len(chunks) == len(chunk_v) - pieces = re.split(r"([;。?!ďĽ\n]|[a-z][.?;!][ \n])", answer) + pieces = re.split(r"(```)", answer) + if len(pieces) >= 3: + i = 0 + pieces_ = [] + while i < len(pieces): + if pieces[i] == "```": + st = i + i += 1 + while i<len(pieces) and pieces[i] != "```": + i += 1 + if i < len(pieces): i += 1 + pieces_.append("".join(pieces[st: i])+"\n") + else: + pieces_.extend(re.split(r"([^\|][;。?!ďĽ\n]|[a-z][.?;!][ \n])", pieces[i])) + i += 1 + pieces = pieces_ + else: + pieces = re.split(r"([^\|][;。?!ďĽ\n]|[a-z][.?;!][ \n])", answer) for i in range(1, len(pieces)): if re.match(r"[a-z][.?;!][ \n]", pieces[i]): pieces[i - 1] += pieces[i][0] @@ -226,7 +243,7 @@ class Dealer: chunks_tks, tkweight, vtweight) mx = np.max(sim) * 0.99 - if mx < 0.66: + if mx < 0.7: continue cites[idx[i]] = list( set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4] @@ -249,6 +266,7 @@ class Dealer: def rerank(self, sres, query, tkweight=0.3, vtweight=0.7, cfield="content_ltks"): + _, keywords = self.qryr.question(query) ins_embd = [ Dealer.trans2floats( sres.field[i].get("q_%d_vec" % len(sres.query_vector), "\t".join(["0"] * len(sres.query_vector)))) for i in sres.ids] @@ -258,8 +276,7 @@ class Dealer: for i in sres.ids] sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector, ins_embd, - huqie.qie( - query).split(" "), + keywords, ins_tw, tkweight, vtweight) return sim, tksim, vtsim diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py index 618e2ea..665ab1e 100644 --- a/rag/svr/task_broker.py +++ b/rag/svr/task_broker.py @@ -82,12 +82,14 @@ def dispatch(): tsks = [] if r["type"] == FileType.PDF.value: pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"])) + page_size = 5 + if r["parser_id"] == "paper": page_size = 12 for s,e in r["parser_config"].get("pages", [(0,100000)]): e = min(e, pages) - for p in range(s, e, 5): + for p in range(s, e, page_size): task = new_task() task["from_page"] = p - task["to_page"] = 
min(p + 5, e) + task["to_page"] = min(p + page_size, e) tsks.append(task) elif r["parser_id"] == "table": rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"])) -- GitLab
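A minimal sketch of the English fast path this patch adds to Huqie.qie (and mirrors in Huqie.qieqie): when fewer than roughly 20% of a line's characters are Chinese, the trie-based segmenter is bypassed and the text is lemmatized and Porter-stemmed with NLTK instead. The standalone helper below is illustrative only and not part of the patch; it assumes NLTK's punkt and wordnet corpora are installed, and the sample output is approximate.

from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def english_tokens(line: str) -> str:
    # Same order as the patched Huqie.qie: lowercase, lemmatize, then stem.
    return " ".join(stemmer.stem(lemmatizer.lemmatize(t))
                    for t in word_tokenize(line.lower()))

# english_tokens("Transformers are changing document parsing")
# -> roughly "transform are chang document pars"

Because rag/nlp/__init__.py and rag/app/qa.py now delegate to huqie.qie unconditionally, this single branch replaces the per-caller stemmer/word_tokenize handling that the patch removes.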
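Similarly, insert_citations in rag/nlp/search.py now splits the answer on ``` fences before sentence-splitting, so a fenced code block is kept as one citation unit instead of being cut at every period. A simplified sketch of that grouping logic follows; the helper name split_for_citation is hypothetical, and the sentence-boundary regex is reduced (the patch's pattern also covers full-width CJK punctuation and refuses to split when the preceding character is '|', keeping table rows intact).

import re

SENT = r"([;.?!\n])"  # simplified; the patched pattern is wider

def split_for_citation(answer: str):
    parts = re.split(r"(```)", answer)
    if len(parts) < 3:               # no fenced block present
        return re.split(SENT, answer)
    pieces, i = [], 0
    while i < len(parts):
        if parts[i] == "```":
            start = i
            i += 1
            while i < len(parts) and parts[i] != "```":
                i += 1
            if i < len(parts):
                i += 1               # include the closing fence
            pieces.append("".join(parts[start:i]) + "\n")
        else:
            pieces.extend(re.split(SENT, parts[i]))
            i += 1
    return pieces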