diff --git a/rag/app/__init__.py b/rag/app/__init__.py index 1ef52a4068b6985341ac7e9e9f22f61e9bfbe22f..6d390cab1171b5a267f1a61397edcb69edaa5560 100644 --- a/rag/app/__init__.py +++ b/rag/app/__init__.py @@ -1,5 +1,9 @@ import re +from nltk import word_tokenize + +from rag.nlp import stemmer, huqie + def callback__(progress, msg, func): if not func :return @@ -46,3 +50,21 @@ def bullets_category(sections): res = i maxium = h return res + +def is_english(texts): + eng = 0 + for t in texts: + if re.match(r"[a-zA-Z]", t.strip()): + eng += 1 + if eng / len(texts) > 0.8: + return True + return False + +def tokenize(d, t, eng): + d["content_with_weight"] = t + if eng: + t = re.sub(r"([a-z])-([a-z])", r"\1\2", t) + d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)]) + else: + d["content_ltks"] = huqie.qie(t) + d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) \ No newline at end of file diff --git a/rag/app/laws.py b/rag/app/laws.py index dfb70d5c3e6c7d14b3040a72cb589a1c8e082876..465213e56986a7a16dc7efc8dfced50d1ec1063c 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -3,12 +3,13 @@ import re from io import BytesIO from docx import Document import numpy as np -from rag.app import callback__, bullets_category, BULLET_PATTERN +from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize from rag.nlp import huqie +from rag.parser.docx_parser import HuDocxParser from rag.parser.pdf_parser import HuParser -class Docx(object): +class Docx(HuDocxParser): def __init__(self): pass @@ -42,14 +43,7 @@ class Pdf(HuParser): print("paddle layouts:", timer()-start) bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3) # is it English - eng = 0 - for b in bxs: - if re.match(r"[a-zA-Z]", b["text"].strip()): - eng += 1 - if eng / len(bxs) > 0.8: - eng = True - else: - eng = False + eng = is_english([b["text"] for b in bxs]) # Merge vertically i = 0 while i + 1 < len(bxs): @@ -59,7 +53,7 @@ class Pdf(HuParser): bxs.pop(i) continue concatting_feats = [ - b["text"].strip()[-1] in ",;:'\",ă€â€â€śďĽ›ďĽš", + b["text"].strip()[-1] in ",;:'\",ă€â€â€śďĽ›ďĽš-", len(b["text"].strip())>1 and b["text"].strip()[-2] in ",;:'\",â€â€śă€ďĽ›ďĽš", b["text"].strip()[0] in "。;?ďĽ?”)),,ă€ďĽš", ] @@ -118,14 +112,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): sections = [l for l in sections if l] # is it English - eng = 0 - for sec in sections: - if re.match(r"[a-zA-Z]", sec.strip()): - eng += 1 - if eng / len(sections) > 0.8: - eng = True - else: - eng = False + eng = is_english(sections) # Remove 'Contents' part i = 0 while i < len(sections): @@ -181,8 +168,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): if pdf_parser: d["image"] = pdf_parser.crop(ck) ck = pdf_parser.remove_tag(ck) - d["content_ltks"] = huqie.qie(ck) - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + tokenize(d, ck, eng) res.append(d) return res diff --git a/rag/app/manual.py b/rag/app/manual.py new file mode 100644 index 0000000000000000000000000000000000000000..420b6788ae64501bc8585057bc7cf04e37f80cac --- /dev/null +++ b/rag/app/manual.py @@ -0,0 +1,140 @@ +import copy +import re +from collections import Counter +from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize +from rag.nlp import huqie, stemmer +from rag.parser.docx_parser import HuDocxParser +from rag.parser.pdf_parser import HuParser +from nltk.tokenize import word_tokenize +import numpy as np +from rag.utils import 
num_tokens_from_string + + +class Pdf(HuParser): + def __call__(self, filename, binary=None, from_page=0, + to_page=100000, zoomin=3, callback=None): + self.__images__( + filename if not binary else binary, + zoomin, + from_page, + to_page) + callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4, + "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback) + + from timeit import default_timer as timer + start = timer() + self._layouts_paddle(zoomin) + callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4, + "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback) + print("paddle layouts:", timer() - start) + self._table_transformer_job(zoomin) + callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4, + "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback) + self._text_merge() + column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) + self._concat_downward(concat_between_pages=False) + self._filter_forpages() + callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4, + "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback) + tbls = self._extract_table_figure(True, zoomin, False) + + # clean mess + for b in self.boxes: + b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip()) + + # merge chunks with the same bullets + i = 0 + while i + 1 < len(self.boxes): + b = self.boxes[i] + b_ = self.boxes[i + 1] + if b["text"].strip()[0] != b_["text"].strip()[0] \ + or b["page_number"]!=b_["page_number"] \ + or b["top"] > b_["bottom"]: + i += 1 + continue + b_["text"] = b["text"] + "\n" + b_["text"] + b_["x0"] = min(b["x0"], b_["x0"]) + b_["x1"] = max(b["x1"], b_["x1"]) + b_["top"] = b["top"] + self.boxes.pop(i) + # merge title with decent chunk + i = 0 + while i + 1 < len(self.boxes): + b = self.boxes[i] + if b.get("layoutno","").find("title") < 0: + i += 1 + continue + b_ = self.boxes[i + 1] + b_["text"] = b["text"] + "\n" + b_["text"] + b_["x0"] = min(b["x0"], b_["x0"]) + b_["x1"] = max(b["x1"], b_["x1"]) + b_["top"] = b["top"] + self.boxes.pop(i) + + for b in self.boxes: print(b["text"], b.get("layoutno")) + + print(tbls) + return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls + + +def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): + pdf_parser = None + paper = {} + + if re.search(r"\.pdf$", filename, re.IGNORECASE): + pdf_parser = Pdf() + cks, tbls = pdf_parser(filename if not binary else binary, + from_page=from_page, to_page=to_page, callback=callback) + doc = { + "docnm_kwd": filename + } + doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) + doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + # is it English + eng = pdf_parser.is_english + + res = [] + # add tables + for img, rows in tbls: + bs = 10 + de = ";" if eng else ";" + for i in range(0, len(rows), bs): + d = copy.deepcopy(doc) + r = de.join(rows[i:i + bs]) + r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r) + tokenize(d, r, eng) + d["image"] = img + res.append(d) + + i = 0 + chunk = [] + tk_cnt = 0 + def add_chunk(): + nonlocal chunk, res, doc, pdf_parser, tk_cnt + d = copy.deepcopy(doc) + ck = "\n".join(chunk) + tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english) + d["image"] = pdf_parser.crop(ck) + res.append(d) + chunk = [] + tk_cnt = 0 + + while i < len(cks): + if tk_cnt > 128: add_chunk() + 
txt = cks[i] + txt_ = pdf_parser.remove_tag(txt) + i += 1 + cnt = num_tokens_from_string(txt_) + chunk.append(txt) + tk_cnt += cnt + if chunk: add_chunk() + for i, d in enumerate(res): + print(d) + # d["image"].save(f"./logs/{i}.jpg") + return res + + +if __name__ == "__main__": + import sys + + chunk(sys.argv[1]) diff --git a/rag/app/paper.py b/rag/app/paper.py new file mode 100644 index 0000000000000000000000000000000000000000..b9c4aed8e0efb3a37f974193ddeb8ccd0a256566 --- /dev/null +++ b/rag/app/paper.py @@ -0,0 +1,240 @@ +import copy +import re +from collections import Counter +from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize +from rag.nlp import huqie, stemmer +from rag.parser.docx_parser import HuDocxParser +from rag.parser.pdf_parser import HuParser +from nltk.tokenize import word_tokenize +import numpy as np +from rag.utils import num_tokens_from_string + + +class Pdf(HuParser): + def __call__(self, filename, binary=None, from_page=0, + to_page=100000, zoomin=3, callback=None): + self.__images__( + filename if not binary else binary, + zoomin, + from_page, + to_page) + callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4, + "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback) + + from timeit import default_timer as timer + start = timer() + self._layouts_paddle(zoomin) + callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4, + "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback) + print("paddle layouts:", timer() - start) + self._table_transformer_job(zoomin) + callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4, + "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback) + self._text_merge() + column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) + self._concat_downward(concat_between_pages=False) + self._filter_forpages() + callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4, + "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback) + tbls = self._extract_table_figure(True, zoomin, False) + + # clean mess + if column_width < self.page_images[0].size[0] / zoomin / 2: + print("two_column...................", column_width, + self.page_images[0].size[0] / zoomin / 2) + self.boxes = self.sort_X_by_page(self.boxes, column_width / 2) + for b in self.boxes: + b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip()) + freq = Counter([b["text"] for b in self.boxes]) + garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6]) + i = 0 + while i < len(self.boxes): + if self.boxes[i]["text"] in garbage \ + or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \ + or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]): + self.boxes.pop(i) + elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", + '1'): + # merge within same layouts + self.boxes[i + 1]["top"] = self.boxes[i]["top"] + self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"]) + self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"]) + self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"] + self.boxes.pop(i) + else: + i += 1 + + def _begin(txt): + return re.match( + "[0-9. 
一ă€i]*(introduction|abstract|ć‘č¦|引言|keywords|key words|关键词|background|čŚć™Ż|目录|前言|contents)", + txt.lower().strip()) + + # get title and authors + title = "" + authors = [] + i = 0 + while i < min(32, len(self.boxes)): + b = self.boxes[i] + i += 1 + if b.get("layoutno", "").find("title") >= 0: + title = b["text"] + if _begin(title): + title = "" + break + for j in range(3): + if _begin(self.boxes[i + j]["text"]): break + authors.append(self.boxes[i + j]["text"]) + break + break + # get abstract + abstr = "" + i = 0 + while i + 1 < min(32, len(self.boxes)): + b = self.boxes[i] + i += 1 + txt = b["text"].lower().strip() + if re.match("(abstract|ć‘č¦)", txt): + if len(txt.split(" ")) > 32 or len(txt) > 64: + abstr = txt + self._line_tag(b, zoomin) + i += 1 + break + txt = self.boxes[i + 1]["text"].lower().strip() + if len(txt.split(" ")) > 32 or len(txt) > 64: + abstr = txt + self._line_tag(self.boxes[i + 1], zoomin) + i += 1 + break + if not abstr: i = 0 + + for b in self.boxes: print(b["text"], b.get("layoutno")) + print(tbls) + + return { + "title": title if title else filename, + "authors": " ".join(authors), + "abstract": abstr, + "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if + re.match(r"(text|title)", b.get("layoutno", "text"))], + "tables": tbls + } + + +def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): + pdf_parser = None + paper = {} + + if re.search(r"\.pdf$", filename, re.IGNORECASE): + pdf_parser = Pdf() + paper = pdf_parser(filename if not binary else binary, + from_page=from_page, to_page=to_page, callback=callback) + doc = { + "docnm_kwd": paper["title"] if paper["title"] else filename, + "authors_tks": paper["authors"] + } + doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) + doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"]) + # is it English + eng = pdf_parser.is_english + print("It's English.....", eng) + + res = [] + # add tables + for img, rows in paper["tables"]: + bs = 10 + de = ";" if eng else ";" + for i in range(0, len(rows), bs): + d = copy.deepcopy(doc) + r = de.join(rows[i:i + bs]) + r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r) + tokenize(d, r) + d["image"] = img + res.append(d) + + if paper["abstract"]: + d = copy.deepcopy(doc) + txt = pdf_parser.remove_tag(paper["abstract"]) + d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"] + d["important_tks"] = " ".join(d["important_kwd"]) + d["image"] = pdf_parser.crop(paper["abstract"]) + tokenize(d, txt, eng) + res.append(d) + + readed = [0] * len(paper["lines"]) + # find colon firstly + i = 0 + while i + 1 < len(paper["lines"]): + txt = pdf_parser.remove_tag(paper["lines"][i][0]) + j = i + if txt.strip("\n").strip()[-1] not in "::": + i += 1 + continue + i += 1 + while i < len(paper["lines"]) and not paper["lines"][i][0]: + i += 1 + if i >= len(paper["lines"]): break + proj = [paper["lines"][i][0].strip()] + i += 1 + while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]: + proj.append(paper["lines"][i]) + i += 1 + for k in range(j, i): readed[k] = True + txt = txt[::-1] + if eng: + r = re.search(r"(.*?) ([\.;?!]|$)", txt) + txt = r.group(1)[::-1] if r else txt[::-1] + else: + r = re.search(r"(.*?) 
([。?;ďĽ]|$)", txt) + txt = r.group(1)[::-1] if r else txt[::-1] + for p in proj: + d = copy.deepcopy(doc) + txt += "\n" + pdf_parser.remove_tag(p) + d["image"] = pdf_parser.crop(p) + tokenize(d, txt) + res.append(d) + + i = 0 + chunk = [] + tk_cnt = 0 + def add_chunk(): + nonlocal chunk, res, doc, pdf_parser, tk_cnt + d = copy.deepcopy(doc) + ck = "\n".join(chunk) + tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english) + d["image"] = pdf_parser.crop(ck) + res.append(d) + chunk = [] + tk_cnt = 0 + + while i < len(paper["lines"]): + if tk_cnt > 128: + add_chunk() + if readed[i]: + i += 1 + continue + readed[i] = True + txt, layouts = paper["lines"][i] + txt_ = pdf_parser.remove_tag(txt) + i += 1 + cnt = num_tokens_from_string(txt_) + if any([ + layouts.find("title") >= 0 and chunk, + cnt + tk_cnt > 128 and tk_cnt > 32, + ]): + add_chunk() + chunk = [txt] + tk_cnt = cnt + else: + chunk.append(txt) + tk_cnt += cnt + + if chunk: add_chunk() + for i, d in enumerate(res): + print(d) + # d["image"].save(f"./logs/{i}.jpg") + return res + + +if __name__ == "__main__": + import sys + + chunk(sys.argv[1]) diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 2713093240012cee8b3bfbb80b2f9c5c7cc9b6a0..303af34fa1dd5d7916c858517ad0d18ef7b25059 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -3,7 +3,7 @@ import re from io import BytesIO from pptx import Presentation -from rag.app import callback__ +from rag.app import callback__, tokenize, is_english from rag.nlp import huqie from rag.parser.pdf_parser import HuParser @@ -57,7 +57,7 @@ class Ppt(object): assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts)) callback__((min(to_page, self.total_page) - from_page) / self.total_page, "Page {}~{}: Image extraction finished".format(from_page, min(to_page, self.total_page)), callback) - + self.is_english = is_english(txts) return [(txts[i], imgs[i]) for i in range(len(txts))] @@ -103,19 +103,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) res = [] if re.search(r"\.pptx?$", filename, re.IGNORECASE): - for txt,img in Ppt()(filename if not binary else binary, from_page, to_page, callback): + ppt_parser = Ppt() + for txt,img in ppt_parser(filename if not binary else binary, from_page, to_page, callback): d = copy.deepcopy(doc) - d["content_ltks"] = huqie.qie(txt) - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) d["image"] = img + tokenize(d, txt, ppt_parser.is_english) res.append(d) return res if re.search(r"\.pdf$", filename, re.IGNORECASE): - for txt,img in Pdf()(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback): + pdf_parser = Pdf() + for txt,img in pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback): d = copy.deepcopy(doc) - d["content_ltks"] = huqie.qie(txt) - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) d["image"] = img + tokenize(d, txt, pdf_parser.is_english) res.append(d) return res callback__(-1, "This kind of presentation document did not support yet!", callback) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 316f5265063f02f82113e64a46280c3de8ded747..37f5d0e210c10828bb2ea565e754372da76e9718 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -1,4 +1,7 @@ from . 
import search from rag.utils import ELASTICSEARCH -retrievaler = search.Dealer(ELASTICSEARCH) \ No newline at end of file +retrievaler = search.Dealer(ELASTICSEARCH) + +from nltk.stem import PorterStemmer +stemmer = PorterStemmer() diff --git a/rag/parser/pdf_parser.py b/rag/parser/pdf_parser.py index 519dbb5ef3405aaab254132d9991be9ab3e8743e..53cfbdb4e150f841ef2e7a8a079c4b3da2c06f5b 100644 --- a/rag/parser/pdf_parser.py +++ b/rag/parser/pdf_parser.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +import random + import fitz import xgboost as xgb from io import BytesIO @@ -14,6 +16,7 @@ from copy import deepcopy from rag.cv.table_recognize import TableTransformer from rag.cv.ppdetection import PPDet from huggingface_hub import hf_hub_download + logging.getLogger("pdfminer").setLevel(logging.WARNING) @@ -22,8 +25,8 @@ class HuParser: from paddleocr import PaddleOCR logging.getLogger("ppocr").setLevel(logging.ERROR) self.ocr = PaddleOCR(use_angle_cls=False, lang="ch") - self.layouter = PPDet() - self.tbl_det = TableTransformer() + self.layouter = PPDet("/data/newpeak/medical-gpt/res/ppdet") + self.tbl_det = PPDet("/data/newpeak/medical-gpt/res/ppdet.tbl") self.updown_cnt_mdl = xgb.Booster() if torch.cuda.is_available(): @@ -55,7 +58,7 @@ class HuParser: def _y_dis( self, a, b): return ( - b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2 + b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2 def _match_proj(self, b): proj_patt = [ @@ -78,9 +81,9 @@ class HuParser: tks_down = huqie.qie(down["text"][:LEN]).split(" ") tks_up = huqie.qie(up["text"][-LEN:]).split(" ") tks_all = up["text"][-LEN:].strip() \ - + (" " if re.match(r"[a-zA-Z0-9]+", - up["text"][-1] + down["text"][0]) else "") \ - + down["text"][:LEN].strip() + + (" " if re.match(r"[a-zA-Z0-9]+", + up["text"][-1] + down["text"][0]) else "") \ + + down["text"][:LEN].strip() tks_all = huqie.qie(tks_all).split(" ") fea = [ up.get("R", -1) == down.get("R", -1), @@ -102,7 +105,7 @@ class HuParser: True if re.search(r"[,,][^。.]+$", up["text"]) else False, True if re.search(r"[,,][^。.]+$", up["text"]) else False, True if re.search(r"[\(ďĽ][^\))]+$", up["text"]) - and re.search(r"[\))]", down["text"]) else False, + and re.search(r"[\))]", down["text"]) else False, self._match_proj(down), True if re.match(r"[A-Z]", down["text"]) else False, True if re.match(r"[A-Z]", up["text"][-1]) else False, @@ -141,6 +144,21 @@ class HuParser: arr[j + 1] = deepcopy(tmp) return arr + @staticmethod + def sort_X_by_page(arr, threashold): + # sort using y1 first and then x1 + arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"])) + for i in range(len(arr) - 1): + for j in range(i, -1, -1): + # restore the order using th + if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \ + and arr[j + 1]["top"] < arr[j]["top"]\ + and arr[j + 1]["page_number"] == arr[j]["page_number"]: + tmp = arr[j] + arr[j] = arr[j + 1] + arr[j + 1] = tmp + return arr + @staticmethod def sort_R_firstly(arr, thr=0): # sort using y1 first and then x1 @@ -219,7 +237,7 @@ class HuParser: assert tp_ <= btm_, "Fuckedup! 
T:{},B:{},X0:{},X1:{} => {}".format( tp, btm, x0, x1, b) ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \ - x0 != 0 and btm - tp != 0 else 0 + x0 != 0 and btm - tp != 0 else 0 if ov > 0 and ratio: ov /= (x1 - x0) * (btm - tp) return ov @@ -326,7 +344,7 @@ class HuParser: return layouts def __table_paddle(self, images): - tbls = self.tbl_det([img for img in images], threshold=0.5) + tbls = self.tbl_det([np.array(img) for img in images], thr=0.5) res = [] # align left&right for rows, align top&bottom for columns for tbl in tbls: @@ -384,7 +402,7 @@ class HuParser: continue for tb in tbls: # for table left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \ - tb["x1"] + MARGIN, tb["bottom"] + MARGIN + tb["x1"] + MARGIN, tb["bottom"] + MARGIN left *= ZM top *= ZM right *= ZM @@ -482,10 +500,13 @@ class HuParser: continue ch = c["bottom"] - c["top"] bh = bxs[ii]["bottom"] - bxs[ii]["top"] - if abs(ch - bh) / max(ch, bh) >= 0.7: + if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ': self.lefted_chars.append(c) continue - bxs[ii]["text"] += c["text"] + if c["text"] == " " and bxs[ii]["text"]: + if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " " + else: + bxs[ii]["text"] += c["text"] for b in bxs: if not b["text"]: @@ -629,7 +650,7 @@ class HuParser: i += 1 self.boxes = bxs - def _concat_downward(self): + def _concat_downward(self, concat_between_pages=True): # count boxes in the same row as a feature for i in range(len(self.boxes)): mh = self.mean_height[self.boxes[i]["page_number"] - 1] @@ -665,6 +686,8 @@ class HuParser: if not smpg and ydis > mh * 16: break down = boxes[i] + if not concat_between_pages and down["page_number"] > up["page_number"]: + break if up.get("R", "") != down.get( "R", "") and up["text"][-1] != ",": @@ -735,43 +758,29 @@ class HuParser: self.boxes = self.sort_Y_firstly(boxes, 0) - def __filter_forpages(self): + def _filter_forpages(self): if not self.boxes: return - to = min(7, len(self.page_images) // 5) - pg_hits = [0 for _ in range(to)] - - def possible(c): - if c.get("layout_type", "") == "reference": - return True - if c["bottom"] - c["top"] >= 2 * \ - self.mean_height[c["page_number"] - 1]: - return False - if c["text"].find("....") >= 0 \ - or (c["x1"] - c["x0"] > 250 and re.search(r"[0-9]+$", - c["text"].strip())): - return True - return self.is_caption(c) and re.search( - r"[0-9]+$", c["text"].strip()) - - for c in self.boxes: - if c["page_number"] >= to: - break - if possible(c): - pg_hits[c["page_number"] - 1] += 1 - - st, ed = -1, -1 - for i in range(len(self.boxes)): - c = self.boxes[i] - if c["page_number"] >= to: + i = 0 + while i < len(self.boxes): + if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())): + i += 1 + continue + eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip()) + self.boxes.pop(i) + if i >= len(self.boxes): break + prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2]) + while not prefix: + self.boxes.pop(i) + if i >= len(self.boxes): break + prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2]) + self.boxes.pop(i) + if i >= len(self.boxes) or not prefix: break + for j in range(i, min(i + 128, len(self.boxes))): + if not re.match(prefix, self.boxes[j]["text"]): + continue + for k in range(i, j): self.boxes.pop(i) break - if pg_hits[c["page_number"] - 1] >= 3 and possible(c): - if st < 0: - st = i - else: - 
ed = i - for _ in range(st, ed + 1): - self.boxes.pop(st) def _blockType(self, b): patt = [ @@ -918,7 +927,7 @@ class HuParser: lst_r = rows[-1] if lst_r[-1].get("R", "") != b.get("R", "") \ or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2") - ): # new row + ): # new row btm = b["bottom"] b["rn"] += 1 rows.append([b]) @@ -968,9 +977,9 @@ class HuParser: j += 1 continue f = (j > 0 and tbl[ii][j - 1] and tbl[ii] - [j - 1][0].get("text")) or j == 0 + [j - 1][0].get("text")) or j == 0 ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii] - [j + 1][0].get("text")) or j + 1 >= len(tbl[ii]) + [j + 1][0].get("text")) or j + 1 >= len(tbl[ii]) if f and ff: j += 1 continue @@ -1031,9 +1040,9 @@ class HuParser: i += 1 continue f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1] - [jj][0].get("text")) or i == 0 + [jj][0].get("text")) or i == 0 ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1] - [jj][0].get("text")) or i + 1 >= len(tbl) + [jj][0].get("text")) or i + 1 >= len(tbl) if f and ff: i += 1 continue @@ -1153,6 +1162,7 @@ class HuParser: headers = {} hdrset = set() lst_hdr = [] + de = "çš„" if not self.is_english else " for " for r in sorted(list(hdr_rowno)): headers[r] = ["" for _ in range(clmno)] for i in range(clmno): @@ -1184,12 +1194,12 @@ class HuParser: if headers[j][k].find(headers[j - 1][k]) >= 0: continue if len(headers[j][k]) > len(headers[j - 1][k]): - headers[j][k] += ("çš„" if headers[j][k] + headers[j][k] += (de if headers[j][k] else "") + headers[j - 1][k] else: headers[j][k] = headers[j - 1][k] \ - + ("çš„" if headers[j - 1][k] else "") \ - + headers[j][k] + + (de if headers[j - 1][k] else "") \ + + headers[j][k] logging.debug( f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}") @@ -1241,7 +1251,11 @@ class HuParser: row_txt.append("; ".join(rtxt)) if cap: - row_txt = [t + f"\t——来自“{cap}”" for t in row_txt] + if self.is_english: + from_ = " in " + else: + from_ = "来自" + row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt] return row_txt @staticmethod @@ -1254,7 +1268,7 @@ class HuParser: return True return False - def __extract_table_figure(self, need_image, ZM, return_html): + def _extract_table_figure(self, need_image, ZM, return_html): tables = {} figures = {} # extract figure and table boxes @@ -1266,7 +1280,7 @@ class HuParser: i += 1 continue lout_no = str(self.boxes[i]["page_number"]) + \ - "-" + str(self.boxes[i]["layoutno"]) + "-" + str(self.boxes[i]["layoutno"]) if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title", "figure caption", "reference"]: nomerge_lout_no.append(lst_lout_no) @@ -1574,8 +1588,14 @@ class HuParser: self.page_chars.append([]) logging.info("Images converted.") + self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=100))) for i in range(len(self.page_chars))] + if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2: + self.is_english = True + else: + self.is_english = False + for i, img in enumerate(self.page_images): - chars = self.page_chars[i] + chars = self.page_chars[i] if not self.is_english else [] self.mean_height.append( np.median(sorted([c["height"] for c in chars])) if chars else 0 ) @@ -1583,6 +1603,14 @@ class HuParser: np.median(sorted([c["width"] for c in chars])) if chars else 8 ) self.page_cum_height.append(img.size[1] / zoomin) + j = 0 + while j + 1 < len(chars): + if chars[j]["text"] and chars[j + 1]["text"] \ + and 
re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \ + and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"], + chars[j]["width"]) / 2: + chars[j]["text"] += " " + j += 1 # if i > 0: # if not chars: # self.page_cum_height.append(img.size[1] / zoomin) @@ -1591,8 +1619,13 @@ class HuParser: # np.max([c["bottom"] for c in chars])) self.__ocr_paddle(i + 1, img, chars, zoomin) + if not self.is_english and not all([c for c in self.page_chars]) and self.boxes: + self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(self.boxes, k=30)])) + + logging.info("Is it English:", self.is_english) + self.page_cum_height = np.cumsum(self.page_cum_height) - assert len(self.page_cum_height) == len(self.page_images)+1 + assert len(self.page_cum_height) == len(self.page_images) + 1 def __call__(self, fnm, need_image=True, zoomin=3, return_html=False): self.__images__(fnm, zoomin) @@ -1600,8 +1633,8 @@ class HuParser: self._table_transformer_job(zoomin) self._text_merge() self._concat_downward() - self.__filter_forpages() - tbls = self.__extract_table_figure(need_image, zoomin, return_html) + self._filter_forpages() + tbls = self._extract_table_figure(need_image, zoomin, return_html) return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls def remove_tag(self, txt): @@ -1622,7 +1655,7 @@ class HuParser: self.page_images[pns[0]].crop((left * ZM, top * ZM, right * ZM, min( - bottom, self.page_images[pns[0]].size[1]) + bottom, self.page_images[pns[0]].size[1]) )) ) bottom -= self.page_images[pns[0]].size[1]
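
A minimal usage sketch (not part of the patch) for the shared helpers this change factors into rag/app/__init__.py. It assumes rag.nlp imports cleanly (huqie plus the PorterStemmer instance the patch adds to rag/nlp/__init__.py) and that NLTK's "punkt" tokenizer data is installed; the sample strings and file name are made up for illustration:

    from rag.app import is_english, tokenize

    texts = ["Section 1. Scope", "This regulation applies to imported goods."]
    eng = is_english(texts)           # True when >80% of the lines start with an ASCII letter

    d = {"docnm_kwd": "example.pdf"}  # chunk dict, as built by the chunk() entry points
    tokenize(d, "customs clearance provisions", eng)
    print(d["content_ltks"])          # stemmed English tokens (huqie tokens for Chinese input)
    print(d["content_sm_ltks"])       # fine-grained tokens via huqie.qieqie

This is the refactor the patch applies in laws.py, manual.py, paper.py and presentation.py: the per-parser language detection and content_ltks/content_sm_ltks assignments are replaced by calls to these two helpers.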