From 51482f3e2af252daa791450427f7cc65ef82bd5a Mon Sep 17 00:00:00 2001 From: KevinHuSh <kevinhu.sh@gmail.com> Date: Fri, 2 Feb 2024 19:21:37 +0800 Subject: [PATCH] Some document API refined. (#53) Add naive chunking method to RAG --- api/apps/document_app.py | 18 +-- api/db/services/document_service.py | 3 +- rag/app/__init__.py | 91 ------------ rag/app/book.py | 74 ++-------- rag/app/laws.py | 110 ++++---------- rag/app/manual.py | 9 +- rag/app/naive.py | 79 ++++++++++ rag/app/paper.py | 9 +- rag/app/presentation.py | 8 +- rag/app/qa.py | 8 +- rag/parser/__init__.py | 217 ++++++++++++++++++++++++++++ rag/parser/docx_parser.py | 15 +- rag/parser/pdf_parser.py | 74 +++++++++- 13 files changed, 447 insertions(+), 268 deletions(-) create mode 100644 rag/app/naive.py diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 207ae84..e43bfc7 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -133,9 +133,9 @@ def list(): orderby = request.args.get("orderby", "create_time") desc = request.args.get("desc", True) try: - docs = DocumentService.get_by_kb_id( + docs, tol = DocumentService.get_by_kb_id( kb_id, page_number, items_per_page, orderby, desc, keywords) - return get_json_result(data=docs) + return get_json_result(data={"total":tol, "docs": docs}) except Exception as e: return server_error_response(e) @@ -228,20 +228,18 @@ def run(): @manager.route('/rename', methods=['POST']) @login_required -@validate_request("doc_id", "name", "old_name") +@validate_request("doc_id", "name") def rename(): req = request.json - if pathlib.Path(req["name"].lower()).suffix != pathlib.Path( - req["old_name"].lower()).suffix: - get_json_result( - data=False, - retmsg="The extension of file can't be changed", - retcode=RetCode.ARGUMENT_ERROR) - try: e, doc = DocumentService.get_by_id(req["doc_id"]) if not e: return get_data_error_result(retmsg="Document not found!") + if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix: + return get_json_result( + data=False, + retmsg="The extension of file can't be changed", + retcode=RetCode.ARGUMENT_ERROR) if DocumentService.query(name=req["name"], kb_id=doc.kb_id): return get_data_error_result( retmsg="Duplicated document name in the same knowledgebase.") diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 50b54ab..b17ee89 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -36,6 +36,7 @@ class DocumentService(CommonService): cls.model.name.like(f"%%{keywords}%%")) else: docs = cls.model.select().where(cls.model.kb_id == kb_id) + count = docs.count() if desc: docs = docs.order_by(cls.model.getter_by(orderby).desc()) else: @@ -43,7 +44,7 @@ class DocumentService(CommonService): docs = docs.paginate(page_number, items_per_page) - return list(docs.dicts()) + return list(docs.dicts()), count @classmethod @DB.connection_context() diff --git a/rag/app/__init__.py b/rag/app/__init__.py index 06787b8..e69de29 100644 --- a/rag/app/__init__.py +++ b/rag/app/__init__.py @@ -1,91 +0,0 @@ -import re - -from nltk import word_tokenize - -from rag.nlp import stemmer, huqie - -BULLET_PATTERN = [[ - r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+(编|é¨ĺ†)", - r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+ç« ", - r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+节", - r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+条", - r"[\(ďĽ][零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+[\))]", - ], [ - r"[0-9]{,3}[\. 
ă€]", - r"[0-9]{,2}\.[0-9]{,2}", - r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}", - r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}", - ], [ - r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+ç« ", - r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+节", - r"[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+[ ă€]", - r"[\(ďĽ][零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+[\))]", - r"[\(ďĽ][0-9]{,2}[\))]", - ] ,[ - r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)", - r"Chapter (I+V?|VI*|XI|IX|X)", - r"Section [0-9]+", - r"Article [0-9]+" - ] - ] - - -def bullets_category(sections): - global BULLET_PATTERN - hits = [0] * len(BULLET_PATTERN) - for i, pro in enumerate(BULLET_PATTERN): - for sec in sections: - for p in pro: - if re.match(p, sec): - hits[i] += 1 - break - maxium = 0 - res = -1 - for i,h in enumerate(hits): - if h <= maxium:continue - res = i - maxium = h - return res - -def is_english(texts): - eng = 0 - for t in texts: - if re.match(r"[a-zA-Z]{2,}", t.strip()): - eng += 1 - if eng / len(texts) > 0.8: - return True - return False - -def tokenize(d, t, eng): - d["content_with_weight"] = t - if eng: - t = re.sub(r"([a-z])-([a-z])", r"\1\2", t) - d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)]) - else: - d["content_ltks"] = huqie.qie(t) - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) - - -def remove_contents_table(sections, eng=False): - i = 0 - while i < len(sections): - def get(i): - nonlocal sections - return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip() - if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)): - i += 1 - continue - sections.pop(i) - if i >= len(sections): break - prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2]) - while not prefix: - sections.pop(i) - if i >= len(sections): break - prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2]) - sections.pop(i) - if i >= len(sections) or not prefix: break - for j in range(i, min(i+128, len(sections))): - if not re.match(prefix, get(j)): - continue - for _ in range(i, j):sections.pop(i) - break \ No newline at end of file diff --git a/rag/app/book.py b/rag/app/book.py index 59948ef..a478f17 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -1,10 +1,9 @@ import copy import random import re -from io import BytesIO -from docx import Document import numpy as np -from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table +from rag.parser import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table, \ + hierarchical_merge, make_colon_as_title, naive_merge from rag.nlp import huqie from rag.parser.docx_parser import HuDocxParser from rag.parser.pdf_parser import HuParser @@ -28,7 +27,6 @@ class Pdf(HuParser): self._table_transformer_job(zoomin) callback(0.68, "Table analysis finished") self._text_merge() - column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) self._concat_downward(concat_between_pages=False) self._filter_forpages() self._merge_with_same_bullet() @@ -37,10 +35,10 @@ class Pdf(HuParser): callback(0.8, "Text extraction finished") - return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes] + return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls -def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): +def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): doc = { "docnm_kwd": filename, "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", 
"", filename)) @@ -52,8 +50,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): callback(0.1, "Start to parse.") doc_parser = HuDocxParser() # TODO: table of contents need to be removed - sections, tbls = doc_parser(binary if binary else filename) - remove_contents_table(sections, eng = is_english(random.choices([t for t,_ in sections], k=200))) + sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page) + remove_contents_table(sections, eng=is_english(random.choices([t for t,_ in sections], k=200))) callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): pdf_parser = Pdf() @@ -75,54 +73,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): callback(0.8, "Finish parsing.") else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)") - bull = bullets_category([b["text"] for b in random.choices([t for t,_ in sections], k=100)]) - projs = [len(BULLET_PATTERN[bull]) + 1] * len(sections) - levels = [[]] * len(BULLET_PATTERN[bull]) + 2 - for i, (txt, layout) in enumerate(sections): - for j, p in enumerate(BULLET_PATTERN[bull]): - if re.match(p, txt.strip()): - projs[i] = j - levels[j].append(i) - break - else: - if re.search(r"(title|head)", layout): - projs[i] = BULLET_PATTERN[bull] - levels[BULLET_PATTERN[bull]].append(i) - else: - levels[BULLET_PATTERN[bull] + 1].append(i) - sections = [t for t,_ in sections] - - def binary_search(arr, target): - if target > arr[-1]: return len(arr) - 1 - if target > arr[0]: return -1 - s, e = 0, len(arr) - while e - s > 1: - i = (e + s) // 2 - if target > arr[i]: - s = i - continue - elif target < arr[i]: - e = i - continue - else: - assert False - return s - - cks = [] - readed = [False] * len(sections) - levels = levels[::-1] - for i, arr in enumerate(levels): - for j in arr: - if readed[j]: continue - readed[j] = True - cks.append([j]) - if i + 1 == len(levels) - 1: continue - for ii in range(i + 1, len(levels)): - jj = binary_search(levels[ii], j) - if jj < 0: break - if jj > cks[-1][-1]: cks[-1].pop(-1) - cks[-1].append(levels[ii][jj]) + make_colon_as_title(sections) + bull = bullets_category([t for t in random.choices([t for t,_ in sections], k=100)]) + if bull >= 0: cks = hierarchical_merge(bull, sections, 3) + else: cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;ďĽďĽź")) + sections = [t for t, _ in sections] # is it English eng = is_english(random.choices(sections, k=218)) @@ -138,11 +94,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): tokenize(d, r, eng) d["image"] = img res.append(d) + print("TABLE", d["content_with_weight"]) # wrap up to es documents for ck in cks: - print("\n-".join(ck[::-1])) - ck = "\n".join(ck[::-1]) d = copy.deepcopy(doc) + ck = "\n".join(ck) if pdf_parser: d["image"] = pdf_parser.crop(ck) ck = pdf_parser.remove_tag(ck) @@ -153,4 +109,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): if __name__ == "__main__": import sys - chunk(sys.argv[1]) + def dummy(a, b): + pass + chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy) diff --git a/rag/app/laws.py b/rag/app/laws.py index c68d3b8..7e9a964 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -3,10 +3,12 @@ import re from io import BytesIO from docx import Document import numpy as np -from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize +from rag.parser import bullets_category, 
is_english, tokenize, remove_contents_table, hierarchical_merge, \ + make_colon_as_title from rag.nlp import huqie from rag.parser.docx_parser import HuDocxParser from rag.parser.pdf_parser import HuParser +from rag.settings import cron_logger class Docx(HuDocxParser): @@ -17,10 +19,20 @@ class Docx(HuDocxParser): line = re.sub(r"\u3000", " ", line).strip() return line - def __call__(self, filename, binary=None): + def __call__(self, filename, binary=None, from_page=0, to_page=100000): self.doc = Document( filename) if not binary else Document(BytesIO(binary)) - lines = [self.__clean(p.text) for p in self.doc.paragraphs] + pn = 0 + lines = [] + for p in self.doc.paragraphs: + if pn > to_page:break + if from_page <= pn < to_page and p.text.strip(): lines.append(self.__clean(p.text)) + for run in p.runs: + if 'lastRenderedPageBreak' in run._element.xml: + pn += 1 + continue + if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: + pn += 1 return [l for l in lines if l] @@ -38,49 +50,15 @@ class Pdf(HuParser): start = timer() self._layouts_paddle(zoomin) callback(0.77, "Layout analysis finished") - print("paddle layouts:", timer()-start) - bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3) - # is it English - eng = is_english([b["text"] for b in bxs]) - # Merge vertically - i = 0 - while i + 1 < len(bxs): - b = bxs[i] - b_ = bxs[i + 1] - if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]): - bxs.pop(i) - continue - concatting_feats = [ - b["text"].strip()[-1] in ",;:'\",ă€â€â€śďĽ›ďĽš-", - len(b["text"].strip())>1 and b["text"].strip()[-2] in ",;:'\",â€â€śă€ďĽ›ďĽš", - b["text"].strip()[0] in "。;?ďĽ?”)),,ă€ďĽš", - ] - # features for not concating - feats = [ - b.get("layoutno",0) != b.get("layoutno",0), - b["text"].strip()[-1] in "。?ďĽ?", - eng and b["text"].strip()[-1] in ".!?", - b["page_number"] == b_["page_number"] and b_["top"] - \ - b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5, - b["page_number"] < b_["page_number"] and abs( - b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4 - ] - if any(feats) and not any(concatting_feats): - i += 1 - continue - # merge up and down - b["bottom"] = b_["bottom"] - b["text"] += b_["text"] - b["x0"] = min(b["x0"], b_["x0"]) - b["x1"] = max(b["x1"], b_["x1"]) - bxs.pop(i + 1) + cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1))) + self._naive_vertical_merge() callback(0.8, "Text extraction finished") - return [b["text"] + self._line_tag(b, zoomin) for b in bxs] + return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes] -def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): +def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): doc = { "docnm_kwd": filename, "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) @@ -116,50 +94,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): # is it English eng = is_english(sections) # Remove 'Contents' part - i = 0 - while i < len(sections): - if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0], re.IGNORECASE)): - i += 1 - continue - sections.pop(i) - if i >= len(sections): break - prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2]) - while not prefix: - sections.pop(i) - if i >= len(sections): break - prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2]) - 
sections.pop(i) - if i >= len(sections) or not prefix: break - for j in range(i, min(i+128, len(sections))): - if not re.match(prefix, sections[j]): - continue - for _ in range(i, j):sections.pop(i) - break + remove_contents_table(sections, eng) + make_colon_as_title(sections) bull = bullets_category(sections) - projs = [len(BULLET_PATTERN[bull])] * len(sections) - for i, sec in enumerate(sections): - for j,p in enumerate(BULLET_PATTERN[bull]): - if re.match(p, sec.strip()): - projs[i] = j - break - readed = [0] * len(sections) - cks = [] - for pr in range(len(BULLET_PATTERN[bull])-1, 1, -1): - for i in range(len(sections)): - if readed[i] or projs[i] < pr: - continue - # find father and grand-father and grand...father - p = projs[i] - readed[i] = 1 - ck = [sections[i]] - for j in range(i-1, -1, -1): - if projs[j] >= p:continue - ck.append(sections[j]) - readed[j] = 1 - p = projs[j] - if p == 0: break - cks.append(ck[::-1]) + cks = hierarchical_merge(bull, sections, 3) + if not cks: callback(0.99, "No chunk parsed out.") res = [] # wrap up to es documents @@ -177,4 +117,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): if __name__ == "__main__": import sys - chunk(sys.argv[1]) + def dummy(a, b): + pass + chunk(sys.argv[1], callback=dummy) diff --git a/rag/app/manual.py b/rag/app/manual.py index 241fdd1..a35fdfd 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -1,6 +1,6 @@ import copy import re -from rag.app import tokenize +from rag.parser import tokenize from rag.nlp import huqie from rag.parser.pdf_parser import HuParser from rag.utils import num_tokens_from_string @@ -57,7 +57,7 @@ class Pdf(HuParser): return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls -def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): +def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): pdf_parser = None paper = {} @@ -117,5 +117,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): if __name__ == "__main__": import sys - - chunk(sys.argv[1]) + def dummy(a, b): + pass + chunk(sys.argv[1], callback=dummy) diff --git a/rag/app/naive.py b/rag/app/naive.py new file mode 100644 index 0000000..14bc1f8 --- /dev/null +++ b/rag/app/naive.py @@ -0,0 +1,79 @@ +import copy +import re +from rag.app import laws +from rag.parser import is_english, tokenize, naive_merge +from rag.nlp import huqie +from rag.parser.pdf_parser import HuParser +from rag.settings import cron_logger + +class Pdf(HuParser): + def __call__(self, filename, binary=None, from_page=0, + to_page=100000, zoomin=3, callback=None): + self.__images__( + filename if not binary else binary, + zoomin, + from_page, + to_page) + callback(0.1, "OCR finished") + + from timeit import default_timer as timer + start = timer() + self._layouts_paddle(zoomin) + callback(0.77, "Layout analysis finished") + cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1))) + self._naive_vertical_merge() + return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes] + + +def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): + doc = { + "docnm_kwd": filename, + "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + } + doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + pdf_parser = None + sections = [] + if re.search(r"\.docx?$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + for txt in laws.Docx()(filename, binary): + sections.append((txt, "")) + 
callback(0.8, "Finish parsing.") + elif re.search(r"\.pdf$", filename, re.IGNORECASE): + pdf_parser = Pdf() + sections = pdf_parser(filename if not binary else binary, + from_page=from_page, to_page=to_page, callback=callback) + elif re.search(r"\.txt$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + txt = "" + if binary:txt = binary.decode("utf-8") + else: + with open(filename, "r") as f: + while True: + l = f.readline() + if not l:break + txt += l + sections = txt.split("\n") + sections = [(l,"") for l in sections if l] + callback(0.8, "Finish parsing.") + else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)") + + cks = naive_merge(sections, kwargs.get("chunk_token_num", 128), kwargs.get("delimer", "\n。;ďĽďĽź")) + eng = is_english(cks) + res = [] + # wrap up to es documents + for ck in cks: + print("--", ck) + d = copy.deepcopy(doc) + if pdf_parser: + d["image"] = pdf_parser.crop(ck) + ck = pdf_parser.remove_tag(ck) + tokenize(d, ck, eng) + res.append(d) + return res + + +if __name__ == "__main__": + import sys + def dummy(a, b): + pass + chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) diff --git a/rag/app/paper.py b/rag/app/paper.py index 220852c..131582f 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -1,7 +1,7 @@ import copy import re from collections import Counter -from rag.app import tokenize +from rag.parser import tokenize from rag.nlp import huqie from rag.parser.pdf_parser import HuParser import numpy as np @@ -113,7 +113,7 @@ class Pdf(HuParser): } -def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): +def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): pdf_parser = None paper = {} @@ -232,5 +232,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): if __name__ == "__main__": import sys - - chunk(sys.argv[1]) + def dummy(a, b): + pass + chunk(sys.argv[1], callback=dummy) diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 0495adb..ff805bb 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -3,7 +3,7 @@ import re from io import BytesIO from pptx import Presentation -from rag.app import tokenize, is_english +from rag.parser import tokenize, is_english from rag.nlp import huqie from rag.parser.pdf_parser import HuParser @@ -93,7 +93,7 @@ class Pdf(HuParser): return res -def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): +def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): doc = { "docnm_kwd": filename, "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) @@ -122,5 +122,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): if __name__== "__main__": import sys - print(chunk(sys.argv[1])) + def dummy(a, b): + pass + chunk(sys.argv[1], callback=dummy) diff --git a/rag/app/qa.py b/rag/app/qa.py index b9ce843..4012984 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -3,7 +3,7 @@ import re from io import BytesIO from nltk import word_tokenize from openpyxl import load_workbook -from rag.app import is_english +from rag.parser import is_english from rag.nlp import huqie, stemmer @@ -55,7 +55,7 @@ def beAdoc(d, q, a, eng): return d -def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): +def chunk(filename, binary=None, callback=None, **kwargs): res = [] if re.search(r"\.xlsx?$", filename, re.IGNORECASE): @@ -98,7 +98,7 @@ def chunk(filename, binary=None, from_page=0, 
to_page=100000, callback=None): if __name__== "__main__": import sys - def kk(rat, ss): + def dummy(a, b): pass - print(chunk(sys.argv[1], callback=kk)) + chunk(sys.argv[1], callback=dummy) diff --git a/rag/parser/__init__.py b/rag/parser/__init__.py index be7af49..ed98049 100644 --- a/rag/parser/__init__.py +++ b/rag/parser/__init__.py @@ -1,3 +1,220 @@ +import copy + from .pdf_parser import HuParser as PdfParser from .docx_parser import HuDocxParser as DocxParser from .excel_parser import HuExcelParser as ExcelParser + +import re + +from nltk import word_tokenize + +from rag.nlp import stemmer, huqie +from ..utils import num_tokens_from_string + +BULLET_PATTERN = [[ + r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ0-9]+(ĺ†?编|é¨ĺ†)", + r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ0-9]+ç« ", + r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ0-9]+节", + r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ0-9]+条", + r"[\(ďĽ][零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+[\))]", +], [ + r"第[0-9]+ç« ", + r"第[0-9]+节", + r"[0-9]{,3}[\. ă€]", + r"[0-9]{,2}\.[0-9]{,2}", + r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}", + r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}", +], [ + r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ0-9]+ç« ", + r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ0-9]+节", + r"[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+[ ă€]", + r"[\(ďĽ][零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ]+[\))]", + r"[\(ďĽ][0-9]{,2}[\))]", +], [ + r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)", + r"Chapter (I+V?|VI*|XI|IX|X)", + r"Section [0-9]+", + r"Article [0-9]+" +] +] + + +def bullets_category(sections): + global BULLET_PATTERN + hits = [0] * len(BULLET_PATTERN) + for i, pro in enumerate(BULLET_PATTERN): + for sec in sections: + for p in pro: + if re.match(p, sec): + hits[i] += 1 + break + maxium = 0 + res = -1 + for i, h in enumerate(hits): + if h <= maxium: continue + res = i + maxium = h + return res + + +def is_english(texts): + eng = 0 + for t in texts: + if re.match(r"[a-zA-Z]{2,}", t.strip()): + eng += 1 + if eng / len(texts) > 0.8: + return True + return False + + +def tokenize(d, t, eng): + d["content_with_weight"] = t + if eng: + t = re.sub(r"([a-z])-([a-z])", r"\1\2", t) + d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)]) + else: + d["content_ltks"] = huqie.qie(t) + d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + + +def remove_contents_table(sections, eng=False): + i = 0 + while i < len(sections): + def get(i): + nonlocal sections + return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip() + + if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", + re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)): + i += 1 + continue + sections.pop(i) + if i >= len(sections): break + prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2]) + while not prefix: + sections.pop(i) + if i >= len(sections): break + prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2]) + sections.pop(i) + if i >= len(sections) or not prefix: break + for j in range(i, min(i + 128, len(sections))): + if not re.match(prefix, get(j)): + continue + for _ in range(i, j): sections.pop(i) + break + + +def make_colon_as_title(sections): + if not sections: return [] + if type(sections[0]) == type(""): return sections + i = 0 + while i < len(sections): + txt, layout = sections[i] + i += 1 + txt = txt.split("@")[0].strip() + if not txt: + continue + if txt[-1] not in "::": + continue + txt = txt[::-1] + arr = re.split(r"([。?ďĽ!?;;]| .)", txt) + if len(arr) < 2 or len(arr[1]) < 32: + continue + sections.insert(i - 1, (arr[0][::-1], "title")) + i += 1 + + +def hierarchical_merge(bull, sections, depth): + if not sections or 
bull < 0: return [] + if type(sections[0]) == type(""): sections = [(s, "") for s in sections] + sections = [(t,o) for t, o in sections if t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())] + bullets_size = len(BULLET_PATTERN[bull]) + levels = [[] for _ in range(bullets_size + 2)] + + def not_title(txt): + if re.match(r"第[零一二三四五ĺ…ä¸ĺ…«äąťĺŤç™ľ0-9]+条", txt): return False + if len(txt) >= 128: return True + return re.search(r"[,;,。;ďĽ!]", txt) + + for i, (txt, layout) in enumerate(sections): + for j, p in enumerate(BULLET_PATTERN[bull]): + if re.match(p, txt.strip()) and not not_title(txt): + levels[j].append(i) + break + else: + if re.search(r"(title|head)", layout): + levels[bullets_size].append(i) + else: + levels[bullets_size + 1].append(i) + sections = [t for t, _ in sections] + for s in sections: print("--", s) + + def binary_search(arr, target): + if not arr: return -1 + if target > arr[-1]: return len(arr) - 1 + if target < arr[0]: return -1 + s, e = 0, len(arr) + while e - s > 1: + i = (e + s) // 2 + if target > arr[i]: + s = i + continue + elif target < arr[i]: + e = i + continue + else: + assert False + return s + + cks = [] + readed = [False] * len(sections) + levels = levels[::-1] + for i, arr in enumerate(levels[:depth]): + for j in arr: + if readed[j]: continue + readed[j] = True + cks.append([j]) + if i + 1 == len(levels) - 1: continue + for ii in range(i + 1, len(levels)): + jj = binary_search(levels[ii], j) + if jj < 0: continue + if jj > cks[-1][-1]: cks[-1].pop(-1) + cks[-1].append(levels[ii][jj]) + for ii in cks[-1]: readed[ii] = True + for i in range(len(cks)): + cks[i] = [sections[j] for j in cks[i][::-1]] + print("--------------\n", "\n* ".join(cks[i])) + + return cks + + +def naive_merge(sections, chunk_token_num=128, delimiter="\n。;ďĽďĽź"): + if not sections: return [] + if type(sections[0]) == type(""): sections = [(s, "") for s in sections] + cks = [""] + tk_nums = [0] + def add_chunk(t, pos): + nonlocal cks, tk_nums, delimiter + tnum = num_tokens_from_string(t) + if tnum < 8: pos = "" + if tk_nums[-1] > chunk_token_num: + cks.append(t + pos) + tk_nums.append(tnum) + else: + cks[-1] += t + pos + tk_nums[-1] += tnum + + for sec, pos in sections: + s, e = 0, 1 + while e < len(sec): + if sec[e] in delimiter: + add_chunk(sec[s: e+1], pos) + s = e + 1 + e = s + 1 + else: + e += 1 + if s < e: add_chunk(sec[s: e], pos) + + return cks + + diff --git a/rag/parser/docx_parser.py b/rag/parser/docx_parser.py index ae63a68..2ee0edb 100644 --- a/rag/parser/docx_parser.py +++ b/rag/parser/docx_parser.py @@ -98,8 +98,19 @@ class HuDocxParser: return lines return ["\n".join(lines)] - def __call__(self, fnm): + def __call__(self, fnm, from_page=0, to_page=100000): self.doc = Document(fnm) if isinstance(fnm, str) else Document(BytesIO(fnm)) - secs = [(p.text, p.style.name) for p in self.doc.paragraphs] + pn = 0 + secs = [] + for p in self.doc.paragraphs: + if pn > to_page: break + if from_page <= pn < to_page and p.text.strip(): secs.append((p.text, p.style.name)) + for run in p.runs: + if 'lastRenderedPageBreak' in run._element.xml: + pn += 1 + continue + if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: + pn += 1 + tbls = [self.__extract_table_content(tb) for tb in self.doc.tables] return secs, tbls diff --git a/rag/parser/pdf_parser.py b/rag/parser/pdf_parser.py index 5935580..9cc3451 100644 --- a/rag/parser/pdf_parser.py +++ b/rag/parser/pdf_parser.py @@ -650,6 +650,41 @@ class HuParser: i += 1 self.boxes = bxs + def 
_naive_vertical_merge(self): + bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3) + i = 0 + while i + 1 < len(bxs): + b = bxs[i] + b_ = bxs[i + 1] + if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]): + bxs.pop(i) + continue + concatting_feats = [ + b["text"].strip()[-1] in ",;:'\",ă€â€â€śďĽ›ďĽš-", + len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",â€â€śă€ďĽ›ďĽš", + b["text"].strip()[0] in "。;?ďĽ?”)),,ă€ďĽš", + ] + # features for not concating + feats = [ + b.get("layoutno", 0) != b.get("layoutno", 0), + b["text"].strip()[-1] in "。?ďĽ?", + self.is_english and b["text"].strip()[-1] in ".!?", + b["page_number"] == b_["page_number"] and b_["top"] - \ + b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5, + b["page_number"] < b_["page_number"] and abs( + b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4 + ] + if any(feats) and not any(concatting_feats): + i += 1 + continue + # merge up and down + b["bottom"] = b_["bottom"] + b["text"] += b_["text"] + b["x0"] = min(b["x0"], b_["x0"]) + b["x1"] = max(b["x1"], b_["x1"]) + bxs.pop(i + 1) + self.boxes = bxs + def _concat_downward(self, concat_between_pages=True): # count boxes in the same row as a feature for i in range(len(self.boxes)): @@ -761,11 +796,13 @@ class HuParser: def _filter_forpages(self): if not self.boxes: return + findit = False i = 0 while i < len(self.boxes): if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())): i += 1 continue + findit = True eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip()) self.boxes.pop(i) if i >= len(self.boxes): break @@ -781,14 +818,36 @@ class HuParser: continue for k in range(i, j): self.boxes.pop(i) break + if findit:return + + page_dirty = [0] * len(self.page_images) + for b in self.boxes: + if re.search(r"(··|··|··)", b["text"]): + page_dirty[b["page_number"]-1] += 1 + page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3]) + if not page_dirty: return + i = 0 + while i < len(self.boxes): + if self.boxes[i]["page_number"] in page_dirty: + self.boxes.pop(i) + continue + i += 1 def _merge_with_same_bullet(self): i = 0 while i + 1 < len(self.boxes): b = self.boxes[i] b_ = self.boxes[i + 1] + if not b["text"].strip(): + self.boxes.pop(i) + continue + if not b_["text"].strip(): + self.boxes.pop(i+1) + continue + if b["text"].strip()[0] != b_["text"].strip()[0] \ or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \ + or huqie.is_chinese(b["text"].strip()[0]) \ or b["top"] > b_["bottom"]: i += 1 continue @@ -1596,8 +1655,7 @@ class HuParser: self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm)) self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in enumerate(self.pdf.pages[page_from:page_to])] - self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in - range(len(self.page_images))] + self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]] self.total_page = len(self.pdf.pages) except Exception as e: self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf") @@ -1605,15 +1663,17 @@ class HuParser: self.page_chars = [] mat = fitz.Matrix(zoomin, zoomin) self.total_page = len(self.pdf) - for page in self.pdf[page_from:page_to]: - pix = page.getPixmap(matrix=mat) + for i, page in enumerate(self.pdf): + if i < page_from:continue + 
if i >= page_to:break + pix = page.get_pixmap(matrix=mat) img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) self.page_images.append(img) self.page_chars.append([]) logging.info("Images converted.") - self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=100))) for i in range(len(self.page_chars))] + self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))] if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2: self.is_english = True else: @@ -1644,8 +1704,8 @@ class HuParser: # np.max([c["bottom"] for c in chars])) self.__ocr_paddle(i + 1, img, chars, zoomin) - if not self.is_english and not all([c for c in self.page_chars]) and self.boxes: - self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(self.boxes, k=30)])) + if not self.is_english and not any([c for c in self.page_chars]) and self.boxes: + self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices([b for bxs in self.boxes for b in bxs], k=30)])) logging.info("Is it English:", self.is_english) -- GitLab
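
Reviewer note: a minimal usage sketch of the new naive chunking entry point (rag/app/naive.py). The module path, the chunk() signature, and the chunk_token_num / "delimer" kwargs are taken from the patch above; the sample file name, the callback body, and the readable form of the delimiter string are illustrative assumptions only.

    from rag.app import naive

    def progress(prog, msg):
        # chunk() reports progress as (fraction, message), e.g. (0.1, "Start to parse.")
        print(f"{prog:.2f} {msg}")

    # "some_manual.pdf" is a placeholder path; raw bytes can be passed via binary= instead.
    chunks = naive.chunk(
        "some_manual.pdf",
        from_page=0, to_page=10,      # page window, as in the __main__ stub above
        callback=progress,
        chunk_token_num=128,          # token budget per merged chunk (used by naive_merge)
        delimer="\n。；！？",          # sentence delimiters; assumed reading of the default
    )
    for d in chunks:
        # each dict carries the raw chunk text plus the tokenized fields built by tokenize()
        print(d["content_with_weight"][:80])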
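
The shared helpers now live in rag/parser/__init__.py. naive_merge splits each section on the delimiter characters and appends fragments to the current chunk until its running token count exceeds chunk_token_num, then opens a new chunk. A toy illustration, assuming the package imports cleanly in a test environment; the sample sentences are made up:

    from rag.parser import naive_merge

    sections = [("第一条 总则。本办法适用于全体员工。", ""),
                ("第二条 考勤。工作时间为9:00至18:00。", "")]
    # a smaller budget produces more, shorter chunks; a larger one merges everything
    print(naive_merge(sections, chunk_token_num=16, delimiter="\n。；！？"))
    print(naive_merge(sections, chunk_token_num=512, delimiter="\n。；！？"))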
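
For API consumers: the document list handler now returns the total row count alongside the requested page (see the document_app.py and document_service.py hunks at the top). A client-side sketch under stated assumptions — the base URL, the /v1/document/list route, the session cookie, and the pagination parameter names are assumptions not shown in this diff; only the {"total": ..., "docs": [...]} payload shape comes from the patch:

    import requests

    BASE = "http://127.0.0.1:9380/v1"       # placeholder deployment address
    session = requests.Session()             # assumed to already carry a login cookie

    resp = session.get(f"{BASE}/document/list", params={
        "kb_id": "<kb_id>",                  # placeholder knowledgebase id
        "keywords": "",
        "orderby": "create_time",
        "desc": True,
        "page": 1, "page_size": 15,          # pagination names assumed, not shown in the hunk
    })
    data = resp.json()["data"]
    # before this patch `data` was a bare list; it is now {"total": <int>, "docs": [...]}
    print(data["total"], len(data["docs"]))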