From 072f9dd5bc27f95cd7236f3c9490b7a088f14185 Mon Sep 17 00:00:00 2001
From: KevinHuSh <kevinhu.sh@gmail.com>
Date: Thu, 25 Jan 2024 18:57:39 +0800
Subject: [PATCH] Add app to rag module: presentation & laws (#43)

---
 api/utils/file_utils.py  |   2 +-
 rag/app/__init__.py      |  48 ++++++++++
 rag/app/laws.py          | 192 +++++++++++++++++++++++++++++++++++++
 rag/app/presentation.py  | 127 +++++++++++++++++++++++++
 rag/nlp/huchunk.py       |   5 -
 rag/parser/pdf_parser.py | 201 ++++++++++++++++++++-------------------
 rag/settings.py          |   1 +
 7 files changed, 473 insertions(+), 103 deletions(-)
 create mode 100644 rag/app/__init__.py
 create mode 100644 rag/app/laws.py
 create mode 100644 rag/app/presentation.py

diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py
index c3446b2..14a2e3c 100644
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@@ -150,4 +150,4 @@ def filename_type(filename):
         return FileType.AURAL.value
 
     if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
-        return FileType.VISUAL
+        return FileType.VISUAL
\ No newline at end of file

diff --git a/rag/app/__init__.py b/rag/app/__init__.py
new file mode 100644
index 0000000..1ef52a4
--- /dev/null
+++ b/rag/app/__init__.py
@@ -0,0 +1,48 @@
+import re
+
+
+def callback__(progress, msg, func):
+    if not func: return
+    func(progress, msg)
+
+
+BULLET_PATTERN = [[
+    r"第[零一二三四五六七八九十百]+编",
+    r"第[零一二三四五六七八九十百]+章",
+    r"第[零一二三四五六七八九十百]+节",
+    r"第[零一二三四五六七八九十百]+条",
+    r"[\(（][零一二三四五六七八九十百]+[\)）]",
+], [
+    r"[0-9]{,3}[\. 、]",
+    r"[0-9]{,2}\.[0-9]{,2}",
+    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
+    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
+], [
+    r"[零一二三四五六七八九十百]+[ 、]",
+    r"[\(（][零一二三四五六七八九十百]+[\)）]",
+    r"[\(（][0-9]{,2}[\)）]",
+], [
+    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
+    r"Chapter (I+V?|VI*|XI|IX|X)",
+    r"Section [0-9]+",
+    r"Article [0-9]+"
+]]
+
+
+def bullets_category(sections):
+    global BULLET_PATTERN
+    hits = [0] * len(BULLET_PATTERN)
+    for i, pro in enumerate(BULLET_PATTERN):
+        for sec in sections:
+            for p in pro:
+                if re.match(p, sec):
+                    hits[i] += 1
+                    break
+    maximum = 0
+    res = -1
+    for i, h in enumerate(hits):
+        if h <= maximum: continue
+        res = i
+        maximum = h
+    return res

diff --git a/rag/app/laws.py b/rag/app/laws.py
new file mode 100644
index 0000000..dfb70d5
--- /dev/null
+++ b/rag/app/laws.py
@@ -0,0 +1,192 @@
+import copy
+import re
+from io import BytesIO
+from docx import Document
+import numpy as np
+from rag.app import callback__, bullets_category, BULLET_PATTERN
+from rag.nlp import huqie
+from rag.parser.pdf_parser import HuParser
+
+
+class Docx(object):
+    def __init__(self):
+        pass
+
+    def __clean(self, line):
+        line = re.sub(r"\u3000", " ", line).strip()
+        return line
+
+    def __call__(self, filename, binary=None):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        lines = [self.__clean(p.text) for p in self.doc.paragraphs]
+        return [l for l in lines if l]
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
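+
+        # The layout analysis below is the slow stage, so it is timed and the
+        # elapsed time printed for diagnostics.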
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
+        # is it English
+        eng = 0
+        for b in bxs:
+            if re.match(r"[a-zA-Z]", b["text"].strip()):
+                eng += 1
+        if bxs and eng / len(bxs) > 0.8:
+            eng = True
+        else:
+            eng = False
+        # Merge vertically
+        i = 0
+        while i + 1 < len(bxs):
+            b = bxs[i]
+            b_ = bxs[i + 1]
+            if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
+                bxs.pop(i)
+                continue
+            concatting_feats = [
+                b["text"].strip()[-1] in ",;:'\"，、‘“；：",
+                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\"，‘“、；：",
+                b_["text"].strip()[0] in "。;?！?”)),,、：",
+            ]
+            # features for not concatenating
+            feats = [
+                b.get("layoutno", 0) != b_.get("layoutno", 0),
+                b["text"].strip()[-1] in "。？！?",
+                eng and b["text"].strip()[-1] in ".!?",
+                b["page_number"] == b_["page_number"] and b_["top"] - \
+                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
+                b["page_number"] < b_["page_number"] and abs(
+                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
+            ]
+            if any(feats) and not any(concatting_feats):
+                i += 1
+                continue
+            # merge up and down
+            b["bottom"] = b_["bottom"]
+            b["text"] += b_["text"]
+            b["x0"] = min(b["x0"], b_["x0"])
+            b["x1"] = max(b["x1"], b_["x1"])
+            bxs.pop(i + 1)
+
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
+                   "Page {}~{}: Text extraction finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        return [b["text"] + self._line_tag(b, zoomin) for b in bxs]
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    pdf_parser = None
+    sections = []
+    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+        for txt in Docx()(filename, binary):
+            sections.append(txt)
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        for txt in pdf_parser(filename if not binary else binary,
+                              from_page=from_page, to_page=to_page, callback=callback):
+            sections.append(txt)
+    if re.search(r"\.txt$", filename, re.IGNORECASE):
+        txt = ""
+        if binary:
+            txt = binary.decode("utf-8")
+        else:
+            with open(filename, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l: break
+                    txt += l
+        sections = txt.split("\n")
+        sections = [l for l in sections if l]
+
+    # is it English
+    eng = 0
+    for sec in sections:
+        if re.match(r"[a-zA-Z]", sec.strip()):
+            eng += 1
+    if sections and eng / len(sections) > 0.8:
+        eng = True
+    else:
+        eng = False
+    # Remove the 'Contents' part
+    i = 0
+    while i < len(sections):
+        if not re.match(r"(Contents|目录|目次)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0])):
+            i += 1
+            continue
+        sections.pop(i)
+        if i >= len(sections): break
+        prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
+        while not prefix:
+            sections.pop(i)
+            if i >= len(sections): break
+            prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
+        sections.pop(i)
+        if i >= len(sections) or not prefix: break
+        for j in range(i, min(i + 128, len(sections))):
+            if not re.match(re.escape(prefix), sections[j]):
+                continue
+            for k in range(i, j): sections.pop(i)
+            break
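+
+    # Decide which bullet/numbering scheme the document uses, then tag each
+    # section with its level (projection) in that scheme.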
+    bull = bullets_category(sections)
+    projs = [len(BULLET_PATTERN[bull])] * len(sections)
+    for i, sec in enumerate(sections):
+        for j, p in enumerate(BULLET_PATTERN[bull]):
+            if re.match(p, sec.strip()):
+                projs[i] = j
+                break
+    readed = [0] * len(sections)
+    cks = []
+    for pr in range(len(BULLET_PATTERN[bull]) - 1, 1, -1):
+        for i in range(len(sections)):
+            if readed[i] or projs[i] < pr:
+                continue
+            # find the parent, the grandparent, and so on up the hierarchy
+            p = projs[i]
+            readed[i] = 1
+            ck = [sections[i]]
+            for j in range(i - 1, -1, -1):
+                if projs[j] >= p: continue
+                ck.append(sections[j])
+                readed[j] = 1
+                p = projs[j]
+                if p == 0: break
+            cks.append(ck[::-1])
+
+    res = []
+    # wrap up as ES documents
+    for ck in cks:
+        print("\n-".join(ck))
+        ck = "\n".join(ck)
+        d = copy.deepcopy(doc)
+        if pdf_parser:
+            d["image"] = pdf_parser.crop(ck)
+            ck = pdf_parser.remove_tag(ck)
+        d["content_ltks"] = huqie.qie(ck)
+        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+        res.append(d)
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+    chunk(sys.argv[1])

diff --git a/rag/app/presentation.py b/rag/app/presentation.py
new file mode 100644
index 0000000..2713093
--- /dev/null
+++ b/rag/app/presentation.py
@@ -0,0 +1,127 @@
+import copy
+import re
+from io import BytesIO
+from pptx import Presentation
+
+from rag.app import callback__
+from rag.nlp import huqie
+from rag.parser.pdf_parser import HuParser
+
+
+class Ppt(object):
+    def __init__(self):
+        super().__init__()
+
+    def __extract(self, shape):
+        if shape.shape_type == 19:  # table
+            tb = shape.table
+            rows = []
+            for i in range(1, len(tb.rows)):
+                rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+            return "\n".join(rows)
+
+        if shape.has_text_frame:
+            return shape.text_frame.text
+
+        if shape.shape_type == 6:  # grouped shapes
+            texts = []
+            for p in shape.shapes:
+                t = self.__extract(p)
+                if t: texts.append(t)
+            return "\n".join(texts)
+
+    def __call__(self, fnm, from_page, to_page, callback=None):
+        ppt = Presentation(fnm) if isinstance(
+            fnm, str) else Presentation(
+            BytesIO(fnm))
+        txts = []
+        self.total_page = len(ppt.slides)
+        for i, slide in enumerate(ppt.slides[from_page: to_page]):
+            texts = []
+            for shape in slide.shapes:
+                txt = self.__extract(shape)
+                if txt: texts.append(txt)
+            txts.append("\n".join(texts))
+            callback__((i + 1) / self.total_page / 2, "", callback)
+
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page,
+                   "Page {}~{}: Text extraction finished".format(from_page, min(to_page, self.total_page)), callback)
+        import aspose.slides as slides
+        import aspose.pydrawing as drawing
+        imgs = []
+        with slides.Presentation(fnm if isinstance(fnm, str) else BytesIO(fnm)) as presentation:
+            for i, slide in enumerate(presentation.slides[from_page: to_page]):
+                buffered = BytesIO()
+                slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
+                imgs.append(buffered.getvalue())
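+
+        # Every extracted slide text must pair 1:1 with a rendered slide image.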
{}".format(len(imgs), len(txts)) + callback__((min(to_page, self.total_page) - from_page) / self.total_page, + "Page {}~{}: Image extraction finished".format(from_page, min(to_page, self.total_page)), callback) + + return [(txts[i], imgs[i]) for i in range(len(txts))] + + +class Pdf(HuParser): + def __init__(self): + super().__init__() + + def __garbage(self, txt): + txt = txt.lower().strip() + if re.match(r"[0-9\.,%/-]+$", txt): return True + if len(txt) < 3:return True + return False + + def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): + self.__images__(filename if not binary else binary, zoomin, from_page, to_page) + callback__((min(to_page, self.total_page)-from_page) / self.total_page, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback) + assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images)) + res = [] + #################### More precisely ################### + # self._layouts_paddle(zoomin) + # self._text_merge() + # pages = {} + # for b in self.boxes: + # if self.__garbage(b["text"]):continue + # if b["page_number"] not in pages: pages[b["page_number"]] = [] + # pages[b["page_number"]].append(b["text"]) + # for i, lines in pages.items(): + # res.append(("\n".join(lines), self.page_images[i-1])) + # return res + ######################################## + + for i in range(len(self.boxes)): + lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])]) + res.append((lines, self.page_images[i])) + return res + + +def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None): + doc = { + "docnm_kwd": filename, + "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + } + doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + res = [] + if re.search(r"\.pptx?$", filename, re.IGNORECASE): + for txt,img in Ppt()(filename if not binary else binary, from_page, to_page, callback): + d = copy.deepcopy(doc) + d["content_ltks"] = huqie.qie(txt) + d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + d["image"] = img + res.append(d) + return res + if re.search(r"\.pdf$", filename, re.IGNORECASE): + for txt,img in Pdf()(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback): + d = copy.deepcopy(doc) + d["content_ltks"] = huqie.qie(txt) + d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + d["image"] = img + res.append(d) + return res + callback__(-1, "This kind of presentation document did not support yet!", callback) + + +if __name__== "__main__": + import sys + print(chunk(sys.argv[1])) + diff --git a/rag/nlp/huchunk.py b/rag/nlp/huchunk.py index c8f9e47..ba81a46 100644 --- a/rag/nlp/huchunk.py +++ b/rag/nlp/huchunk.py @@ -352,11 +352,6 @@ class ExcelChunker(HuChunker): class PptChunker(HuChunker): - @dataclass - class Fields: - text_chunks: List = None - table_chunks: List = None - def __init__(self): super().__init__() diff --git a/rag/parser/pdf_parser.py b/rag/parser/pdf_parser.py index 31accd7..519dbb5 100644 --- a/rag/parser/pdf_parser.py +++ b/rag/parser/pdf_parser.py @@ -370,7 +370,7 @@ class HuParser: res.append(lts) return res - def __table_transformer_job(self, ZM): + def _table_transformer_job(self, ZM): logging.info("Table processing...") imgs, pos = [], [] tbcnt = [0] @@ -416,6 +416,50 @@ class HuParser: pg.append(it) self.tb_cpns.extend(pg) + def gather(kwd, fzy=10, ption=0.6): + eles = self.sort_Y_firstly( + [r for r in self.tb_cpns if re.match(kwd, 
r["label"])], fzy) + eles = self.__layouts_cleanup(self.boxes, eles, 5, ption) + return self.sort_Y_firstly(eles, 0) + + # add R,H,C,SP tag to boxes within table layout + headers = gather(r".*header$") + rows = gather(r".* (row|header)") + spans = gather(r".*spanning") + clmns = sorted([r for r in self.tb_cpns if re.match( + r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"])) + clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5) + for b in self.boxes: + if b.get("layout_type", "") != "table": + continue + ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3) + if ii is not None: + b["R"] = ii + b["R_top"] = rows[ii]["top"] + b["R_bott"] = rows[ii]["bottom"] + + ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3) + if ii is not None: + b["H_top"] = headers[ii]["top"] + b["H_bott"] = headers[ii]["bottom"] + b["H_left"] = headers[ii]["x0"] + b["H_right"] = headers[ii]["x1"] + b["H"] = ii + + ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3) + if ii is not None: + b["C"] = ii + b["C_left"] = clmns[ii]["x0"] + b["C_right"] = clmns[ii]["x1"] + + ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3) + if ii is not None: + b["H_top"] = spans[ii]["top"] + b["H_bott"] = spans[ii]["bottom"] + b["H_left"] = spans[ii]["x0"] + b["H_right"] = spans[ii]["x1"] + b["SP"] = ii + def __ocr_paddle(self, pagenum, img, chars, ZM=3): bxs = self.ocr.ocr(np.array(img), cls=True)[0] if not bxs: @@ -453,7 +497,7 @@ class HuParser: self.boxes.append(bxs) - def __layouts_paddle(self, ZM): + def _layouts_paddle(self, ZM): assert len(self.page_images) == len(self.boxes) # Tag layout type boxes = [] @@ -524,7 +568,24 @@ class HuParser: self.boxes = boxes - def __text_merge(self, garbage): + garbage = set() + for k in self.garbages.keys(): + self.garbages[k] = Counter(self.garbages[k]) + for g, c in self.garbages[k].items(): + if c > 1: + garbage.add(g) + + logging.debug("GARBAGE:" + ",".join(garbage)) + self.boxes = [b for b in self.boxes if b["text"].strip() not in garbage] + + # cumlative Y + for i in range(len(self.boxes)): + self.boxes[i]["top"] += \ + self.page_cum_height[self.boxes[i]["page_number"] - 1] + self.boxes[i]["bottom"] += \ + self.page_cum_height[self.boxes[i]["page_number"] - 1] + + def _text_merge(self): # merge adjusted boxes bxs = self.boxes @@ -537,6 +598,7 @@ class HuParser: tt = b.get("text", "").strip() return tt and any([tt.find(t.strip()) == 0 for t in txts]) + # horizontally merge adjacent box with the same layout i = 0 while i < len(bxs) - 1: b = bxs[i] @@ -567,7 +629,8 @@ class HuParser: i += 1 self.boxes = bxs - # count boxes in the same row + def _concat_downward(self): + # count boxes in the same row as a feature for i in range(len(self.boxes)): mh = self.mean_height[self.boxes[i]["page_number"] - 1] self.boxes[i]["in_row"] = 0 @@ -583,49 +646,6 @@ class HuParser: break j += 1 - def gather(kwd, fzy=10, ption=0.6): - eles = self.sort_Y_firstly( - [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy) - eles = self.__layouts_cleanup(self.boxes, eles, 5, ption) - return self.sort_Y_firstly(eles, 0) - - headers = gather(r".*header$") - rows = gather(r".* (row|header)") - spans = gather(r".*spanning") - clmns = sorted([r for r in self.tb_cpns if re.match( - r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"])) - clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5) - for b in self.boxes: - if b.get("layout_type", "") != "table": - continue - ii = 
-            ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
-            if ii is not None:
-                b["R"] = ii
-                b["R_top"] = rows[ii]["top"]
-                b["R_bott"] = rows[ii]["bottom"]
-
-            ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
-            if ii is not None:
-                b["H_top"] = headers[ii]["top"]
-                b["H_bott"] = headers[ii]["bottom"]
-                b["H_left"] = headers[ii]["x0"]
-                b["H_right"] = headers[ii]["x1"]
-                b["H"] = ii
-
-            ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
-            if ii is not None:
-                b["C"] = ii
-                b["C_left"] = clmns[ii]["x0"]
-                b["C_right"] = clmns[ii]["x1"]
-
-            ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
-            if ii is not None:
-                b["H_top"] = spans[ii]["top"]
-                b["H_bott"] = spans[ii]["bottom"]
-                b["H_left"] = spans[ii]["x0"]
-                b["H_right"] = spans[ii]["x1"]
-                b["SP"] = ii
-
         # concat between rows
         boxes = deepcopy(self.boxes)
         blocks = []
@@ -633,8 +653,6 @@
         chunks = []
 
         def dfs(up, dp):
-            if not up["text"].strip() or up["text"].strip() in garbage:
-                return
             chunks.append(up)
             i = dp
             while i < min(dp + 12, len(boxes)):
@@ -658,8 +676,7 @@
                     i += 1
                     continue
 
-                if not down["text"].strip() \
-                        or down["text"].strip() in garbage:
+                if not down["text"].strip():
                     i += 1
                     continue
 
@@ -1444,18 +1461,19 @@
                 return j
         return
 
+    def _line_tag(self, bx, ZM):
+        pn = [bx["page_number"]]
+        top = bx["top"] - self.page_cum_height[pn[0] - 1]
+        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
+        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
+            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
+            pn.append(pn[-1] + 1)
+
+        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+            .format("-".join([str(p) for p in pn]),
+                    bx["x0"], bx["x1"], top, bott)
+
     def __filterout_scraps(self, boxes, ZM):
-        def line_tag(bx):
-            pn = [bx["page_number"]]
-            top = bx["top"] - self.page_cum_height[pn[0] - 1]
-            bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
-            while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
-                bott -= self.page_images[pn[-1] - 1].size[1] / ZM
-                pn.append(pn[-1] + 1)
-
-            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
-                .format("-".join([str(p) for p in pn]),
-                        bx["x0"], bx["x1"], top, bott)
 
         def width(b):
             return b["x1"] - b["x0"]
@@ -1520,14 +1538,14 @@
                 boxes.pop(0)
             mw = np.mean(widths)
             if mj or mw / pw >= 0.35 or mw > 200:
-                res.append("\n".join([c["text"] + line_tag(c) for c in lines]))
+                res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
             else:
                 logging.debug("REMOVED: " +
                               "<<".join([c["text"] for c in lines]))
 
         return "\n\n".join(res)
 
-    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
+    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
         self.lefted_chars = []
         self.mean_height = []
         self.mean_width = []
         self.boxes = []
         self.garbages = {}
         self.page_cum_height = [0]
         self.page_layout = []
         try:
             self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
-            self.page_images = [p.to_image(resolution=72*zoomin).annotated for i,p in enumerate(self.pdf.pages[:299])]
-            self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in range(len(self.page_images))]
+            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
+                                enumerate(self.pdf.pages[page_from:page_to])]
+            self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in
+                               range(len(self.page_images))]
+            self.total_page = len(self.pdf.pages)
         except Exception as e:
             self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
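+            # Fallback: pdfplumber could not open the file, so rasterize the
+            # pages with PyMuPDF (fitz) instead; no per-character boxes are
+            # available on this path.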
filetype="pdf") self.page_images = [] self.page_chars = [] mat = fitz.Matrix(zoomin, zoomin) - for page in self.pdf: - pix = page.getPixmap(matrix = mat) + self.total_page = len(self.pdf) + for page in self.pdf[page_from:page_to]: + pix = page.getPixmap(matrix=mat) img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) self.page_images.append(img) self.page_chars.append([]) logging.info("Images converted.") - for i, img in enumerate(self.page_images): chars = self.page_chars[i] self.mean_height.append( @@ -1561,40 +1582,26 @@ class HuParser: self.mean_width.append( np.median(sorted([c["width"] for c in chars])) if chars else 8 ) - if i > 0: - if not chars: - self.page_cum_height.append(img.size[1] / zoomin) - else: - self.page_cum_height.append( - np.max([c["bottom"] for c in chars])) + self.page_cum_height.append(img.size[1] / zoomin) + # if i > 0: + # if not chars: + # self.page_cum_height.append(img.size[1] / zoomin) + # else: + # self.page_cum_height.append( + # np.max([c["bottom"] for c in chars])) self.__ocr_paddle(i + 1, img, chars, zoomin) - self.__layouts_paddle(zoomin) self.page_cum_height = np.cumsum(self.page_cum_height) - assert len(self.page_cum_height) == len(self.page_images) + assert len(self.page_cum_height) == len(self.page_images)+1 - garbage = set() - for k in self.garbages.keys(): - self.garbages[k] = Counter(self.garbages[k]) - for g, c in self.garbages[k].items(): - if c > 1: - garbage.add(g) - - logging.debug("GARBAGE:" + ",".join(garbage)) - self.boxes = [b for b in self.boxes if b["text"] not in garbage] - - # cumlative Y - for i in range(len(self.boxes)): - self.boxes[i]["top"] += \ - self.page_cum_height[self.boxes[i]["page_number"] - 1] - self.boxes[i]["bottom"] += \ - self.page_cum_height[self.boxes[i]["page_number"] - 1] - - self.__table_transformer_job(zoomin) - self.__text_merge(garbage) + def __call__(self, fnm, need_image=True, zoomin=3, return_html=False): + self.__images__(fnm, zoomin) + self._layouts_paddle(zoomin) + self._table_transformer_job(zoomin) + self._text_merge() + self._concat_downward() self.__filter_forpages() tbls = self.__extract_table_figure(need_image, zoomin, return_html) - return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls def remove_tag(self, txt): diff --git a/rag/settings.py b/rag/settings.py index 23a7454..3c13eef 100644 --- a/rag/settings.py +++ b/rag/settings.py @@ -35,3 +35,4 @@ LoggerFactory.LEVEL = 10 es_logger = getLogger("es") minio_logger = getLogger("minio") cron_logger = getLogger("cron_logger") +chunk_logger = getLogger("chunk_logger") -- GitLab