From 369400c483ee74addb3ef45a5cd539f2c3e69e5d Mon Sep 17 00:00:00 2001 From: KevinHuSh <kevinhu.sh@gmail.com> Date: Tue, 23 Apr 2024 19:10:33 +0800 Subject: [PATCH] fix bug of table in docx (#510) ### What problem does this PR solve? #509 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/app/book.py | 1 + rag/app/naive.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/rag/app/book.py b/rag/app/book.py index 613c1f2..3e1eaf7 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -76,6 +76,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, binary if binary else filename, from_page=from_page, to_page=to_page) remove_contents_table(sections, eng=is_english( random_choices([t for t, _ in sections], k=200))) + tbls = [((None, lns), None) for lns in tbls] callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): diff --git a/rag/app/naive.py b/rag/app/naive.py index cd77cdc..2ab05c1 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -13,6 +13,7 @@ from tika import parser from io import BytesIO from docx import Document +from timeit import default_timer as timer import re from deepdoc.parser.pdf_parser import PlainParser from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec @@ -67,7 +68,6 @@ class Docx(DocxParser): class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - from timeit import default_timer as timer start = timer() callback(msg="OCR is running...") self.__images__( @@ -83,7 +83,6 @@ class Pdf(PdfParser): start = timer() self._layouts_rec(zoomin) callback(0.63, "Layout analysis finished.") - print("layouts:", timer() - start) self._table_transformer_job(zoomin) callback(0.65, "Table analysis finished.") self._text_merge() @@ -93,8 +92,7 @@ class Pdf(PdfParser): self._concat_downward() #self._filter_forpages() - cron_logger.info("layouts: {}".format( - (timer() - 
start) / (self.total_page + 0.1))) + cron_logger.info("layouts: {}".format(timer() - start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls @@ -167,12 +165,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, raise NotImplementedError( "file type not supported yet(doc, docx, pdf, txt supported)") + st = timer() chunks = naive_merge( sections, parser_config.get( "chunk_token_num", 128), parser_config.get( "delimiter", "\n!?。;!?")) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) + cron_logger.info("naive_merge({}): {}".format(filename, timer() - st)) return res -- GitLab