diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py
index dbebc39ef8b5e6be85b9218b0b6cf8fda3a573d3..c85e0b3cd634f3ea8582e183fe5e716717efef92 100644
--- a/api/apps/conversation_app.py
+++ b/api/apps/conversation_app.py
@@ -196,7 +196,10 @@ def chat(dialog, messages, **kwargs):
     for _ in range(len(questions)//2):
         questions.append(questions[-1])
-    kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
+    if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
+        kbinfos = {"total":0, "chunks":[],"doc_aggs":[]}
+    else:
+        kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                                     dialog.similarity_threshold,
                                     dialog.vector_similarity_weight, top=1024, aggs=False)
     knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 04f721ac946af801733299b9ee16bb0ed6ad65f6..b944076d9d49f8f87484eeccc88a84a6bf3da3ae 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -310,7 +310,10 @@ def change_parser():
         if not e:
             return get_data_error_result(retmsg="Document not found!")
         if doc.parser_id.lower() == req["parser_id"].lower():
-            return get_json_result(data=True)
+            if "parser_config" in req:
+                if req["parser_config"] == doc.parser_config:
+                    return get_json_result(data=True)
+            else: return get_json_result(data=True)

         if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
             return get_data_error_result(retmsg="Not supported yet!")
@@ -319,6 +322,8 @@ def change_parser():
                                          {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
         if not e:
             return get_data_error_result(retmsg="Document not found!")
+        if "parser_config" in req:
+            DocumentService.update_parser_config(doc.id, req["parser_config"])
         if doc.token_num > 0:
             e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
                                                     doc.process_duation * -1)
diff --git a/api/db/init_data.py b/api/db/init_data.py
index b4b63d36bb26a56e76e2ba14a9043ce3f144cc65..89a94e6da4fc443b9c4e8f4b81fb5fb385974e29 100644
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@@ -276,7 +276,7 @@ def init_llm_factory():
     drop table llm_factories;
     update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
     update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
-    update tenant set parser_ids='naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture';
+    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
     alter table knowledgebase modify avatar longtext;
     alter table user modify avatar longtext;
     alter table dialog modify icon longtext;
@@ -297,5 +297,4 @@ def init_web_data():

 if __name__ == '__main__':
     init_web_db()
-    init_web_data()
-    add_tenant_llm()
\ No newline at end of file
+    init_web_data()
\ No newline at end of file
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index b219861b9a663dd73ec0580d63ef595d431e4a5a..a01798f45c298f8c82c64c00ec508b49dbca6bb3 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -118,9 +118,25 @@ class DocumentService(CommonService):
         if not docs:return
         return docs[0]["tenant_id"]
-
     @classmethod
     @DB.connection_context()
     def get_thumbnails(cls, docids):
         fields = [cls.model.id, cls.model.thumbnail]
         return list(cls.model.select(*fields).where(cls.model.id.in_(docids)).dicts())
+
+    @classmethod
+    @DB.connection_context()
+    def update_parser_config(cls, id, config):
+        e, d = cls.get_by_id(id)
+        if not e:raise LookupError(f"Document({id}) not found.")
+        def dfs_update(old, new):
+            for k,v in new.items():
+                if k not in old:
+                    old[k] = v
+                    continue
+                if isinstance(v, dict):
+                    assert isinstance(old[k], dict)
+                    dfs_update(old[k], v)
+                else: old[k] = v
+        dfs_update(d.parser_config, config)
+        cls.update_by_id(id, {"parser_config": d.parser_config})
\ No newline at end of file
diff --git a/api/settings.py b/api/settings.py
index 030d1983dc7735ab4d02b0c6a14043ccfedace72..93c5906a9490c531d10198aff475fd02e8f51b17 100644
--- a/api/settings.py
+++ b/api/settings.py
@@ -94,7 +94,7 @@
 ASR_MDL = default_llm[LLM_FACTORY]["asr_model"]
 IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
-PARSERS = LLM.get("parsers", "naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")
+PARSERS = LLM.get("parsers", "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One")

 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py
index bbec4042554928af8cc4794836125ba0893ddc66..82042879c0c8f0056a73191f00301f41ea835ec4 100644
--- a/deepdoc/parser/__init__.py
+++ b/deepdoc/parser/__init__.py
@@ -1,6 +1,6 @@
-from .pdf_parser import HuParser as PdfParser
+from .pdf_parser import HuParser as PdfParser, PlainParser
 from .docx_parser import HuDocxParser as DocxParser
 from .excel_parser import HuExcelParser as ExcelParser
 from .ppt_parser import HuPptParser as PptParser
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 65185280160d69c119384535d0927376a9b127d7..1a4afa1387c5e5d0456b33b01b0231a2f17eb9ec 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -1073,5 +1073,37 @@ class HuParser:
         return poss


+class PlainParser(object):
+    def __call__(self, filename, **kwargs):
+        self.outlines = []
+        lines = []
+        try:
+            self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
+            outlines = self.pdf.outline
+            for page in self.pdf.pages:
+                lines.extend([t for t in page.extract_text().split("\n")])
+
+            def dfs(arr, depth):
+                for a in arr:
+                    if isinstance(a, dict):
+                        self.outlines.append((a["/Title"], depth))
+                        continue
+                    dfs(a, depth + 1)
+
+            dfs(outlines, 0)
+        except Exception as e:
+            logging.warning(f"Outlines exception: {e}")
+        if not self.outlines:
+            logging.warning(f"Miss outlines")
+
+        return [(l, "") for l in lines], []
+
+    def crop(self, ck, need_position):
+        raise NotImplementedError
+
+    @staticmethod
+    def remove_tag(txt):
+        raise NotImplementedError
+
 if __name__ == "__main__":
     pass
diff --git a/rag/app/book.py b/rag/app/book.py
index 24e3f3bf8068636f77aff35f870a5db39620d6bb..6f51a95c09b003ca649509bc37d84c0edd31139b 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -12,10 +12,12 @@
 #
 import copy
 import re
+from io import BytesIO
+
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, tokenize_chunks
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser, DocxParser
+from deepdoc.parser import PdfParser, DocxParser, PlainParser


 class Pdf(PdfParser):
@@ -69,10 +71,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
         remove_contents_table(sections, eng=is_english(random_choices([t for t,_ in sections], k=200)))
         callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
         sections, tbls = pdf_parser(filename if not binary else binary,
                                     from_page=from_page, to_page=to_page, callback=callback)
+
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -87,31 +91,24 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         sections = [(l,"") for l in sections if l]
         remove_contents_table(sections, eng = is_english(random_choices([t for t,_ in sections], k=200)))
         callback(0.8, "Finish parsing.")
+
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

     make_colon_as_title(sections)
     bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
-    if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
+    if bull >= 0:
+        chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 3)]
     else:
         sections = [s.split("@") for s,_ in sections]
         sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
-        cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;！？"))
+        chunks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;！？"))

     # is it English
     eng = lang.lower() == "english"#is_english(random_choices([t for t, _ in sections], k=218))

     res = tokenize_table(tbls, doc, eng)
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))

-    # wrap up to es documents
-    for ck in cks:
-        d = copy.deepcopy(doc)
-        ck = "\n".join(ck)
-        if pdf_parser:
-            d["image"], poss = pdf_parser.crop(ck, need_position=True)
-            add_positions(d, poss)
-            ck = pdf_parser.remove_tag(ck)
-        tokenize(d, ck, eng)
-        res.append(d)
     return res
diff --git a/rag/app/laws.py b/rag/app/laws.py
index 947f913be03b2f868b1bfc8612872e2e16417809..d5b29e5ec7c8be84ceadcfef1c5e9d695afd0eb4 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -15,9 +15,9 @@
 import re
 from io import BytesIO
 from docx import Document
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
-    make_colon_as_title, add_positions
+    make_colon_as_title, add_positions, tokenize_chunks
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser, DocxParser
+from deepdoc.parser import PdfParser, DocxParser, PlainParser
 from rag.settings import cron_logger
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
         callback(0.8, "Text extraction finished")

-        return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]
+        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]


 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -87,11 +87,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         for txt in Docx()(filename, binary):
             sections.append(txt)
         callback(0.8, "Finish parsing.")
+
elif re.search(r"\.pdf$", filename, re.IGNORECASE): - pdf_parser = Pdf() - for txt in pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback): - sections.append(txt) + pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser() + for txt, poss in pdf_parser(filename if not binary else binary, + from_page=from_page, to_page=to_page, callback=callback): + sections.append(txt + poss) + elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = "" @@ -114,22 +116,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca make_colon_as_title(sections) bull = bullets_category(sections) - cks = hierarchical_merge(bull, sections, 3) - if not cks: callback(0.99, "No chunk parsed out.") - - res = [] - # wrap up to es documents - for ck in cks: - print("\n-".join(ck)) - ck = "\n".join(ck) - d = copy.deepcopy(doc) - if pdf_parser: - d["image"], poss = pdf_parser.crop(ck, need_position=True) - add_positions(d, poss) - ck = pdf_parser.remove_tag(ck) - tokenize(d, ck, eng) - res.append(d) - return res + chunks = hierarchical_merge(bull, sections, 3) + if not chunks: callback(0.99, "No chunk parsed out.") + + return tokenize_chunks(["\n".join(ck) for ck in chunks], doc, eng, pdf_parser) if __name__ == "__main__": diff --git a/rag/app/manual.py b/rag/app/manual.py index e72c2ac30f746f13836266fd76706dc7289e20fc..7ab1caa8533f6a38b03b7e8948278e275bc86a0a 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -2,8 +2,8 @@ import copy import re from api.db import ParserType -from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency -from deepdoc.parser import PdfParser +from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks +from deepdoc.parser import PdfParser, PlainParser from rag.utils import num_tokens_from_string @@ -30,9 +30,7 @@ class Pdf(PdfParser): # print(b) print("OCR:", timer()-start) - def tag(pn, left, right, top, bottom): - return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ - .format(pn, left, right, top, bottom) + self._layouts_rec(zoomin) callback(0.65, "Layout analysis finished.") @@ -49,6 +47,8 @@ class Pdf(PdfParser): for b in self.boxes: b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip()) + return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)] + # set pivot using the most frequent type of title, # then merge between 2 pivot if len(self.boxes)>0 and len(self.outlines)/len(self.boxes) > 0.1: @@ -103,9 +103,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca pdf_parser = None if re.search(r"\.pdf$", filename, re.IGNORECASE): - pdf_parser = Pdf() - cks, tbls = pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback) + pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser() + sections, tbls = pdf_parser(filename if not binary else binary, + from_page=from_page, to_page=to_page, callback=callback) + if sections and len(sections[0])<3: cks = [(t, l, [0]*5) for t, l in sections] else: raise NotImplementedError("file type not supported yet(pdf supported)") doc = { "docnm_kwd": filename @@ -115,13 +116,60 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca # is it English eng = lang.lower() == 
"english"#pdf_parser.is_english + # set pivot using the most frequent type of title, + # then merge between 2 pivot + if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1: + max_lvl = max([lvl for _, lvl in pdf_parser.outlines]) + most_level = max(0, max_lvl - 1) + levels = [] + for txt, _, _ in sections: + for t, lvl in pdf_parser.outlines: + tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)]) + tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))]) + if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8: + levels.append(lvl) + break + else: + levels.append(max_lvl + 1) + else: + bull = bullets_category([txt for txt,_,_ in sections]) + most_level, levels = title_frequency(bull, [(txt, l) for txt, l, poss in sections]) + + assert len(sections) == len(levels) + sec_ids = [] + sid = 0 + for i, lvl in enumerate(levels): + if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1 + sec_ids.append(sid) + # print(lvl, self.boxes[i]["text"], most_level, sid) + + sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)] + for (img, rows), poss in tbls: + sections.append((rows if isinstance(rows, str) else rows[0], -1, + [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) + + def tag(pn, left, right, top, bottom): + if pn+left+right+top+bottom == 0: + return "" + return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ + .format(pn, left, right, top, bottom) + + chunks = [] + last_sid = -2 + tk_cnt = 0 + for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])): + poss = "\t".join([tag(*pos) for pos in poss]) + if tk_cnt < 2048 and (sec_id == last_sid or sec_id == -1): + if chunks: + chunks[-1] += "\n" + txt + poss + tk_cnt += num_tokens_from_string(txt) + continue + chunks.append(txt + poss) + tk_cnt = num_tokens_from_string(txt) + if sec_id > -1: last_sid = sec_id + res = tokenize_table(tbls, doc, eng) - for ck in cks: - d = copy.deepcopy(doc) - d["image"], poss = pdf_parser.crop(ck, need_position=True) - add_positions(d, poss) - tokenize(d, pdf_parser.remove_tag(ck), eng) - res.append(d) + res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) return res diff --git a/rag/app/naive.py b/rag/app/naive.py index 230f96784466ea534575b0af0d74798aaf86c392..a92f2e3e1cd771a5df38bcd2618c40def84f265b 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -12,8 +12,9 @@ # import copy import re +from deepdoc.parser.pdf_parser import PlainParser from rag.app import laws -from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions +from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks from deepdoc.parser import PdfParser, ExcelParser from rag.settings import cron_logger @@ -56,6 +57,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca """ eng = lang.lower() == "english"#is_english(cks) + parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;ďĽďĽź", "layout_recognize": True}) doc = { "docnm_kwd": filename, "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) @@ -69,15 +71,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca for txt in laws.Docx()(filename, binary): sections.append((txt, "")) callback(0.8, "Finish parsing.") + elif re.search(r"\.pdf$", filename, re.IGNORECASE): - pdf_parser = Pdf() + pdf_parser = Pdf() if parser_config["layout_recognize"] else PlainParser() sections, 
tbls = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) res = tokenize_table(tbls, doc, eng) + elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") excel_parser = ExcelParser() sections = [(excel_parser.html(binary), "")] + elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = "" @@ -92,26 +97,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca sections = txt.split("\n") sections = [(l, "") for l in sections if l] callback(0.8, "Finish parsing.") + else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)") - parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;ďĽďĽź"}) - cks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;ďĽďĽź")) - - # wrap up to es documents - for ck in cks: - if len(ck.strip()) == 0:continue - print("--", ck) - d = copy.deepcopy(doc) - if pdf_parser: - try: - d["image"], poss = pdf_parser.crop(ck, need_position=True) - except Exception as e: - continue - add_positions(d, poss) - ck = pdf_parser.remove_tag(ck) - tokenize(d, ck, eng) - res.append(d) + chunks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;ďĽďĽź")) + + res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) return res diff --git a/rag/app/one.py b/rag/app/one.py index cce70277a7d1e8a9d8edf3421423f035453b8dd5..2ad59bece30152e03d26c3249b17727f0cc575e1 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -13,7 +13,7 @@ import re from rag.app import laws from rag.nlp import huqie, tokenize -from deepdoc.parser import PdfParser, ExcelParser +from deepdoc.parser import PdfParser, ExcelParser, PlainParser class Pdf(PdfParser): @@ -45,7 +45,7 @@ class Pdf(PdfParser): for (img, rows), poss in tbls: sections.append((rows if isinstance(rows, str) else rows[0], [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) - return [txt for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))] + return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))] def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): @@ -59,16 +59,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca sections = [] if re.search(r"\.docx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - for txt in laws.Docx()(filename, binary): - sections.append(txt) + sections = [txt for txt in laws.Docx()(filename, binary) if txt] callback(0.8, "Finish parsing.") + elif re.search(r"\.pdf$", filename, re.IGNORECASE): - pdf_parser = Pdf() + pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser() sections = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback) + sections = [s for s, _ in sections if s] + elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") excel_parser = ExcelParser() sections = [excel_parser.html(binary)] + elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = "" @@ -81,8 +84,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca if not l: break txt += l sections = txt.split("\n") - sections = [(l, "") for l in sections if l] + sections = [s for s in sections if s] 
callback(0.8, "Finish parsing.") + else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)") diff --git a/rag/app/paper.py b/rag/app/paper.py index c993502c3543d76a256d997a4a007ee250363f1b..11045772bd5bf198d297d44745c956a142ea27e3 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -15,8 +15,8 @@ import re from collections import Counter from api.db import ParserType -from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency -from deepdoc.parser import PdfParser +from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks +from deepdoc.parser import PdfParser, PlainParser import numpy as np from rag.utils import num_tokens_from_string @@ -59,24 +59,6 @@ class Pdf(PdfParser): self.boxes = self.sort_X_by_page(self.boxes, column_width / 2) for b in self.boxes: b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip()) - # freq = Counter([b["text"] for b in self.boxes]) - # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6]) - # i = 0 - # while i < len(self.boxes): - # if self.boxes[i]["text"] in garbage \ - # or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \ - # or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]): - # self.boxes.pop(i) - # elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", - # '1'): - # # merge within same layouts - # self.boxes[i + 1]["top"] = self.boxes[i]["top"] - # self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"]) - # self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"]) - # self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"] - # self.boxes.pop(i) - # else: - # i += 1 def _begin(txt): return re.match( @@ -148,9 +130,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca """ pdf_parser = None if re.search(r"\.pdf$", filename, re.IGNORECASE): - pdf_parser = Pdf() - paper = pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback) + if not kwargs.get("parser_config",{}).get("layout_recognize", True): + pdf_parser = PlainParser() + paper = { + "title": filename, + "authors": " ", + "abstract": "", + "sections": pdf_parser(filename if not binary else binary), + "tables": [] + } + else: + pdf_parser = Pdf() + paper = pdf_parser(filename if not binary else binary, + from_page=from_page, to_page=to_page, callback=callback) else: raise NotImplementedError("file type not supported yet(pdf supported)") doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]), @@ -195,16 +187,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca continue chunks.append(txt) last_sid = sec_id - for txt in chunks: - d = copy.deepcopy(doc) - d["image"], poss = pdf_parser.crop(txt, need_position=True) - add_positions(d, poss) - tokenize(d, pdf_parser.remove_tag(txt), eng) - res.append(d) - print("----------------------\n", pdf_parser.remove_tag(txt)) - + res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) return res +""" readed = [0] * len(paper["lines"]) # find colon firstly i = 0 @@ -280,7 +266,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca print(d) # d["image"].save(f"./logs/{i}.jpg") return res - +""" if __name__ == "__main__": import sys diff --git 
a/rag/app/presentation.py b/rag/app/presentation.py index 98622b1753c8af1456879a668fd510564bec49e8..597bc7ac477804dab1a329cd88ddec86f3b3deff 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -18,7 +18,8 @@ from PIL import Image from rag.nlp import tokenize, is_english from rag.nlp import huqie -from deepdoc.parser import PdfParser, PptParser +from deepdoc.parser import PdfParser, PptParser, PlainParser +from PyPDF2 import PdfReader as pdf2_read class Ppt(PptParser): @@ -56,19 +57,6 @@ class Pdf(PdfParser): callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page))) assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images)) res = [] - #################### More precisely ################### - # self._layouts_rec(zoomin) - # self._text_merge() - # pages = {} - # for b in self.boxes: - # if self.__garbage(b["text"]):continue - # if b["page_number"] not in pages: pages[b["page_number"]] = [] - # pages[b["page_number"]].append(b["text"]) - # for i, lines in pages.items(): - # res.append(("\n".join(lines), self.page_images[i-1])) - # return res - ######################################## - for i in range(len(self.boxes)): lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])]) res.append((lines, self.page_images[i])) @@ -76,6 +64,16 @@ class Pdf(PdfParser): return res +class PlainPdf(PlainParser): + def __call__(self, filename, binary=None, callback=None, **kwargs): + self.pdf = pdf2_read(filename if not binary else BytesIO(filename)) + page_txt = [] + for page in self.pdf.pages: + page_txt.append(page.extract_text()) + callback(0.9, "Parsing finished") + return [(txt, None) for txt in page_txt] + + def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ The supported file formats are pdf, pptx. 
@@ -102,14 +100,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             res.append(d)
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainPdf()
+        for pn, (txt,img) in enumerate(pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)):
             d = copy.deepcopy(doc)
             pn += from_page
-            d["image"] = img
+            if img: d["image"] = img
             d["page_num_int"] = [pn+1]
             d["top_int"] = [0]
-            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
             tokenize(d, txt, eng)
             res.append(d)
         return res
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 92fbcdc3f65545c004789774a2994e99b54db00c..e30a4fa87ac289d76b9203926a24d1aabba6ea49 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -76,6 +76,25 @@ def tokenize(d, t, eng):
     d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])


+def tokenize_chunks(chunks, doc, eng, pdf_parser):
+    res = []
+    # wrap up as es documents
+    for ck in chunks:
+        if len(ck.strip()) == 0:continue
+        print("--", ck)
+        d = copy.deepcopy(doc)
+        if pdf_parser:
+            try:
+                d["image"], poss = pdf_parser.crop(ck, need_position=True)
+                add_positions(d, poss)
+                ck = pdf_parser.remove_tag(ck)
+            except NotImplementedError as e:
+                pass
+        tokenize(d, ck, eng)
+        res.append(d)
+    return res
+
+
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
diff --git a/rag/nlp/huqie.py b/rag/nlp/huqie.py
index 548c2d26410fd4ca54f1c7d2bc118c9d0afa3383..90a1bfe23d75daca1c0ad78c7b963bbe5c817aa7 100644
--- a/rag/nlp/huqie.py
+++ b/rag/nlp/huqie.py
@@ -300,7 +300,11 @@ class Huqie:
     def qieqie(self, tks):
         tks = tks.split(" ")
         zh_num = len([1 for c in tks if c and is_chinese(c[0])])
-        if zh_num < len(tks) * 0.2:return " ".join(tks)
+        if zh_num < len(tks) * 0.2:
+            res = []
+            for tk in tks:
+                res.extend(tk.split("/"))
+            return " ".join(res)

         res = []
         for tk in tks:
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index 3b6f20bbb4d7e6a61d9c4976d11f4476f24364f1..03be167f791dd4e8d4de382f4447e0e6530935b4 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -68,6 +68,7 @@ class Dealer:
         s = Search()
         pg = int(req.get("page", 1)) - 1
         ps = int(req.get("size", 1000))
+        topk = int(req.get("topk", 1024))
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
                                  "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])
@@ -103,7 +104,7 @@ class Dealer:
             assert emb_mdl, "No embedding model selected"
             s["knn"] = self._vector(
                 qst, emb_mdl, req.get(
-                    "similarity", 0.1), ps)
+                    "similarity", 0.1), topk)
             s["knn"]["filter"] = bqry.to_dict()
             if "highlight" in s:
                 del s["highlight"]
@@ -292,8 +293,8 @@ class Dealer:
         ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
         if not question:
             return ranks
-        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": top,
-               "question": question, "vector": True,
+        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": page_size,
+               "question": question, "vector": True, "topk": top,
                "similarity": similarity_threshold}
         sres = self.search(req, index_name(tenant_id), embd_mdl)
diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py
index 9bcc8aa3120a8f89d9d03a9d1ab7c33c07603332..b882966897aec071b192b01c47886d90606f9ec7 100644
--- a/rag/svr/task_broker.py
+++ b/rag/svr/task_broker.py
@@ -81,11 +81,15 @@ def dispatch():
         tsks = []
         if r["type"] == FileType.PDF.value:
+            if not r["parser_config"].get("layout_recognize", True):
+                tsks.append(new_task())
+                continue
            pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
-            page_size = 12
-            if r["parser_id"] == "paper": page_size = 22
+            page_size = r["parser_config"].get("task_page_size", 12)
+            if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22)
             if r["parser_id"] == "one": page_size = 1000000000
-            for s,e in r["parser_config"].get("pages", [(0,100000)]):
+            for s,e in r["parser_config"].get("pages", [(1, 100000)]):
+                s -= 1
                 e = min(e, pages)
                 for p in range(s, e, page_size):
                     task = new_task()