From 5875c8ba088963f59bc041971a789debc0e4469b Mon Sep 17 00:00:00 2001 From: KevinHuSh <kevinhu.sh@gmail.com> Date: Wed, 20 Mar 2024 18:57:22 +0800 Subject: [PATCH] Add 'One' chunk method (#137) --- README.md | 4 +- api/db/__init__.py | 1 + api/db/init_data.py | 12 ++++- api/settings.py | 12 ++--- rag/app/manual.py | 8 +-- rag/app/naive.py | 2 +- rag/app/one.py | 108 +++++++++++++++++++++++++++++++++++++++ rag/llm/__init__.py | 12 ++--- rag/nlp/search.py | 4 +- rag/svr/task_broker.py | 1 + rag/svr/task_executor.py | 3 +- 11 files changed, 143 insertions(+), 24 deletions(-) create mode 100644 rag/app/one.py diff --git a/README.md b/README.md index d720f75..2e858d8 100644 --- a/README.md +++ b/README.md @@ -88,8 +88,8 @@ If your machine doesn't have *Docker* installed, please refer to [Install Docker > In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_. > It's O.K if you don't have _API_KEY_ at the moment, you can specify it later at the setting part after starting and logging in the system. 
> - We have supported the flowing LLM factory, and the others is coming soon: -> [OpenAI](https://platform.openai.com/login?launch), [通义千问/QWen](https://dashscope.console.aliyun.com/model), -> [智谱AI/ZhipuAI](https://open.bigmodel.cn/) +> [OpenAI](https://platform.openai.com/login?launch), [Tongyi-Qianwen](https://dashscope.console.aliyun.com/model), +> [ZHIPU-AI](https://open.bigmodel.cn/), [Moonshot](https://platform.moonshot.cn/docs/docs) ```bash 121:/# git clone https://github.com/infiniflow/ragflow.git 121:/# cd ragflow/docker diff --git a/api/db/__init__.py b/api/db/__init__.py index c1f5d80..1ba7938 100644 --- a/api/db/__init__.py +++ b/api/db/__init__.py @@ -79,3 +79,4 @@ class ParserType(StrEnum): TABLE = "table" NAIVE = "naive" PICTURE = "picture" + ONE = "one" diff --git a/api/db/init_data.py b/api/db/init_data.py index a930fb4..3418bcf 100644 --- a/api/db/init_data.py +++ b/api/db/init_data.py @@ -79,12 +79,12 @@ factory_infos = [{ "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", "status": "1", },{ - "name": "通义千问", + "name": "Tongyi-Qianwen", "logo": "", "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", "status": "1", },{ - "name": "智谱AI", + "name": "ZHIPU-AI", "logo": "", "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", "status": "1", @@ -270,6 +270,14 @@ def init_llm_factory(): except Exception as e: pass + """ + drop table llm; + drop table factories; + update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问'; + update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI'; + update tenant set parser_ids='naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture'; + """ + def init_web_data(): start_time = time.time() diff --git a/api/settings.py b/api/settings.py index b2fe8d8..030d198 100644 --- a/api/settings.py +++ b/api/settings.py @@ -52,7 +52,7 @@ REQUEST_MAX_WAIT_SEC = 300 USE_REGISTRY = 
get_base_config("use_registry") default_llm = { - "通义千问": { + "Tongyi-Qianwen": { "chat_model": "qwen-plus", "embedding_model": "text-embedding-v2", "image2text_model": "qwen-vl-max", @@ -64,7 +64,7 @@ default_llm = { "image2text_model": "gpt-4-vision-preview", "asr_model": "whisper-1", }, - "智谱AI": { + "ZHIPU-AI": { "chat_model": "glm-3-turbo", "embedding_model": "embedding-2", "image2text_model": "glm-4v", @@ -84,17 +84,17 @@ default_llm = { } } LLM = get_base_config("user_default_llm", {}) -LLM_FACTORY = LLM.get("factory", "通义千问") +LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen") if LLM_FACTORY not in default_llm: - print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} has not supported yet, switch to '通义千问/QWen' automatically, and please check the API_KEY in service_conf.yaml.") - LLM_FACTORY = "通义千问" + print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} has not supported yet, switch to 'Tongyi-Qianwen/QWen' automatically, and please check the API_KEY in service_conf.yaml.") + LLM_FACTORY = "Tongyi-Qianwen" CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"] EMBEDDING_MDL = default_llm[LLM_FACTORY]["embedding_model"] ASR_MDL = default_llm[LLM_FACTORY]["asr_model"] IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"] API_KEY = LLM.get("api_key", "") -PARSERS = LLM.get("parsers", "naive:General,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture") +PARSERS = LLM.get("parsers", "naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture") # distribution DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False) diff --git a/rag/app/manual.py b/rag/app/manual.py index b8b4d7a..7ca5451 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -57,7 +57,7 @@ class Pdf(PdfParser): sec_ids = [] sid = 0 for i, lvl in enumerate(levels): - if lvl <= most_level: sid += 1 + 
if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1 sec_ids.append(sid) #print(lvl, self.boxes[i]["text"], most_level) @@ -75,7 +75,7 @@ class Pdf(PdfParser): continue chunks.append(txt + poss) if sec_id >-1: last_sid = sec_id - return chunks + return chunks, tbls def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): @@ -86,7 +86,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca if re.search(r"\.pdf$", filename, re.IGNORECASE): pdf_parser = Pdf() - cks = pdf_parser(filename if not binary else binary, + cks, tbls = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) else: raise NotImplementedError("file type not supported yet(pdf supported)") doc = { @@ -100,7 +100,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca i = 0 chunk = [] tk_cnt = 0 - res = [] + res = tokenize_table(tbls, doc, eng) def add_chunk(): nonlocal chunk, res, doc, pdf_parser, tk_cnt d = copy.deepcopy(doc) diff --git a/rag/app/naive.py b/rag/app/naive.py index 4c82e56..230f967 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -49,7 +49,7 @@ class Pdf(PdfParser): def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ - Supported file formats are docx, pdf, txt. + Supported file formats are docx, pdf, excel, txt. This method apply the naive ways to chunk files. Successive text will be sliced into pieces using 'delimiter'. Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'. diff --git a/rag/app/one.py b/rag/app/one.py new file mode 100644 index 0000000..d43961a --- /dev/null +++ b/rag/app/one.py @@ -0,0 +1,108 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import copy +import re +from rag.app import laws +from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions +from deepdoc.parser import PdfParser, ExcelParser +from rag.settings import cron_logger + + +class Pdf(PdfParser): + def __call__(self, filename, binary=None, from_page=0, + to_page=100000, zoomin=3, callback=None): + callback(msg="OCR is running...") + self.__images__( + filename if not binary else binary, + zoomin, + from_page, + to_page, + callback + ) + callback(msg="OCR finished") + + from timeit import default_timer as timer + start = timer() + self._layouts_rec(zoomin) + callback(0.63, "Layout analysis finished.") + print("paddle layouts:", timer() - start) + self._table_transformer_job(zoomin) + callback(0.65, "Table analysis finished.") + self._text_merge() + callback(0.67, "Text merging finished") + tbls = self._extract_table_figure(True, zoomin, True, True) + self._concat_downward() + + sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)] + for (img, rows), poss in tbls: + sections.append((rows if isinstance(rows, str) else rows[0], + [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) + return [txt for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))] + + +def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): + """ + Supported file formats are docx, pdf, excel, txt. + One file forms a chunk which maintains original text order. 
+ """ + + eng = lang.lower() == "english"#is_english(cks) + + sections = [] + if re.search(r"\.docx?$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + for txt in laws.Docx()(filename, binary): + sections.append(txt) + callback(0.8, "Finish parsing.") + elif re.search(r"\.pdf$", filename, re.IGNORECASE): + pdf_parser = Pdf() + sections = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback) + elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + excel_parser = ExcelParser() + sections = [excel_parser.html(binary)] + elif re.search(r"\.txt$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + txt = "" + if binary: + txt = binary.decode("utf-8") + else: + with open(filename, "r") as f: + while True: + l = f.readline() + if not l: break + txt += l + sections = txt.split("\n") + sections = [(l, "") for l in sections if l] + callback(0.8, "Finish parsing.") + else: + raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)") + + doc = { + "docnm_kwd": filename, + "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + } + doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + tokenize(doc, "\n".join(sections), eng) + return [doc] + + +if __name__ == "__main__": + import sys + + + def dummy(prog=None, msg=""): + pass + + + chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) diff --git a/rag/llm/__init__.py b/rag/llm/__init__.py index cc4e462..74a8dbf 100644 --- a/rag/llm/__init__.py +++ b/rag/llm/__init__.py @@ -21,8 +21,8 @@ from .cv_model import * EmbeddingModel = { "Local": HuEmbedding, "OpenAI": OpenAIEmbed, - "通义千问": HuEmbedding, #QWenEmbed, - "智谱AI": ZhipuEmbed, + "Tongyi-Qianwen": HuEmbedding, #QWenEmbed, + "ZHIPU-AI": ZhipuEmbed, "Moonshot": HuEmbedding } @@ -30,16 +30,16 @@ EmbeddingModel = { CvModel = { "OpenAI": GptV4, "Local": LocalCV, - "通义千问": QWenCV, - "智谱AI": Zhipu4V, + "Tongyi-Qianwen": QWenCV, + "ZHIPU-AI": Zhipu4V, 
"Moonshot": LocalCV } ChatModel = { "OpenAI": GptTurbo, - "智谱AI": ZhipuChat, - "通义千问": QWenChat, + "ZHIPU-AI": ZhipuChat, + "Tongyi-Qianwen": QWenChat, "Local": LocalLLM, "Moonshot": MoonshotChat } diff --git a/rag/nlp/search.py b/rag/nlp/search.py index f9fbcf2..9f89cd5 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -194,7 +194,7 @@ class Dealer: return [float(t) for t in txt.split("\t")] def insert_citations(self, answer, chunks, chunk_v, - embd_mdl, tkweight=0.7, vtweight=0.3): + embd_mdl, tkweight=0.1, vtweight=0.9): assert len(chunks) == len(chunk_v) pieces = re.split(r"(```)", answer) if len(pieces) >= 3: @@ -243,7 +243,7 @@ class Dealer: chunks_tks, tkweight, vtweight) mx = np.max(sim) * 0.99 - if mx < 0.7: + if mx < 0.65: continue cites[idx[i]] = list( set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4] diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py index 665ab1e..62f0d07 100644 --- a/rag/svr/task_broker.py +++ b/rag/svr/task_broker.py @@ -84,6 +84,7 @@ def dispatch(): pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"])) page_size = 5 if r["parser_id"] == "paper": page_size = 12 + if r["parser_id"] == "one": page_size = 1000000000 for s,e in r["parser_config"].get("pages", [(0,100000)]): e = min(e, pages) for p in range(s, e, page_size): diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index f8438e1..f88faf7 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -39,7 +39,7 @@ from rag.nlp import search from io import BytesIO import pandas as pd -from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive +from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one from api.db import LLMType, ParserType from api.db.services.document_service import DocumentService @@ -60,6 +60,7 @@ FACTORY = { ParserType.TABLE.value: table, ParserType.RESUME.value: resume, 
ParserType.PICTURE.value: picture, + ParserType.ONE.value: one, } -- GitLab