From a8294f216866aea5f0ac1331ea33cb07fc284b72 Mon Sep 17 00:00:00 2001
From: KevinHuSh <kevinhu.sh@gmail.com>
Date: Mon, 19 Feb 2024 19:22:17 +0800
Subject: [PATCH] Refine resume parts and fix bugs in retrieval using SQL (#66)

---
 api/apps/conversation_app.py             | 92 ++++++++++++++----------
 api/apps/dialog_app.py                   |  2 +-
 api/apps/document_app.py                 | 13 ++--
 api/apps/kb_app.py                       |  3 +
 api/apps/llm_app.py                      | 30 +++++++-
 api/apps/user_app.py                     |  7 +-
 api/db/db_models.py                      |  3 +-
 api/db/init_data.py                      | 41 ++---------
 api/db/services/knowledgebase_service.py |  7 +-
 api/settings.py                          | 12 ++--
 api/utils/file_utils.py                  |  6 +-
 conf/mapping.json                        | 38 +++++++++-
 conf/service_conf.yaml                   | 15 +++-
 rag/app/book.py                          |  5 ++
 rag/app/laws.py                          |  4 +-
 rag/app/manual.py                        |  4 +-
 rag/app/naive.py                         | 29 +++++---
 rag/app/paper.py                         |  4 ++
 rag/app/presentation.py                  |  5 ++
 rag/app/qa.py                            | 10 +++
 rag/app/resume.py                        | 48 +++++++++----
 rag/app/table.py                         | 17 ++++-
 rag/llm/__init__.py                      |  2 +-
 rag/llm/chat_model.py                    | 12 ++--
 rag/llm/embedding_model.py               |  6 +-
 rag/nlp/search.py                        | 27 +++----
 rag/parser/pdf_parser.py                 |  5 +-
 rag/svr/task_executor.py                 |  9 +--
 rag/utils/es_conn.py                     |  2 +-
 29 files changed, 301 insertions(+), 157 deletions(-)

diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py
index 30c3ee9..ad1745a 100644
--- a/api/apps/conversation_app.py
+++ b/api/apps/conversation_app.py
@@ -21,20 +21,21 @@ from api.db.services.dialog_service import DialogService, ConversationService
 from api.db import LLMType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMService, LLMBundle
-from api.settings import access_logger
+from api.settings import access_logger, stat_logger
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid
 from api.utils.api_utils import get_json_result
+from rag.app.resume import forbidden_select_fields4resume
 from rag.llm import ChatModel
 from rag.nlp import retrievaler
 from rag.nlp.search import index_name
-from rag.utils import num_tokens_from_string, encoder
+from rag.utils import num_tokens_from_string, encoder, rmSpace


 @manager.route('/set', methods=['POST'])
 @login_required
 @validate_request("dialog_id")
-def set():
+def set_conversation():
     req = request.json
     conv_id = req.get("conversation_id")
     if conv_id:
@@ -96,9 +97,10 @@ def rm():
     except Exception as e:
         return server_error_response(e)

+
 @manager.route('/list', methods=['GET'])
 @login_required
-def list():
+def list_conversation():
     dialog_id = request.args["dialog_id"]
     try:
         convs = ConversationService.query(dialog_id=dialog_id)
@@ -112,7 +114,7 @@ def message_fit_in(msg, max_length=4000):
     def count():
         nonlocal msg
         tks_cnts = []
-        for m in msg:tks_cnts.append({"role": m["role"], "count": num_tokens_from_string(m["content"])})
+        for m in msg: tks_cnts.append({"role": m["role"], "count": num_tokens_from_string(m["content"])})
         total = 0
         for m in tks_cnts: total += m["count"]
         return total
@@ -121,22 +123,22 @@ def message_fit_in(msg, max_length=4000):
     if c < max_length: return c, msg
     msg = [m for m in msg if m.role in ["system", "user"]]
     c = count()
-    if c < max_length:return c, msg
+    if c < max_length: return c, msg
     msg_ = [m for m in msg[:-1] if m.role == "system"]
     msg_.append(msg[-1])
     msg = msg_
     c = count()
-    if c < max_length:return c, msg
+    if c < max_length: return c, msg
     ll = num_tokens_from_string(msg_[0].content)
     l = num_tokens_from_string(msg_[-1].content)
-    if ll/(ll + l) > 0.8:
+    if ll / (ll + l) > 0.8:
         m = msg_[0].content
-        m = encoder.decode(encoder.encode(m)[:max_length-l])
+        m = encoder.decode(encoder.encode(m)[:max_length - l])
         msg[0].content = m
         return max_length, msg
     m = msg_[1].content
-    m = encoder.decode(encoder.encode(m)[:max_length-l])
+    m = encoder.decode(encoder.encode(m)[:max_length - l])
     msg[1].content = m
     return max_length, msg
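A minimal sketch of the trimming strategy that message_fit_in implements: drop assistant turns, then keep only the system prompt(s) plus the latest message, then truncate whichever side dominates the token budget. The whitespace token counter below is a hypothetical stand-in for the tokenizer-backed num_tokens_from_string/encoder used by the real code; this is illustration, not the actual implementation.

```python
# Sketch only; count() crudely stands in for num_tokens_from_string.
def fit_in(msg, max_length=4000):
    def count(m): return len(m["content"].split())

    if sum(count(m) for m in msg) < max_length:
        return msg
    # Fallback 1: keep only system and user turns.
    msg = [m for m in msg if m["role"] in ("system", "user")]
    if sum(count(m) for m in msg) < max_length:
        return msg
    # Fallback 2: keep leading system prompts plus the latest message.
    msg = [m for m in msg[:-1] if m["role"] == "system"] + [msg[-1]]
    if sum(count(m) for m in msg) < max_length:
        return msg
    # Fallback 3: truncate whichever message holds >80% of the tokens.
    ll, l = count(msg[0]), count(msg[-1])
    victim = 0 if ll / (ll + l) > 0.8 else -1
    budget = max_length - (l if victim == 0 else ll)
    msg[victim]["content"] = " ".join(msg[victim]["content"].split()[:budget])
    return msg
```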
@@ -148,8 +150,8 @@ def completion():
     req = request.json
     msg = []
     for m in req["messages"]:
-        if m["role"] == "system":continue
-        if m["role"] == "assistant" and not msg:continue
+        if m["role"] == "system": continue
+        if m["role"] == "assistant" and not msg: continue
         msg.append({"role": m["role"], "content": m["content"]})
     try:
         e, dia = DialogService.get_by_id(req["dialog_id"])
@@ -166,7 +168,7 @@ def chat(dialog, messages, **kwargs):
     assert messages[-1]["role"] == "user", "The last content of this conversation is not from user."
     llm = LLMService.query(llm_name=dialog.llm_id)
     if not llm:
-        raise LookupError("LLM(%s) not found"%dialog.llm_id)
+        raise LookupError("LLM(%s) not found" % dialog.llm_id)
     llm = llm[0]
     question = messages[-1]["content"]
     embd_mdl = LLMBundle(dialog.tenant_id, LLMType.EMBEDDING)
@@ -175,19 +177,21 @@ def chat(dialog, messages, **kwargs):
     field_map = KnowledgebaseService.get_field_map(dialog.kb_ids)
     ## try to use sql if field mapping is good to go
     if field_map:
-        markdown_tbl,chunks = use_sql(question, field_map, dialog.tenant_id, chat_mdl)
+        stat_logger.info("Use SQL for retrieval.")
+        markdown_tbl, chunks = use_sql(question, field_map, dialog.tenant_id, chat_mdl)
         if markdown_tbl:
             return {"answer": markdown_tbl, "retrieval": {"chunks": chunks}}

     prompt_config = dialog.prompt_config
     for p in prompt_config["parameters"]:
-        if p["key"] == "knowledge":continue
-        if p["key"] not in kwargs and not p["optional"]:raise KeyError("Miss parameter: " + p["key"])
+        if p["key"] == "knowledge": continue
+        if p["key"] not in kwargs and not p["optional"]: raise KeyError("Missing parameter: " + p["key"])
         if p["key"] not in kwargs:
-            prompt_config["system"] = prompt_config["system"].replace("{%s}"%p["key"], " ")
+            prompt_config["system"] = prompt_config["system"].replace("{%s}" % p["key"], " ")

-    kbinfos = retrievaler.retrieval(question, embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n, dialog.similarity_threshold,
-        dialog.vector_similarity_weight, top=1024, aggs=False)
+    kbinfos = retrievaler.retrieval(question, embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
+                                    dialog.similarity_threshold,
+                                    dialog.vector_similarity_weight, top=1024, aggs=False)
     knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]

     if not knowledges and prompt_config["empty_response"]:
@@ -202,17 +206,17 @@ def chat(dialog, messages, **kwargs):
         answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf)

     answer = retrievaler.insert_citations(answer,
-                     [ck["content_ltks"] for ck in kbinfos["chunks"]],
-                     [ck["vector"] for ck in kbinfos["chunks"]],
-                     embd_mdl,
-                     tkweight=1-dialog.vector_similarity_weight,
-                     vtweight=dialog.vector_similarity_weight)
+                                          [ck["content_ltks"] for ck in kbinfos["chunks"]],
+                                          [ck["vector"] for ck in kbinfos["chunks"]],
+                                          embd_mdl,
+                                          tkweight=1 - dialog.vector_similarity_weight,
+                                          vtweight=dialog.vector_similarity_weight)
     for c in kbinfos["chunks"]:
-        if c.get("vector"):del c["vector"]
+        if c.get("vector"): del c["vector"]
     return {"answer": answer, "retrieval": kbinfos}


-def use_sql(question,field_map, tenant_id, chat_mdl):
+def use_sql(question, field_map, tenant_id, chat_mdl):
     sys_prompt = "你是一个DBA。你需要针对以下表的字段结构,根据我的问题写出SQL。"
     user_prompt = """
表名:{};
@@ -220,37 +224,47 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
{}
问题:{}
-请写出SQL。
+请写出SQL,且只要SQL,不要有其他说明及文字。
""".format(
        index_name(tenant_id),
        "\n".join([f"{k}: {v}" for k, v in field_map.items()]),
        question
    )
-    sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_prompt}], {"temperature": 0.1})
-    sql = re.sub(r".*?select ", "select ", sql, flags=re.IGNORECASE)
+    sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_prompt}], {"temperature": 0.06})
+    stat_logger.info(f"“{question}” get SQL: {sql}")
+    sql = re.sub(r"[\r\n]+", " ", sql.lower())
+    sql = re.sub(r".*?select ", "select ", sql.lower())
     sql = re.sub(r" +", " ", sql)
-    sql = re.sub(r"[;;].*", "", sql)
-    if sql[:len("select ")].lower() != "select ":
+    sql = re.sub(r"([;;]|```).*", "", sql)
+    if sql[:len("select ")] != "select ":
         return None, None
-    if sql[:len("select *")].lower() != "select *":
+    if sql[:len("select *")] != "select *":
         sql = "select doc_id,docnm_kwd," + sql[6:]
+    else:
+        flds = []
+        for k in field_map.keys():
+            if k in forbidden_select_fields4resume: continue
+            if len(flds) > 11: break
+            flds.append(k)
+        sql = "select doc_id,docnm_kwd," + ",".join(flds) + sql[8:]

-    tbl = retrievaler.sql_retrieval(sql)
-    if not tbl: return None, None
+    stat_logger.info(f"“{question}” get SQL(refined): {sql}")
+    tbl = retrievaler.sql_retrieval(sql, format="json")
+    if not tbl or len(tbl["rows"]) == 0: return None, None

     docid_idx = set([ii for ii, c in enumerate(tbl["columns"]) if c["name"] == "doc_id"])
     docnm_idx = set([ii for ii, c in enumerate(tbl["columns"]) if c["name"] == "docnm_kwd"])
-    clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx|docnm_idx)]
+    clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)]

     # compose markdown table
-    clmns = "|".join([re.sub(r"/.*", "", field_map.get(tbl["columns"][i]["name"], f"C{i}")) for i in clmn_idx]) + "|原文"
+    clmns = "|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], f"C{i}")) for i in clmn_idx]) + "|原文"
     line = "|".join(["------" for _ in range(len(clmn_idx))]) + "|------"
-    rows = ["|".join([str(r[i]) for i in clmn_idx])+"|" for r in tbl["rows"]]
+    rows = ["|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
     if not docid_idx or not docnm_idx:
         access_logger.error("SQL missing field: " + sql)
         return "\n".join([clmns, line, "\n".join(rows)]), []

-    rows = "\n".join([r+f"##{ii}$$" for ii,r in enumerate(rows)])
+    rows = "\n".join([r + f"##{ii}$$" for ii, r in enumerate(rows)])
     docid_idx = list(docid_idx)[0]
     docnm_idx = list(docnm_idx)[0]
     return "\n".join([clmns, line, rows]), [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]]
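The tail of use_sql turns the {"columns": [...], "rows": [...]} payload from sql_retrieval into a markdown table. Below is a simplified, self-contained sketch of that composition, run against an invented payload; all field names and values here are made up for illustration, and the header cleanup is reduced to taking the first '/'-separated synonym.

```python
# Hypothetical payload mirroring the shape sql_retrieval(format="json") returns.
tbl = {
    "columns": [{"name": "doc_id"}, {"name": "docnm_kwd"},
                {"name": "name_kwd"}, {"name": "age_int"}],
    "rows": [["d1", "resume_a.pdf", "Alice", 30],
             ["d2", "resume_b.pdf", "Bob", None]],
}
field_map = {"name_kwd": "姓名/名字", "age_int": "年龄"}

docid_idx = [i for i, c in enumerate(tbl["columns"]) if c["name"] == "doc_id"][0]
docnm_idx = [i for i, c in enumerate(tbl["columns"]) if c["name"] == "docnm_kwd"][0]
clmn_idx = [i for i in range(len(tbl["columns"])) if i not in (docid_idx, docnm_idx)]

# Header row: map ES field names to human-readable labels, first synonym only.
clmns = "|".join(field_map.get(tbl["columns"][i]["name"], f"C{i}").split("/")[0]
                 for i in clmn_idx)
line = "|".join("------" for _ in clmn_idx)
rows = "\n".join("|".join(str(r[i]) if r[i] is not None else " " for i in clmn_idx)
                 for r in tbl["rows"])
print("\n".join([clmns, line, rows]))
```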
diff --git a/api/apps/dialog_app.py b/api/apps/dialog_app.py
index d46a892..083d412 100644
--- a/api/apps/dialog_app.py
+++ b/api/apps/dialog_app.py
@@ -27,7 +27,7 @@ from api.utils.api_utils import get_json_result

 @manager.route('/set', methods=['POST'])
 @login_required
-def set():
+def set_dialog():
     req = request.json
     dialog_id = req.get("dialog_id")
     name = req.get("name", "New Dialog")
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 0b949cc..0649ada 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -262,17 +262,18 @@ def rename():
         return server_error_response(e)


-@manager.route('/get', methods=['GET'])
-@login_required
-def get():
-    doc_id = request.args["doc_id"]
+@manager.route('/get/<doc_id>', methods=['GET'])
+def get(doc_id):
     try:
         e, doc = DocumentService.get_by_id(doc_id)
         if not e:
             return get_data_error_result(retmsg="Document not found!")
-        blob = MINIO.get(doc.kb_id, doc.location)
-        return get_json_result(data={"base64": base64.b64decode(blob)})
+        response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
+        ext = re.search(r"\.([^.]+)$", doc.name)
+        if ext:
+            response.headers.set('Content-Type', 'application/%s' % ext.group(1))
+        return response
     except Exception as e:
         return server_error_response(e)
diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
index a7f4009..b0ba165 100644
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@@ -38,6 +38,9 @@ def create():
         req["id"] = get_uuid()
         req["tenant_id"] = current_user.id
         req["created_by"] = current_user.id
+        e, t = TenantService.get_by_id(current_user.id)
+        if not e: return get_data_error_result(retmsg="Tenant not found.")
+        req["embd_id"] = t.embd_id
         if not KnowledgebaseService.save(**req): return get_data_error_result()
         return get_json_result(data={"kb_id": req["id"]})
     except Exception as e:
diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py
index 43f3109..c70f7ea 100644
--- a/api/apps/llm_app.py
+++ b/api/apps/llm_app.py
@@ -21,11 +21,12 @@ from api.db.services.llm_service import LLMFactoriesService, TenantLLMService, L
 from api.db.services.user_service import TenantService, UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid, get_format_time
-from api.db import StatusEnum, UserTenantRole
+from api.db import StatusEnum, UserTenantRole, LLMType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.db_models import Knowledgebase, TenantLLM
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result
+from rag.llm import EmbeddingModel, CvModel, ChatModel


 @manager.route('/factories', methods=['GET'])
@@ -43,16 +44,37 @@ def factories():
 @validate_request("llm_factory", "api_key")
 def set_api_key():
     req = request.json
+    # test if api key works
+    msg = ""
+    for llm in LLMService.query(fid=req["llm_factory"]):
+        if llm.model_type == LLMType.EMBEDDING.value:
+            mdl = EmbeddingModel[req["llm_factory"]](
+                req["api_key"], llm.llm_name)
+            try:
+                arr, tc = mdl.encode(["Test if the api key is available"])
+                if len(arr[0]) == 0 or tc == 0: raise Exception("Fail")
+            except Exception as e:
+                msg += f"\nFailed to access embedding model({llm.llm_name}) using this API key."
+        elif llm.model_type == LLMType.CHAT.value:
+            mdl = ChatModel[req["llm_factory"]](
+                req["api_key"], llm.llm_name)
+            try:
+                m, tc = mdl.chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], {"temperature": 0.9})
+                if not tc: raise Exception(m)
+            except Exception as e:
+                msg += f"\nFailed to access model({llm.llm_name}) using this API key." + str(e)
+
+    if msg: return get_data_error_result(retmsg=msg)
+
     llm = {
         "tenant_id": current_user.id,
         "llm_factory": req["llm_factory"],
         "api_key": req["api_key"]
     }
-    # TODO: Test api_key
     for n in ["model_type", "llm_name"]:
         if n in req: llm[n] = req[n]

-    TenantLLM.insert(**llm).on_conflict("replace").execute()
+    TenantLLMService.filter_update([TenantLLM.tenant_id == llm["tenant_id"], TenantLLM.llm_factory == llm["llm_factory"]], llm)
     return get_json_result(data=True)


@@ -69,6 +91,7 @@ def my_llms():
 @manager.route('/list', methods=['GET'])
 @login_required
 def list():
+    model_type = request.args.get("model_type")
     try:
         objs = TenantLLMService.query(tenant_id=current_user.id)
         mdlnms = set([o.to_dict()["llm_name"] for o in objs if o.api_key])
@@ -79,6 +102,7 @@ def list():

         res = {}
         for m in llms:
+            if model_type and m["model_type"] != model_type: continue
             if m["fid"] not in res: res[m["fid"]] = []
             res[m["fid"]].append(m)
diff --git a/api/apps/user_app.py b/api/apps/user_app.py
index 6ca77f0..0f08e0e 100644
--- a/api/apps/user_app.py
+++ b/api/apps/user_app.py
@@ -24,7 +24,8 @@ from api.db.services.llm_service import TenantLLMService, LLMService
 from api.utils.api_utils import server_error_response, validate_request
 from api.utils import get_uuid, get_format_time, decrypt, download_img
 from api.db import UserTenantRole, LLMType
-from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS
+from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
+    LLM_FACTORY
 from api.db.services.user_service import UserService, TenantService, UserTenantService
 from api.settings import stat_logger
 from api.utils.api_utils import get_json_result, cors_reponse
@@ -204,8 +205,8 @@ def user_register(user_id, user):
         "role": UserTenantRole.OWNER
     }
     tenant_llm = []
-    for llm in LLMService.query(fid="Infiniflow"):
-        tenant_llm.append({"tenant_id": user_id, "llm_factory": "Infiniflow", "llm_name": llm.llm_name, "model_type":llm.model_type, "api_key": "infiniflow API Key"})
+    for llm in LLMService.query(fid=LLM_FACTORY):
+        tenant_llm.append({"tenant_id": user_id, "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type": llm.model_type, "api_key": API_KEY})

     if not UserService.save(**user):return
     TenantService.save(**tenant)
diff --git a/api/db/db_models.py b/api/db/db_models.py
index 210b83b..282a566 100644
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@@ -465,7 +465,8 @@ class Knowledgebase(DataBaseModel):
     tenant_id = CharField(max_length=32, null=False)
     name = CharField(max_length=128, null=False, help_text="KB name", index=True)
     description = TextField(null=True, help_text="KB description")
-    permission = CharField(max_length=16, null=False, help_text="me|team")
+    embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID")
+    permission = CharField(max_length=16, null=False, help_text="me|team", default="me")
     created_by = CharField(max_length=32, null=False)
     doc_num = IntegerField(default=0)
     token_num = IntegerField(default=0)
diff --git a/api/db/init_data.py b/api/db/init_data.py
index 319524d..5e4a812 100644
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@@ -46,11 +46,6 @@ def init_llm_factory():
         "logo": "",
         "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
         "status": "1",
-    },{
-        "name": "Infiniflow",
-        "logo": "",
-        "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
-        "status": "1",
     },{
         "name": "智谱AI",
         "logo": "",
@@ -135,59 +130,33 @@ def init_llm_factory():
"model_type": LLMType.SPEECH2TEXT.value },{ "fid": factory_infos[1]["name"], - "llm_name": "qwen_vl_chat_v1", - "tags": "LLM,CHAT,IMAGE2TEXT", - "max_tokens": 765, - "model_type": LLMType.IMAGE2TEXT.value - }, - # ----------------------- Infiniflow ----------------------- - { - "fid": factory_infos[2]["name"], - "llm_name": "gpt-3.5-turbo", - "tags": "LLM,CHAT,4K", - "max_tokens": 4096, - "model_type": LLMType.CHAT.value - },{ - "fid": factory_infos[2]["name"], - "llm_name": "text-embedding-ada-002", - "tags": "TEXT EMBEDDING,8K", - "max_tokens": 8191, - "model_type": LLMType.EMBEDDING.value - },{ - "fid": factory_infos[2]["name"], - "llm_name": "whisper-1", - "tags": "SPEECH2TEXT", - "max_tokens": 25*1024*1024, - "model_type": LLMType.SPEECH2TEXT.value - },{ - "fid": factory_infos[2]["name"], - "llm_name": "gpt-4-vision-preview", + "llm_name": "qwen-vl-max", "tags": "LLM,CHAT,IMAGE2TEXT", "max_tokens": 765, "model_type": LLMType.IMAGE2TEXT.value }, # ---------------------- ZhipuAI ---------------------- { - "fid": factory_infos[3]["name"], + "fid": factory_infos[2]["name"], "llm_name": "glm-3-turbo", "tags": "LLM,CHAT,", "max_tokens": 128 * 1000, "model_type": LLMType.CHAT.value }, { - "fid": factory_infos[3]["name"], + "fid": factory_infos[2]["name"], "llm_name": "glm-4", "tags": "LLM,CHAT,", "max_tokens": 128 * 1000, "model_type": LLMType.CHAT.value }, { - "fid": factory_infos[3]["name"], + "fid": factory_infos[2]["name"], "llm_name": "glm-4v", "tags": "LLM,CHAT,IMAGE2TEXT", "max_tokens": 2000, "model_type": LLMType.IMAGE2TEXT.value }, { - "fid": factory_infos[3]["name"], + "fid": factory_infos[2]["name"], "llm_name": "embedding-2", "tags": "TEXT EMBEDDING", "max_tokens": 512, diff --git a/api/db/services/knowledgebase_service.py b/api/db/services/knowledgebase_service.py index 0851e6b..236b8d0 100644 --- a/api/db/services/knowledgebase_service.py +++ b/api/db/services/knowledgebase_service.py @@ -77,9 +77,12 @@ class KnowledgebaseService(CommonService): if isinstance(v, dict): assert isinstance(old[k], dict) dfs_update(old[k], v) + if isinstance(v, list): + assert isinstance(old[k], list) + old[k] = list(set(old[k]+v)) else: old[k] = v dfs_update(m.parser_config, config) - cls.update_by_id(id, m.parser_config) + cls.update_by_id(id, {"parser_config": m.parser_config}) @classmethod @@ -88,6 +91,6 @@ class KnowledgebaseService(CommonService): conf = {} for k in cls.get_by_ids(ids): if k.parser_config and "field_map" in k.parser_config: - conf.update(k.parser_config) + conf.update(k.parser_config["field_map"]) return conf diff --git a/api/settings.py b/api/settings.py index 4493bdb..a5882a5 100644 --- a/api/settings.py +++ b/api/settings.py @@ -43,12 +43,14 @@ REQUEST_MAX_WAIT_SEC = 300 USE_REGISTRY = get_base_config("use_registry") -LLM = get_base_config("llm", {}) -CHAT_MDL = LLM.get("chat_model", "gpt-3.5-turbo") -EMBEDDING_MDL = LLM.get("embedding_model", "text-embedding-ada-002") -ASR_MDL = LLM.get("asr_model", "whisper-1") +LLM = get_base_config("user_default_llm", {}) +LLM_FACTORY=LLM.get("factory", "通义ĺŤé—®") +CHAT_MDL = LLM.get("chat_model", "qwen-plus") +EMBEDDING_MDL = LLM.get("embedding_model", "text-embedding-v2") +ASR_MDL = LLM.get("asr_model", "paraformer-realtime-8k-v1") +IMAGE2TEXT_MDL = LLM.get("image2text_model", "qwen-vl-max") +API_KEY = LLM.get("api_key", "infiniflow API Key") PARSERS = LLM.get("parsers", "general:General,qa:Q&A,resume:Resume,naive:Naive,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture") 
-IMAGE2TEXT_MDL = LLM.get("image2text_model", "gpt-4-vision-preview")

 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py
index 7c54302..b504cfe 100644
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@@ -164,10 +164,10 @@ def thumbnail(filename, blob):
         buffered = BytesIO()
         Image.frombytes("RGB", [pix.width, pix.height],
                         pix.samples).save(buffered, format="png")
-        return "data:image/png;base64," + base64.b64encode(buffered.getvalue())
+        return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")

     if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
-        return ("data:image/%s;base64,"%filename.split(".")[-1]) + base64.b64encode(Image.open(BytesIO(blob)).thumbnail((30, 30)).tobytes())
+        image = Image.open(BytesIO(blob))
+        image.thumbnail((30, 30))  # thumbnail() resizes in place and returns None
+        return ("data:image/%s;base64," % filename.split(".")[-1]) + base64.b64encode(image.tobytes()).decode("utf-8")

     if re.match(r".*\.(ppt|pptx)$", filename):
         import aspose.slides as slides
@@ -176,7 +176,7 @@ def thumbnail(filename, blob):
             with slides.Presentation(BytesIO(blob)) as presentation:
                 buffered = BytesIO()
                 presentation.slides[0].get_thumbnail(0.03, 0.03).save(buffered, drawing.imaging.ImageFormat.png)
-                return "data:image/png;base64," + base64.b64encode(buffered.getvalue())
+                return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
     except Exception as e:
         pass
diff --git a/conf/mapping.json b/conf/mapping.json
index 95572dd..713153e 100644
--- a/conf/mapping.json
+++ b/conf/mapping.json
@@ -118,11 +118,45 @@
         },
         {
             "dense_vector": {
-                "match": "*_vec",
+                "match": "*_512_vec",
                 "mapping": {
                     "type": "dense_vector",
                     "index": true,
-                    "similarity": "cosine"
+                    "similarity": "cosine",
+                    "dims": 512
+                }
+            }
+        },
+        {
+            "dense_vector": {
+                "match": "*_768_vec",
+                "mapping": {
+                    "type": "dense_vector",
+                    "index": true,
+                    "similarity": "cosine",
+                    "dims": 768
+                }
+            }
+        },
+        {
+            "dense_vector": {
+                "match": "*_1024_vec",
+                "mapping": {
+                    "type": "dense_vector",
+                    "index": true,
+                    "similarity": "cosine",
+                    "dims": 1024
+                }
+            }
+        },
+        {
+            "dense_vector": {
+                "match": "*_1536_vec",
+                "mapping": {
+                    "type": "dense_vector",
+                    "index": true,
+                    "similarity": "cosine",
+                    "dims": 1536
                 }
             }
         },
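These dynamic templates select the vector dimension from the field-name suffix, so a chunk only has to name its embedding field accordingly for the right mapping to apply. A sketch with an invented document follows; the q_<dims>_vec naming is an assumption that simply follows the *_vec wildcard patterns above.

```python
# Invented example document: because the embedding field is named
# "q_768_vec", the "*_768_vec" dynamic template above matches it and
# maps the field as a 768-dim cosine dense_vector at index time.
chunk = {
    "doc_id": "d0",
    "docnm_kwd": "sample.pdf",
    "content_with_weight": "some chunk text",
    "q_768_vec": [0.01] * 768,
}
```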
diff --git a/conf/service_conf.yaml b/conf/service_conf.yaml
index dd8cca1..1aa8cb8 100644
--- a/conf/service_conf.yaml
+++ b/conf/service_conf.yaml
@@ -11,7 +11,7 @@ permission:
   dataset: false
 ragflow:
   # you must set real ip address, 127.0.0.1 and 0.0.0.0 is not supported
-  host: 127.0.0.1
+  host: 0.0.0.0
   http_port: 9380
 database:
   name: 'rag_flow'
@@ -21,6 +21,19 @@ database:
   port: 5455
   max_connections: 100
   stale_timeout: 30
+minio:
+  user: 'rag_flow'
+  passwd: 'infini_rag_flow'
+  host: '123.60.95.134:9000'
+es:
+  hosts: 'http://123.60.95.134:9200'
+user_default_llm:
+  factory: '通义千问'
+  chat_model: 'qwen-plus'
+  embedding_model: 'text-embedding-v2'
+  asr_model: 'paraformer-realtime-8k-v1'
+  image2text_model: 'qwen-vl-max'
+  api_key: 'sk-xxxxxxxxxxxxx'
 oauth:
   github:
     client_id: 302129228f0d96055bee
diff --git a/rag/app/book.py b/rag/app/book.py
index 4b38d09..75b9f08 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -39,6 +39,11 @@ class Pdf(HuParser):


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+    Supported file formats are docx, pdf, txt.
+    Since a book is long and not all of its parts are useful, if it's a PDF,
+    please set up the page ranges for every book in order to eliminate negative effects and save computing time.
+    """
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
diff --git a/rag/app/laws.py b/rag/app/laws.py
index ebeebf0..0c4bca1 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -2,7 +2,6 @@ import copy
 import re
 from io import BytesIO
 from docx import Document
-import numpy as np
 from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title
 from rag.nlp import huqie
@@ -59,6 +58,9 @@ class Pdf(HuParser):


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+    Supported file formats are docx, pdf, txt.
+    """
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
diff --git a/rag/app/manual.py b/rag/app/manual.py
index a35fdfd..e8a9dad 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -58,8 +58,10 @@ class Pdf(HuParser):


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+    Only pdf is supported.
+    """
     pdf_parser = None
-    paper = {}

     if re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
diff --git a/rag/app/naive.py b/rag/app/naive.py
index b6a26f9..8c80d5f 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -6,6 +6,7 @@ from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 from rag.settings import cron_logger

+
 class Pdf(HuParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
@@ -20,12 +21,18 @@ class Pdf(HuParser):
         start = timer()
         self._layouts_paddle(zoomin)
         callback(0.77, "Layout analysis finished")
-        cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
+        cron_logger.info("paddle layouts: {}".format((timer() - start) / (self.total_page + 0.1)))
         self._naive_vertical_merge()
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+    Supported file formats are docx, pdf, txt.
+    This method applies a naive way to chunk files:
+    successive text is sliced into pieces using the 'delimiter', and
+    these pieces are then merged into chunks whose token count does not exceed 'Max token number'.
+    """
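A rough sketch of the slice-then-merge behavior the docstring above describes. The real merging lives in naive_merge (rag/nlp); this standalone version uses a whitespace token count and is illustrative only.

```python
import re

def naive_merge_sketch(sections, chunk_token_num=128, delimiter="\n!?。;!?"):
    # Slice successive text at any delimiter character, then greedily pack
    # the pieces into chunks whose (approximate) token count stays in budget.
    pieces = []
    for text, _tag in sections:
        pieces += [p for p in re.split("([%s])" % re.escape(delimiter), text) if p.strip()]
    chunks, current, tokens = [], "", 0
    for p in pieces:
        t = len(p.split())  # crude stand-in for a real tokenizer
        if tokens + t > chunk_token_num and current:
            chunks.append(current)
            current, tokens = "", 0
        current += p
        tokens += t
    if current:
        chunks.append(current)
    return chunks

print(naive_merge_sketch([("First sentence. Second one!\nAnother line.", "")], chunk_token_num=4))
```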
+ """ doc = { "docnm_kwd": filename, "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) @@ -41,24 +48,26 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k elif re.search(r"\.pdf$", filename, re.IGNORECASE): pdf_parser = Pdf() sections = pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback) + from_page=from_page, to_page=to_page, callback=callback) elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = "" - if binary:txt = binary.decode("utf-8") + if binary: + txt = binary.decode("utf-8") else: with open(filename, "r") as f: while True: l = f.readline() - if not l:break + if not l: break txt += l sections = txt.split("\n") - sections = [(l,"") for l in sections if l] + sections = [(l, "") for l in sections if l] callback(0.8, "Finish parsing.") - else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)") + else: + raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)") - parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimer": "\n。;ďĽďĽź"}) - cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimer"]) + parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;ďĽďĽź"}) + cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"]) eng = is_english(cks) res = [] # wrap up to es documents @@ -75,6 +84,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k if __name__ == "__main__": import sys + + def dummy(a, b): pass + + chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) diff --git a/rag/app/paper.py b/rag/app/paper.py index 52c7a62..4f464ac 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -129,6 +129,10 @@ class Pdf(HuParser): def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): + """ + Only pdf is supported. + The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly. + """ pdf_parser = None if re.search(r"\.pdf$", filename, re.IGNORECASE): pdf_parser = Pdf() diff --git a/rag/app/presentation.py b/rag/app/presentation.py index ff805bb..afcb8f2 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -94,6 +94,11 @@ class Pdf(HuParser): def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): + """ + The supported file formats are pdf, pptx. + Every page will be treated as a chunk. And the thumbnail of every page will be stored. + PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary. + """ doc = { "docnm_kwd": filename, "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) diff --git a/rag/app/qa.py b/rag/app/qa.py index 75ebd94..9d55d1b 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -70,7 +70,17 @@ def beAdoc(d, q, a, eng): def chunk(filename, binary=None, callback=None, **kwargs): + """ + Excel and csv(txt) format files are supported. + If the file is in excel format, there should be 2 column question and answer without header. + And question column is ahead of answer column. + And it's O.K if it has multiple sheets as long as the columns are rightly composed. + If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer. + + All the deformed lines will be ignored. + Every pair of Q&A will be treated as a chunk. 
+ """ res = [] if re.search(r"\.xlsx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") diff --git a/rag/app/resume.py b/rag/app/resume.py index 14649bb..fd9dc03 100644 --- a/rag/app/resume.py +++ b/rag/app/resume.py @@ -4,24 +4,34 @@ import os import re import requests from api.db.services.knowledgebase_service import KnowledgebaseService +from api.settings import stat_logger from rag.nlp import huqie from rag.settings import cron_logger from rag.utils import rmSpace +forbidden_select_fields4resume = [ + "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd" +] def chunk(filename, binary=None, callback=None, **kwargs): + """ + The supported file formats are pdf, docx and txt. + To maximize the effectiveness, parse the resume correctly, + please visit https://github.com/infiniflow/ragflow, and sign in the our demo web-site + to get token. It's FREE! + Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN in '.env' file or + using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN in docker container. + """ if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE): raise NotImplementedError("file type not supported yet(pdf supported)") url = os.environ.get("INFINIFLOW_SERVER") - if not url: - raise EnvironmentError( - "Please set environment variable: 'INFINIFLOW_SERVER'") token = os.environ.get("INFINIFLOW_TOKEN") - if not token: - raise EnvironmentError( - "Please set environment variable: 'INFINIFLOW_TOKEN'") + if not url or not token: + stat_logger.warning( + "INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.") + return [] if not binary: with open(filename, "rb") as f: @@ -44,22 +54,28 @@ def chunk(filename, binary=None, callback=None, **kwargs): callback(0.2, "Resume parsing is going on...") resume = remote_call() + if len(resume.keys()) < 7: + callback(-1, "Resume is not successfully parsed.") + return [] callback(0.6, "Done parsing. 
Chunking...") print(json.dumps(resume, ensure_ascii=False, indent=2)) field_map = { "name_kwd": "姓ĺŤ/ĺŤĺ—", + "name_pinyin_kwd": "姓ĺŤć‹Ľéźł/ĺŤĺ—ć‹Ľéźł", "gender_kwd": "性ĺ«ďĽç”·ďĽŚĺĄłďĽ‰", "age_int": "ĺą´éľ„/ĺ˛/年纪", "phone_kwd": "电话/手机/微信", "email_tks": "email/e-mail/邮箱", "position_name_tks": "čŚä˝Ť/čŚč˝/岗位/čŚč´Ł", - "expect_position_name_tks": "ćśźćś›čŚä˝Ť/ćśźćś›čŚč˝/期望岗位", + "expect_city_names_tks": "期望城市", + "work_exp_flt": "工作年é™/工作年份/N年经验/毕业了多少年", + "corporation_name_tks": "最近就čŚ(上çŹ)的公司/上一家公司", - "hightest_degree_kwd": "最é«ĺ¦ĺŽ†ďĽé«ä¸ďĽŚčŚé«ďĽŚçˇ•ĺŁ«ďĽŚćś¬ç§‘,博士,ĺťä¸ďĽŚä¸ćŠ€ďĽŚä¸ä¸“,专科,专升本,MPA,MBA,EMBA)", + "first_school_name_tks": "第一ĺ¦ĺŽ†ćŻ•ä¸šĺ¦ć ˇ", "first_degree_kwd": "第一ĺ¦ĺŽ†ďĽé«ä¸ďĽŚčŚé«ďĽŚçˇ•ĺŁ«ďĽŚćś¬ç§‘,博士,ĺťä¸ďĽŚä¸ćŠ€ďĽŚä¸ä¸“,专科,专升本,MPA,MBA,EMBA)", + "highest_degree_kwd": "最é«ĺ¦ĺŽ†ďĽé«ä¸ďĽŚčŚé«ďĽŚçˇ•ĺŁ«ďĽŚćś¬ç§‘,博士,ĺťä¸ďĽŚä¸ćŠ€ďĽŚä¸ä¸“,专科,专升本,MPA,MBA,EMBA)", "first_major_tks": "第一ĺ¦ĺŽ†ä¸“业", - "first_school_name_tks": "第一ĺ¦ĺŽ†ćŻ•ä¸šĺ¦ć ˇ", "edu_first_fea_kwd": "第一ĺ¦ĺŽ†ć ‡çľďĽ211,留ĺ¦ďĽŚĺŹŚä¸€ćµďĽŚ985,海外知ĺŤďĽŚé‡Ťç‚ąĺ¤§ĺ¦ďĽŚä¸ä¸“,专升本,专科,本科,大专)", "degree_kwd": "过往ĺ¦ĺŽ†ďĽé«ä¸ďĽŚčŚé«ďĽŚçˇ•ĺŁ«ďĽŚćś¬ç§‘,博士,ĺťä¸ďĽŚä¸ćŠ€ďĽŚä¸ä¸“,专科,专升本,MPA,MBA,EMBA)", @@ -68,14 +84,14 @@ def chunk(filename, binary=None, callback=None, **kwargs): "sch_rank_kwd": "ĺ¦ć ˇć ‡çľďĽéˇ¶ĺ°–ĺ¦ć ˇďĽŚç˛ľč‹±ĺ¦ć ˇďĽŚäĽč´¨ĺ¦ć ˇďĽŚä¸€č¬ĺ¦ć ˇďĽ‰", "edu_fea_kwd": "ć•™č‚˛ć ‡çľďĽ211,留ĺ¦ďĽŚĺŹŚä¸€ćµďĽŚ985,海外知ĺŤďĽŚé‡Ťç‚ąĺ¤§ĺ¦ďĽŚä¸ä¸“,专升本,专科,本科,大专)", - "work_exp_flt": "工作年é™/工作年份/N年经验/毕业了多少年", - "birth_dt": "生日/出生年份", "corp_nm_tks": "ĺ°±čŚčż‡çš„公司/之前的公司/上过çŹçš„公司", - "corporation_name_tks": "最近就čŚ(上çŹ)的公司/上一家公司", "edu_end_int": "毕业年份", - "expect_city_names_tks": "期望城市", - "industry_name_tks": "所在行业" + "industry_name_tks": "所在行业", + + "birth_dt": "生日/出生年份", + "expect_position_name_tks": "ćśźćś›čŚä˝Ť/ćśźćś›čŚč˝/期望岗位", } + titles = [] for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]: v = resume.get(n, "") @@ -105,6 +121,10 @@ def chunk(filename, binary=None, callback=None, **kwargs): doc["content_ltks"] = huqie.qie(doc["content_with_weight"]) doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"]) for n, _ in field_map.items(): + if n not in resume:continue + if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume): + resume[n] = resume[n][0] + if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n]) doc[n] = resume[n] print(doc) diff --git a/rag/app/table.py b/rag/app/table.py index a078a1a..c80b3fb 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -100,7 +100,20 @@ def column_data_type(arr): def chunk(filename, binary=None, callback=None, **kwargs): - dfs = [] + """ + Excel and csv(txt) format files are supported. + For csv or txt file, the delimiter between columns is TAB. + The first line must be column headers. + Column headers must be meaningful terms inorder to make our NLP model understanding. + It's good to enumerate some synonyms using slash '/' to separate, and even better to + enumerate values using brackets like 'gender/sex(male, female)'. + Here are some examples for headers: + 1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL) + 2. 姓ĺŤ/ĺŤĺ—\t电话/手机/微信\t最é«ĺ¦ĺŽ†ďĽé«ä¸ďĽŚčŚé«ďĽŚçˇ•ĺŁ«ďĽŚćś¬ç§‘,博士,ĺťä¸ďĽŚä¸ćŠ€ďĽŚä¸ä¸“,专科,专升本,MPA,MBA,EMBA) + + Every row in table will be treated as a chunk. 
+ """ + if re.search(r"\.xlsx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") excel_parser = Excel() @@ -155,7 +168,7 @@ def chunk(filename, binary=None, callback=None, **kwargs): del df[n] clmns = df.columns.values txts = list(copy.deepcopy(clmns)) - py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns] + py_clmns = [PY.get_pinyins(re.sub(r"(/.*|ďĽ[^ďĽďĽ‰]+?)|\([^()]+?\))", "", n), '_')[0] for n in clmns] clmn_tys = [] for j in range(len(clmns)): cln, ty = column_data_type(df[clmns[j]]) diff --git a/rag/llm/__init__.py b/rag/llm/__init__.py index 129bf60..462313d 100644 --- a/rag/llm/__init__.py +++ b/rag/llm/__init__.py @@ -21,7 +21,7 @@ from .cv_model import * EmbeddingModel = { "Infiniflow": HuEmbedding, "OpenAI": OpenAIEmbed, - "通义ĺŤé—®": QWenEmbed, + "通义ĺŤé—®": HuEmbedding, #QWenEmbed, } diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py index 7868eb2..cdc2c8b 100644 --- a/rag/llm/chat_model.py +++ b/rag/llm/chat_model.py @@ -32,7 +32,7 @@ class GptTurbo(Base): self.model_name = model_name def chat(self, system, history, gen_conf): - history.insert(0, {"role": "system", "content": system}) + if system: history.insert(0, {"role": "system", "content": system}) res = self.client.chat.completions.create( model=self.model_name, messages=history, @@ -49,11 +49,12 @@ class QWenChat(Base): def chat(self, system, history, gen_conf): from http import HTTPStatus - history.insert(0, {"role": "system", "content": system}) + if system: history.insert(0, {"role": "system", "content": system}) response = Generation.call( self.model_name, messages=history, - result_format='message' + result_format='message', + **gen_conf ) if response.status_code == HTTPStatus.OK: return response.output.choices[0]['message']['content'], response.usage.output_tokens @@ -68,10 +69,11 @@ class ZhipuChat(Base): def chat(self, system, history, gen_conf): from http import HTTPStatus - history.insert(0, {"role": "system", "content": system}) + if system: history.insert(0, {"role": "system", "content": system}) response = self.client.chat.completions.create( self.model_name, - messages=history + messages=history, + **gen_conf ) if response.status_code == HTTPStatus.OK: return response.output.choices[0]['message']['content'], response.usage.completion_tokens diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py index 94277d9..cd52586 100644 --- a/rag/llm/embedding_model.py +++ b/rag/llm/embedding_model.py @@ -100,11 +100,11 @@ class QWenEmbed(Base): input=texts[i:i+batch_size], text_type="document" ) - embds = [[]] * len(resp["output"]["embeddings"]) + embds = [[] for _ in range(len(resp["output"]["embeddings"]))] for e in resp["output"]["embeddings"]: embds[e["text_index"]] = e["embedding"] res.extend(embds) - token_count += resp["usage"]["input_tokens"] + token_count += resp["usage"]["total_tokens"] return np.array(res), token_count def encode_queries(self, text): @@ -113,7 +113,7 @@ class QWenEmbed(Base): input=text[:2048], text_type="query" ) - return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["input_tokens"] + return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["total_tokens"] from zhipuai import ZhipuAI diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 85f2449..580caaf 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -92,7 +92,7 @@ class Dealer: assert emb_mdl, "No embedding model selected" s["knn"] = self._vector( qst, emb_mdl, req.get( - "similarity", 0.4), ps) + "similarity", 0.1), ps) 
s["knn"]["filter"] = bqry.to_dict() if "highlight" in s: del s["highlight"] @@ -106,7 +106,7 @@ class Dealer: bqry.filter.append(Q("terms", kb_id=req["kb_ids"])) s["query"] = bqry.to_dict() s["knn"]["filter"] = bqry.to_dict() - s["knn"]["similarity"] = 0.7 + s["knn"]["similarity"] = 0.17 res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src) kwds = set([]) @@ -171,7 +171,7 @@ class Dealer: continue if not isinstance(v, type("")): m[n] = str(m[n]) - m[n] = rmSpace(m[n]) + if n.find("tks")>0: m[n] = rmSpace(m[n]) if m: res[d["id"]] = m @@ -303,21 +303,22 @@ class Dealer: return ranks - def sql_retrieval(self, sql, fetch_size=128): + def sql_retrieval(self, sql, fetch_size=128, format="json"): sql = re.sub(r"[ ]+", " ", sql) + sql = sql.replace("%", "") + es_logger.info(f"Get es sql: {sql}") replaces = [] - for r in re.finditer(r" ([a-z_]+_l?tks like |[a-z_]+_l?tks ?= ?)'([^']+)'", sql): - fld, v = r.group(1), r.group(2) - fld = re.sub(r" ?(like|=)$", "", fld).lower() - if v[0] == "%%": v = v[1:-1] - match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qie(v)) - replaces.append((r.group(1)+r.group(2), match)) + for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql): + fld, v = r.group(1), r.group(3) + match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qieqie(huqie.qie(v))) + replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match)) - for p, r in replaces: sql.replace(p, r) + for p, r in replaces: sql = sql.replace(p, r, 1) + es_logger.info(f"To es: {sql}") try: - tbl = self.es.sql(sql, fetch_size) + tbl = self.es.sql(sql, fetch_size, format) return tbl except Exception as e: - es_logger(f"SQL failure: {sql} =>" + str(e)) + es_logger.error(f"SQL failure: {sql} =>" + str(e)) diff --git a/rag/parser/pdf_parser.py b/rag/parser/pdf_parser.py index 32cd636..79611a7 100644 --- a/rag/parser/pdf_parser.py +++ b/rag/parser/pdf_parser.py @@ -53,9 +53,10 @@ class HuParser: def __remote_call(self, species, images, thr=0.7): url = os.environ.get("INFINIFLOW_SERVER") - if not url:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_SERVER'") token = os.environ.get("INFINIFLOW_TOKEN") - if not token:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_TOKEN'") + if not url or not token: + logging.warning("INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! 
diff --git a/rag/parser/pdf_parser.py b/rag/parser/pdf_parser.py
index 32cd636..79611a7 100644
--- a/rag/parser/pdf_parser.py
+++ b/rag/parser/pdf_parser.py
@@ -53,9 +53,10 @@ class HuParser:

     def __remote_call(self, species, images, thr=0.7):
         url = os.environ.get("INFINIFLOW_SERVER")
-        if not url:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_SERVER'")
         token = os.environ.get("INFINIFLOW_TOKEN")
-        if not token:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_TOKEN'")
+        if not url or not token:
+            logging.warning("INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow and sign in to our demo website to get a token. It's FREE! Use 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
+            return []

         def convert_image_to_bytes(PILimage):
             image = BytesIO()
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index ed86987..c75260a 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -47,7 +47,7 @@ from api.utils.file_utils import get_project_base_directory
 BATCH_SIZE = 64

 FACTORY = {
-    ParserType.GENERAL.value: laws,
+    ParserType.GENERAL.value: manual,
     ParserType.PAPER.value: paper,
     ParserType.BOOK.value: book,
     ParserType.PRESENTATION.value: presentation,
@@ -119,8 +119,8 @@ def build(row, cvmdl):
     chunker = FACTORY[row["parser_id"].lower()]
     try:
         cron_logger.info("Chunking {}/{}".format(row["location"], row["name"]))
-        cks = chunker.chunk(row["name"], MINIO.get(row["kb_id"], row["location"]), row["from_page"], row["to_page"],
-                            callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
+        cks = chunker.chunk(row["name"], binary=MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], to_page=row["to_page"],
+                            callback=callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
     except Exception as e:
         if re.search("(No such file|not found)", str(e)):
             callback(-1, "Can not find file <%s>" % row["doc_name"])
@@ -129,7 +129,7 @@ def build(row, cvmdl):
             cron_logger.warn("Chunking {}/{}: {}".format(row["location"], row["name"], str(e)))

-        return []
+        return

     callback(msg="Finished slicing files. Starting to embed the content.")
@@ -211,6 +211,7 @@ def main(comm, mod):

         st_tm = timer()
         cks = build(r, cv_mdl)
+        if cks is None: continue
         if not cks:
             tmf.write(str(r["update_time"]) + "\n")
             callback(1., "No chunk! Done!")
diff --git a/rag/utils/es_conn.py b/rag/utils/es_conn.py
index fb24e8e..515574c 100644
--- a/rag/utils/es_conn.py
+++ b/rag/utils/es_conn.py
@@ -241,7 +241,7 @@ class HuEs:
                 es_logger.error("ES search timeout for 3 times!")
                 raise Exception("ES search timeout.")

-    def sql(self, sql, fetch_size=128, format="json", timeout=2):
+    def sql(self, sql, fetch_size=128, format="json", timeout="2s"):
         for i in range(3):
             try:
                 res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format, request_timeout=timeout)
--
GitLab