diff --git a/Dockerfile b/Dockerfile index 49d3045cc1543199469e06b4283dd1d2b90c1148..c174ccb800294ca4c19f61a41af52457f70c6152 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,6 @@ ADD ./rag ./rag ENV PYTHONPATH=/ragflow/ ENV HF_ENDPOINT=https://hf-mirror.com -/root/miniconda3/envs/py11/bin/pip install peewee==3.17.1 ADD docker/entrypoint.sh ./entrypoint.sh RUN chmod +x ./entrypoint.sh diff --git a/Dockerfile.cuda b/Dockerfile.cuda index a5db6177dc3a0be4036e7ad421d1389dfff2e9ea..2bed3fb61ec12a2951ca7fd641aba6896ed90f1a 100644 --- a/Dockerfile.cuda +++ b/Dockerfile.cuda @@ -19,7 +19,6 @@ ADD ./rag ./rag ENV PYTHONPATH=/ragflow/ ENV HF_ENDPOINT=https://hf-mirror.com -/root/miniconda3/envs/py11/bin/pip install peewee==3.17.1 ADD docker/entrypoint.sh ./entrypoint.sh RUN chmod +x ./entrypoint.sh diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py index ef99a162c683a485d04f3bb83f7913eb7b376a84..648ecd3d91d3ee5849eb564093b11b97572e6baf 100644 --- a/api/apps/conversation_app.py +++ b/api/apps/conversation_app.py @@ -309,6 +309,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl): # compose markdown table clmns = "|"+"|".join([re.sub(r"(/.*|ďĽ[^ďĽďĽ‰]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|") line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "") + line = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}\|", "|", line) rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]] if not docid_idx or not docnm_idx: chat_logger.warning("SQL missing field: " + sql) diff --git a/api/db/init_data.py b/api/db/init_data.py index de201d3d0f6e46ddecc321278270c4cc86802fb4..189e5437be463660e103e627cf80f145e0757aee 100644 --- a/api/db/init_data.py +++ b/api/db/init_data.py @@ -94,7 +94,7 @@ def init_llm_factory(): "name": "Local", "logo": "", "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", - "status": "0", + "status": "1", },{ "name": "Moonshot", "logo": "", diff --git a/api/db/services/knowledgebase_service.py b/api/db/services/knowledgebase_service.py index 46fe4bcdf5f4ccd023deb06ebd4e407ce1debd81..be2c96400fa1eca5c331313b3f38f0ffa94939f7 100644 --- a/api/db/services/knowledgebase_service.py +++ b/api/db/services/knowledgebase_service.py @@ -78,7 +78,7 @@ class KnowledgebaseService(CommonService): if isinstance(v, dict): assert isinstance(old[k], dict) dfs_update(old[k], v) - if isinstance(v, list): + elif isinstance(v, list): assert isinstance(old[k], list) old[k] = list(set(old[k]+v)) else: old[k] = v diff --git a/rag/app/table.py b/rag/app/table.py index 3b6cff38ea57a924b40c7e2a7cea3c8d69b145d5..6f40da48d70eb49d676157634a3c6f2c320fbb5f 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -73,9 +73,9 @@ def trans_datatime(s): def trans_bool(s): - if re.match(r"(true|yes|ćŻ)$", str(s).strip(), flags=re.IGNORECASE): + if re.match(r"(true|yes|ćŻ|\*|âś“|âś”|â‘|âś…|âš)$", str(s).strip(), flags=re.IGNORECASE): return ["yes", "ćŻ"] - if re.match(r"(false|no|ĺ¦)$", str(s).strip(), flags=re.IGNORECASE): + if re.match(r"(false|no|ĺ¦|⍻|Ă—)$", str(s).strip(), flags=re.IGNORECASE): return ["no", "ĺ¦"] @@ -107,9 +107,9 @@ def column_data_type(arr): arr[i] = trans[ty](str(arr[i])) except Exception as e: arr[i] = None - if ty == "text": - if len(arr) > 128 and uni / len(arr) < 0.1: - ty = "keyword" + #if ty == "text": + # if len(arr) > 128 and uni / len(arr) < 0.1: + # ty = "keyword" return arr, ty @@ -170,7 +170,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese PY = Pinyin() fieds_map = { "text": "_tks", - "int": "_int", + "int": "_long", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", @@ -189,7 +189,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese df[clmns[j]] = cln if ty == "text": txts.extend([str(c) for c in cln if c]) - clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i].replace("_", " ")) + clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], clmns[i].replace("_", " ")) for i in range(len(clmns))] eng = lang.lower() == "english"#is_english(txts) @@ -204,6 +204,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese continue if not str(row[clmns[j]]): continue + if pd.isna(row[clmns[j]]): + continue fld = clmns_map[j][0] d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie( row[clmns[j]]) @@ -223,7 +225,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese if __name__ == "__main__": import sys - def dummy(a, b): + def dummy(prog=None, msg=""): pass chunk(sys.argv[1], callback=dummy) diff --git a/rag/utils/__init__.py b/rag/utils/__init__.py index 4de8d190764a8691e69f7731ae4a6c5d60f1fd85..2ff34b35a8c11bf6f3e118a8fb9fb779a98b45b1 100644 --- a/rag/utils/__init__.py +++ b/rag/utils/__init__.py @@ -19,8 +19,8 @@ from .minio_conn import MINIO from .es_conn import ELASTICSEARCH def rmSpace(txt): - txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt) - return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt) + txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE) + return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE) def findMaxDt(fnm):