diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index 3c86b57384f81af4bc6092d5026d8bfc1c0dc505..ef1704cd6976ee742a55a86a718bacce407a39b5 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -214,7 +214,7 @@ def retrieval_test():
     question = req["question"]
     kb_id = req["kb_id"]
     doc_ids = req.get("doc_ids", [])
-    similarity_threshold = float(req.get("similarity_threshold", 0.4))
+    similarity_threshold = float(req.get("similarity_threshold", 0.2))
     vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
     top = int(req.get("top", 1024))
     try:
diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py
index 1b2bae860baa45bbb7fdd698bca7472c4a232936..130acb9990bc0b461d9c9d3c921356f0f18244de 100644
--- a/api/apps/conversation_app.py
+++ b/api/apps/conversation_app.py
@@ -170,7 +170,7 @@ def chat(dialog, messages, **kwargs):
         if p["key"] not in kwargs:
             prompt_config["system"] = prompt_config["system"].replace("{%s}"%p["key"], " ")

-    model_config = TenantLLMService.get_api_key(dialog.tenant_id, LLMType.CHAT.value, dialog.llm_id)
+    model_config = TenantLLMService.get_api_key(dialog.tenant_id, dialog.llm_id)
     if not model_config:
         raise LookupError("LLM({}) API key not found".format(dialog.llm_id))
     question = messages[-1]["content"]
@@ -186,10 +186,10 @@ def chat(dialog, messages, **kwargs):
         kwargs["knowledge"] = "\n".join(knowledges)
     gen_conf = dialog.llm_setting[dialog.llm_setting_type]
     msg = [{"role": m["role"], "content": m["content"]} for m in messages if m["role"] != "system"]
-    used_token_count = message_fit_in(msg, int(llm.max_tokens * 0.97))
+    used_token_count, msg = message_fit_in(msg, int(llm.max_tokens * 0.97))
     if "max_tokens" in gen_conf:
         gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count)
-    mdl = ChatModel[model_config.llm_factory](model_config["api_key"], dialog.llm_id)
+    mdl = ChatModel[model_config.llm_factory](model_config.api_key, dialog.llm_id)
     answer = mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf)

     answer = retrievaler.insert_citations(answer,
@@ -198,4 +198,6 @@ def chat(dialog, messages, **kwargs):
                                           embd_mdl,
                                           tkweight=1-dialog.vector_similarity_weight,
                                           vtweight=dialog.vector_similarity_weight)
+    for c in kbinfos["chunks"]:
+        if c.get("vector"):del c["vector"]
     return {"answer": answer, "retrieval": kbinfos}
\ No newline at end of file
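Reviewer note on the conversation_app.py hunks: the call site now unpacks message_fit_in() as a pair, so that helper must return both the token count and the (possibly truncated) history; the old code discarded the trimmed list and sent the full history to the model. Below is a minimal, purely illustrative sketch of the assumed (used_token_count, msg) contract; the real message_fit_in lives elsewhere in the repo and its token accounting may differ:

    def message_fit_in(msg, max_length):
        # Crude illustration: evict the oldest turns until the estimated
        # token total fits. The real helper uses a proper tokenizer.
        def count(m):
            return len(m["content"]) // 4 + 1  # rough chars-to-tokens guess
        total = sum(count(m) for m in msg)
        while total > max_length and len(msg) > 1:
            total -= count(msg.pop(0))  # drop the oldest message first
        return total, msg  # (tokens used, trimmed history)

The new loop that deletes "vector" from each returned chunk also keeps large embedding arrays out of the JSON response.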
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 18ab2ae822bbfe314f557dbfc1a3ceaae76a7d6f..c8f8384eed55af184d418f2779050ab98b0f5359 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -11,7 +11,8 @@
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
-#  limitations under the License.
+#  limitations under the License
+#
 #
 import base64
 import pathlib
@@ -65,7 +66,7 @@ def upload():
         while MINIO.obj_exist(kb_id, location):
             location += "_"
         blob = request.files['file'].read()
-        MINIO.put(kb_id, filename, blob)
+        MINIO.put(kb_id, location, blob)
         doc = DocumentService.insert({
             "id": get_uuid(),
             "kb_id": kb.id,
@@ -188,7 +189,10 @@ def rm():
         e, doc = DocumentService.get_by_id(req["doc_id"])
         if not e:
             return get_data_error_result(retmsg="Document not found!")
-        ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(doc.kb_id))
+        tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+        if not tenant_id:
+            return get_data_error_result(retmsg="Tenant not found!")
+        ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
         DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, 0)
         if not DocumentService.delete_by_id(req["doc_id"]):
diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py
index de72273879a3a46aae4cb4f6b0abd1ec92d7bc6d..614a902e0c437b05a172323b47b20e839e5ed11c 100644
--- a/api/apps/llm_app.py
+++ b/api/apps/llm_app.py
@@ -75,7 +75,7 @@ def list():
         llms = LLMService.get_all()
         llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value]
         for m in llms:
-            m["available"] = m.llm_name in mdlnms
+            m["available"] = m["llm_name"] in mdlnms

         res = {}
         for m in llms:
diff --git a/api/db/db_models.py b/api/db/db_models.py
index ab6dc878e85d997de1ef2aa7ba3c6044d66fb1bd..0862dec831c44d267ecf5e2d0dd1b84efe15965a 100644
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@@ -469,7 +469,7 @@ class Knowledgebase(DataBaseModel):
     doc_num = IntegerField(default=0)
     token_num = IntegerField(default=0)
     chunk_num = IntegerField(default=0)
-    similarity_threshold = FloatField(default=0.4)
+    similarity_threshold = FloatField(default=0.2)
     vector_similarity_weight = FloatField(default=0.3)

     parser_id = CharField(max_length=32, null=False, help_text="default parser ID")
@@ -521,7 +521,7 @@ class Dialog(DataBaseModel):
     prompt_config = JSONField(null=False, default={"system": "", "prologue": "您好,我是您的助手小樱,长得可爱又善良,can I help you?", "parameters": [], "empty_response": "Sorry! 知识库中未找到相关内容!"})
-    similarity_threshold = FloatField(default=0.4)
+    similarity_threshold = FloatField(default=0.2)
     vector_similarity_weight = FloatField(default=0.3)
     top_n = IntegerField(default=6)
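Reviewer note: the default similarity_threshold drops from 0.4 to 0.2 in three places at once (the retrieval_test endpoint, Knowledgebase, and Dialog), keeping API and model defaults in sync. A toy sketch of how such a cutoff gates retrieved chunks; the function and names here are illustrative, not the repo's retrieval code:

    def keep_above_threshold(chunks, sims, similarity_threshold=0.2):
        # Keep only chunks whose hybrid similarity clears the cutoff.
        return [c for c, s in zip(chunks, sims) if s >= similarity_threshold]

    assert keep_above_threshold(["a", "b", "c"], [0.35, 0.18, 0.21]) == ["a", "c"]
    # Under the old 0.4 default, all three would have been filtered out.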
diff --git a/api/db/services/llm_service.py b/api/db/services/llm_service.py
index 5180adb47c5496e9fc52b441758378c403749f35..51914714e687795b3404f41e6ddd743425ec8619 100644
--- a/api/db/services/llm_service.py
+++ b/api/db/services/llm_service.py
@@ -63,7 +63,7 @@ class TenantLLMService(CommonService):
         model_config = cls.get_api_key(tenant_id, mdlnm)
         if not model_config:
             raise LookupError("Model({}) not found".format(mdlnm))
-        model_config = model_config[0].to_dict()
+        model_config = model_config.to_dict()
         if llm_type == LLMType.EMBEDDING.value:
             if model_config["llm_factory"] not in EmbeddingModel:
                 return
             return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"])
diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py
index a1ee4682564222c7c74fc3edbaabd9f20c51c290..0aa650e2e10ea0cfdebfc9eec9c357571a6b9059 100644
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@@ -143,7 +143,7 @@ def filename_type(filename):
     if re.match(r".*\.pdf$", filename):
         return FileType.PDF.value

-    if re.match(r".*\.(doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key|md)$", filename):
+    if re.match(r".*\.(docx|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key|md)$", filename):
         return FileType.DOC.value

     if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py
index 19b870a7e834b041eb9eb61c7861235531aa642d..7dd8267b593b4d39eccfb52aea317ea945f6d3d0 100644
--- a/rag/llm/chat_model.py
+++ b/rag/llm/chat_model.py
@@ -19,31 +19,39 @@ import os

 class Base(ABC):
+    def __init__(self, key, model_name):
+        pass
+
     def chat(self, system, history, gen_conf):
         raise NotImplementedError("Please implement encode method!")


 class GptTurbo(Base):
-    def __init__(self):
-        self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
+    def __init__(self, key, model_name="gpt-3.5-turbo"):
+        self.client = OpenAI(api_key=key)
+        self.model_name = model_name

     def chat(self, system, history, gen_conf):
         history.insert(0, {"role": "system", "content": system})
         res = self.client.chat.completions.create(
-            model="gpt-3.5-turbo",
+            model=self.model_name,
             messages=history,
             **gen_conf)
         return res.choices[0].message.content.strip()


+from dashscope import Generation
 class QWenChat(Base):
+    def __init__(self, key, model_name=Generation.Models.qwen_turbo):
+        import dashscope
+        dashscope.api_key = key
+        self.model_name = model_name
+
     def chat(self, system, history, gen_conf):
         from http import HTTPStatus
-        from dashscope import Generation
-        # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
         history.insert(0, {"role": "system", "content": system})
         response = Generation.call(
-            Generation.Models.qwen_turbo,
+            self.model_name,
             messages=history,
             result_format='message'
         )
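Reviewer note: chat models no longer read credentials from the environment; the per-tenant key and model name are injected through the constructor, which is what lets chat() in conversation_app.py dispatch through a dict keyed by factory name. A minimal sketch of that pattern, assuming factory-name keys (the real ChatModel mapping is defined in rag/llm and may use different keys):

    ChatModel = {
        "OpenAI": GptTurbo,        # keys must match llm_factory values
        "Tongyi-Qianwen": QWenChat,
    }

    def make_chat_model(llm_factory, api_key, llm_id):
        # Look up the implementation class by factory name, then
        # construct it with the tenant's own credentials.
        return ChatModel[llm_factory](api_key, llm_id)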
diff --git a/rag/llm/cv_model.py b/rag/llm/cv_model.py
index 0346890391d1b891cb2c14c3b93cf95a6da46af6..3719535330945a71025c950752bad54579ab7618 100644
--- a/rag/llm/cv_model.py
+++ b/rag/llm/cv_model.py
@@ -28,6 +28,8 @@ class Base(ABC):
         raise NotImplementedError("Please implement encode method!")

     def image2base64(self, image):
+        if isinstance(image, bytes):
+            return base64.b64encode(image).decode("utf-8")
         if isinstance(image, BytesIO):
             return base64.b64encode(image.getvalue()).decode("utf-8")
         buffered = BytesIO()
@@ -59,7 +61,7 @@ class Base(ABC):

 class GptV4(Base):
     def __init__(self, key, model_name="gpt-4-vision-preview"):
-        self.client = OpenAI(key)
+        self.client = OpenAI(api_key = key)
         self.model_name = model_name

     def describe(self, image, max_tokens=300):
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index bd936f5e2685eb0081e2ed7f4c9cc8398fc7e02a..29539cedda8e9f004ca3e55b7c35fc6bce12e2d7 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -187,9 +187,10 @@ class Dealer:
             if len(t) < 5:
                 continue
             idx.append(i)
             pieces_.append(t)
+        es_logger.info("{} => {}".format(answer, pieces_))
         if not pieces_:
             return answer
-        ans_v = embd_mdl.encode(pieces_)
+        ans_v, c = embd_mdl.encode(pieces_)
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))
@@ -219,7 +220,7 @@ class Dealer:
             Dealer.trans2floats(
                 sres.field[i]["q_%d_vec" % len(sres.query_vector)]) for i in sres.ids]
         if not ins_embd:
-            return []
+            return [], [], []
         ins_tw = [huqie.qie(sres.field[i][cfield]).split(" ") for i in sres.ids]
         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
                                                         ins_embd,
@@ -235,6 +236,8 @@ class Dealer:
     def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
                   vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
+        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
+        if not question: return ranks
         req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": top,
                "question": question, "vector": True,
                "similarity": similarity_threshold}
@@ -243,7 +246,7 @@ class Dealer:
         sim, tsim, vsim = self.rerank(
             sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
         idx = np.argsort(sim * -1)
-        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
+        dim = len(sres.query_vector)
         start_idx = (page - 1) * page_size
         for i in idx:
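Reviewer note on the search.py hunks: insert_citations() now unpacks embd_mdl.encode() as a pair, rerank() returns three empty lists so callers can always unpack sim, tksim, vtsim, and retrieval() short-circuits on an empty question instead of querying Elasticsearch. A sketch of the (vectors, token_count) contract the unpacking assumes, using a stand-in class rather than the repo's embedding model:

    import numpy as np

    class FakeEmbedding:
        def encode(self, texts):
            # One 768-d vector per text plus a usage count, mirroring
            # the "ans_v, c = embd_mdl.encode(pieces_)" unpacking above.
            vecs = np.random.rand(len(texts), 768)
            token_count = sum(len(t.split()) for t in texts)
            return vecs, token_count

    ans_v, used = FakeEmbedding().encode(["piece one", "piece two"])
    assert ans_v.shape == (2, 768)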
diff --git a/rag/svr/parse_user_docs.py b/rag/svr/parse_user_docs.py
index b69411a908498d54cbfba0001f85c7e12ebc4281..0000c6a3897bd284ba381ca3b38c48dac67ec2c2 100644
--- a/rag/svr/parse_user_docs.py
+++ b/rag/svr/parse_user_docs.py
@@ -78,6 +78,7 @@ def chuck_doc(name, binary, cvmdl=None):
         field = TextChunker.Fields()
         field.text_chunks = [(txt, binary)]
         field.table_chunks = []
+        return field

     return TextChunker()(binary)
@@ -161,9 +162,9 @@ def build(row, cvmdl):
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     output_buffer = BytesIO()
     docs = []
-    md5 = hashlib.md5()
     for txt, img in obj.text_chunks:
         d = copy.deepcopy(doc)
+        md5 = hashlib.md5()
         md5.update((txt + str(d["doc_id"])).encode("utf-8"))
         d["_id"] = md5.hexdigest()
         d["content_ltks"] = huqie.qie(txt)
@@ -186,6 +187,7 @@ def build(row, cvmdl):
         for i, txt in enumerate(arr):
             d = copy.deepcopy(doc)
             d["content_ltks"] = huqie.qie(txt)
+            md5 = hashlib.md5()
             md5.update((txt + str(d["doc_id"])).encode("utf-8"))
             d["_id"] = md5.hexdigest()
             if not img:
@@ -226,9 +228,6 @@ def embedding(docs, mdl):


 def main(comm, mod):
-    global model
-    from rag.llm import HuEmbedding
-    model = HuEmbedding()
     tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"{comm}-{mod}.tm")
     tm = findMaxTm(tm_fnm)
     rows = collect(comm, mod, tm)
@@ -260,13 +259,14 @@ def main(comm, mod):
             set_progress(r["id"], random.randint(70, 95) / 100., "Finished embedding! Start to build index!")
         init_kb(r)
+        chunk_count = len(set([c["_id"] for c in cks]))
         es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
         if es_r:
             set_progress(r["id"], -1, "Index failure!")
             cron_logger.error(str(es_r))
         else:
             set_progress(r["id"], 1., "Done!")
-            DocumentService.increment_chunk_num(r["id"], r["kb_id"], tk_count, len(cks), timer()-st_tm)
+            DocumentService.increment_chunk_num(r["id"], r["kb_id"], tk_count, chunk_count, timer()-st_tm)
             cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))
         tmf.write(str(r["update_time"]) + "\n")
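Reviewer note: the per-chunk md5 = hashlib.md5() fix matters because update() accumulates state, so a single shared hash object makes each chunk's _id depend on every chunk hashed before it; the _id is then no longer a pure function of (txt, doc_id), which breaks stable re-indexing and the new chunk_count dedup by _id. A quick self-contained demonstration:

    import hashlib

    shared, chained = hashlib.md5(), []
    for txt in ["chunk-a", "chunk-b"]:
        shared.update(txt.encode("utf-8"))      # state carries over
        chained.append(shared.hexdigest())

    fresh = [hashlib.md5(t.encode("utf-8")).hexdigest()
             for t in ["chunk-a", "chunk-b"]]

    assert chained[0] == fresh[0]   # first digest matches
    assert chained[1] != fresh[1]   # later digests drift once state is shared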