diff --git a/rag/app/naive.py b/rag/app/naive.py index b97cf15736c25b4dac4477febe98b48957b4174a..608cf561178219839a0919ecd4faf20c1f2b107f 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -14,8 +14,7 @@ from io import BytesIO from docx import Document import re from deepdoc.parser.pdf_parser import PlainParser -from rag.app import laws -from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks +from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks from deepdoc.parser import PdfParser, ExcelParser, DocxParser from rag.settings import cron_logger @@ -140,7 +139,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.1, "Start to parse.") txt = "" if binary: - txt = binary.decode("utf-8") + try: + txt = binary.decode("utf-8") + except Exception as e: + txt = binary.decode("gb2312") else: with open(filename, "r") as f: while True: diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 422ce54e97e7d18660ecd961849a10505bed57ea..971373cfcc81f8f5e20c436cb3238d66f802776a 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -237,7 +237,7 @@ class Dealer: pieces_.append(t) es_logger.info("{} => {}".format(answer, pieces_)) if not pieces_: - return answer + return answer, set([]) ans_v, _ = embd_mdl.encode(pieces_) assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(