diff --git a/rag/app/book.py b/rag/app/book.py index 294f0d7cb276794606056759ebaff9b9fe5e9e56..a76513e2b605d512e6d23a188ea5ceec75b2edfc 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -67,7 +67,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) pdf_parser = None sections, tbls = [], [] - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") doc_parser = DocxParser() # TODO: table of contents need to be removed diff --git a/rag/app/laws.py b/rag/app/laws.py index 947e4dc408a88bd0c0a35426b6f9e6ec69a65c7b..9b77b4fb704a5d14729f4c57b3f15449786023d4 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -93,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) pdf_parser = None sections = [] - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") for txt in Docx()(filename, binary): sections.append(txt) diff --git a/rag/app/naive.py b/rag/app/naive.py index 62c1df023f1d0edbde45ca4ca796acf686950b1b..0fcbd9fad72206b1d2a23787eacb99f9fa160e65 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -119,7 +119,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res = [] pdf_parser = None sections = [] - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections, tbls = Docx()(filename, binary) res = tokenize_table(tbls, doc, eng) diff --git a/rag/app/one.py b/rag/app/one.py index e78c7e97eeadc7fa9c6e79f697a9afe4131d7eb4..c56f121403ed99dd97db01b3ed79d18db685ecb0 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -60,7 +60,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, eng = lang.lower() == "english" # is_english(cks) - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections = [txt for txt in laws.Docx()(filename, binary) if txt] callback(0.8, "Finish parsing.")