From a38e163035650fcc2ed786dd7c56ab8f9f6b2dfb Mon Sep 17 00:00:00 2001
From: KevinHuSh <kevinhu.sh@gmail.com>
Date: Mon, 22 Apr 2024 15:46:09 +0800
Subject: [PATCH] remove doc from supported processing types (#488)

### What problem does this PR solve?
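Addresses #474: the `\.docx?$` filename check in the book, laws, naive and one chunkers also matched legacy `.doc` files and routed them into the DOCX parsing branch. The check is narrowed to `\.docx$` so that only `.docx` files take that branch.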

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/app/book.py  | 2 +-
 rag/app/laws.py  | 2 +-
 rag/app/naive.py | 2 +-
 rag/app/one.py   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/rag/app/book.py b/rag/app/book.py
index 294f0d7..a76513e 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -67,7 +67,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     pdf_parser = None
     sections, tbls = [], []
-    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         doc_parser = DocxParser()
         # TODO: table of contents need to be removed
diff --git a/rag/app/laws.py b/rag/app/laws.py
index 947e4dc..9b77b4f 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -93,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     pdf_parser = None
     sections = []
-    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         for txt in Docx()(filename, binary):
             sections.append(txt)
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 62c1df0..0fcbd9f 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -119,7 +119,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     res = []
     pdf_parser = None
     sections = []
-    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
         res = tokenize_table(tbls, doc, eng)
diff --git a/rag/app/one.py b/rag/app/one.py
index e78c7e9..c56f121 100644
--- a/rag/app/one.py
+++ b/rag/app/one.py
@@ -60,7 +60,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     eng = lang.lower() == "english"  # is_english(cks)
 
-    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = [txt for txt in laws.Docx()(filename, binary) if txt]
         callback(0.8, "Finish parsing.")
-- 
GitLab