Unverified commit a38e1630, authored by KevinHuSh and committed by GitHub

remove doc from supported processing types (#488)

### What problem does this PR solve?
#474 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
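
The change itself is one character, repeated across each affected chunker: dropping the `?` from the file-dispatch regex so that only `.docx` files match, while legacy `.doc` files no longer do. A minimal standalone sketch of the difference (the filenames here are illustrative, not from the repo):

```python
import re

OLD = r"\.docx?$"   # the '?' makes the trailing 'x' optional, so ".doc" also matches
NEW = r"\.docx$"    # matches ".docx" only

for name in ("report.doc", "report.docx"):
    old_hit = bool(re.search(OLD, name, re.IGNORECASE))  # True for both names
    new_hit = bool(re.search(NEW, name, re.IGNORECASE))  # False for .doc, True for .docx
    print(name, old_hit, new_hit)
```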
parent 3610e1e5
@@ -67,7 +67,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     pdf_parser = None
     sections, tbls = [], []
-    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         doc_parser = DocxParser()
         # TODO: table of contents need to be removed
@@ -93,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     pdf_parser = None
     sections = []
-    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         for txt in Docx()(filename, binary):
             sections.append(txt)
@@ -119,7 +119,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     res = []
     pdf_parser = None
     sections = []
-    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
         res = tokenize_table(tbls, doc, eng)
@@ -60,7 +60,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     eng = lang.lower() == "english" # is_english(cks)
-    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = [txt for txt in laws.Docx()(filename, binary) if txt]
         callback(0.8, "Finish parsing.")
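
In each of the four chunkers above, the effect is that a `.doc` filename no longer enters the Docx branch and instead falls through to the remaining file-type checks, ending in the chunker's unsupported-format error. A hedged sketch of that dispatch shape (the `dispatch` helper and error message are illustrative, not the actual code in this repo):

```python
import re

def dispatch(filename):
    # Illustrative stand-in for the per-format branching inside chunk();
    # the real functions go on to check .pdf, .txt, etc. before raising.
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        return "docx branch"
    raise NotImplementedError("file type not supported")

print(dispatch("contract.docx"))   # docx branch
try:
    dispatch("contract.doc")       # matched before this commit, rejected after
except NotImplementedError as exc:
    print("rejected:", exc)
```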