From 572e5b1ff120cb179af6efb6e39ef6d139f858b7 Mon Sep 17 00:00:00 2001 From: KevinHuSh <kevinhu.sh@gmail.com> Date: Tue, 2 Apr 2024 11:39:01 +0800 Subject: [PATCH] Let task continue dispatching while meeting unexpected doc formats (#199) ### What problem does this PR solve? _Briefly describe what this PR aims to solve. Include background context that will help reviewers understand the purpose of the PR._ Issue link: [#198](https://github.com/infiniflow/ragflow/issues/198) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [ ] New Feature (non-breaking change which adds functionality) - [ ] Breaking Change (fix or feature that could cause existing functionality not to work as expected) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Test cases - [ ] Python SDK impacted, Need to update PyPI - [ ] Other (please describe): --- rag/svr/task_broker.py | 76 ++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py index 0892a6c..b54792a 100644 --- a/rag/svr/task_broker.py +++ b/rag/svr/task_broker.py @@ -73,7 +73,7 @@ def dispatch(): for t in tsks: TaskService.delete_by_id(t.id) except Exception as e: - cron_logger.error("delete task exception:" + str(e)) + cron_logger.exception(e) def new_task(): nonlocal r @@ -83,44 +83,48 @@ def dispatch(): } tsks = [] - if r["type"] == FileType.PDF.value: - do_layout = r["parser_config"].get("layout_recognize", True) - pages = PdfParser.total_page_number( - r["name"], MINIO.get(r["kb_id"], r["location"])) - page_size = r["parser_config"].get("task_page_size", 12) - if r["parser_id"] == "paper": - page_size = r["parser_config"].get("task_page_size", 22) - if r["parser_id"] == "one": - page_size = 1000000000 - if not do_layout: - page_size = 1000000000 - page_ranges = r["parser_config"].get("pages") - if not page_ranges: - page_ranges = [(1, 100000)] - for s, e in 
page_ranges: - s -= 1 - s = max(0, s) - e = min(e - 1, pages) - for p in range(s, e, page_size): + try: + if r["type"] == FileType.PDF.value: + do_layout = r["parser_config"].get("layout_recognize", True) + pages = PdfParser.total_page_number( + r["name"], MINIO.get(r["kb_id"], r["location"])) + page_size = r["parser_config"].get("task_page_size", 12) + if r["parser_id"] == "paper": + page_size = r["parser_config"].get("task_page_size", 22) + if r["parser_id"] == "one": + page_size = 1000000000 + if not do_layout: + page_size = 1000000000 + page_ranges = r["parser_config"].get("pages") + if not page_ranges: + page_ranges = [(1, 100000)] + for s, e in page_ranges: + s -= 1 + s = max(0, s) + e = min(e - 1, pages) + for p in range(s, e, page_size): + task = new_task() + task["from_page"] = p + task["to_page"] = min(p + page_size, e) + tsks.append(task) + + elif r["parser_id"] == "table": + rn = HuExcelParser.row_number( + r["name"], MINIO.get( + r["kb_id"], r["location"])) + for i in range(0, rn, 3000): task = new_task() - task["from_page"] = p - task["to_page"] = min(p + page_size, e) + task["from_page"] = i + task["to_page"] = min(i + 3000, rn) tsks.append(task) + else: + tsks.append(new_task()) + + bulk_insert_into_db(Task, tsks, True) + set_dispatching(r["id"]) + except Exception as e: + cron_logger.exception(e) - elif r["parser_id"] == "table": - rn = HuExcelParser.row_number( - r["name"], MINIO.get( - r["kb_id"], r["location"])) - for i in range(0, rn, 3000): - task = new_task() - task["from_page"] = i - task["to_page"] = min(i + 3000, rn) - tsks.append(task) - else: - tsks.append(new_task()) - - bulk_insert_into_db(Task, tsks, True) - set_dispatching(r["id"]) tmf.write(str(r["update_time"]) + "\n") tmf.close() -- GitLab