From 572e5b1ff120cb179af6efb6e39ef6d139f858b7 Mon Sep 17 00:00:00 2001
From: KevinHuSh <kevinhu.sh@gmail.com>
Date: Tue, 2 Apr 2024 11:39:01 +0800
Subject: [PATCH] Let task continue dispatching while meeting unexpected doc
 formats (#199)

### What problem does this PR solve?

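Wrap the per-document task generation in `dispatch()` in a `try`/`except`
so that a document which raises while being split into tasks (for example
an unexpected PDF or Excel format) is logged with its traceback and
skipped, instead of aborting the rest of the dispatch run. The existing
"delete task" error handler now also logs the full traceback.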

Issue link: [#198](https://github.com/infiniflow/ragflow/issues/198)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Breaking Change (fix or feature that could cause existing
functionality not to work as expected)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Test cases
- [ ] Python SDK impacted, Need to update PyPI
- [ ] Other (please describe):
---
 rag/svr/task_broker.py | 76 ++++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 36 deletions(-)

diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py
index 0892a6c..b54792a 100644
--- a/rag/svr/task_broker.py
+++ b/rag/svr/task_broker.py
@@ -73,7 +73,7 @@ def dispatch():
                 for t in tsks:
                     TaskService.delete_by_id(t.id)
         except Exception as e:
-            cron_logger.error("delete task exception:" + str(e))
+            cron_logger.exception(e)
 
         def new_task():
             nonlocal r
@@ -83,44 +83,48 @@ def dispatch():
             }
 
         tsks = []
-        if r["type"] == FileType.PDF.value:
-            do_layout = r["parser_config"].get("layout_recognize", True)
-            pages = PdfParser.total_page_number(
-                r["name"], MINIO.get(r["kb_id"], r["location"]))
-            page_size = r["parser_config"].get("task_page_size", 12)
-            if r["parser_id"] == "paper":
-                page_size = r["parser_config"].get("task_page_size", 22)
-            if r["parser_id"] == "one":
-                page_size = 1000000000
-            if not do_layout:
-                page_size = 1000000000
-            page_ranges = r["parser_config"].get("pages")
-            if not page_ranges:
-                page_ranges = [(1, 100000)]
-            for s, e in page_ranges:
-                s -= 1
-                s = max(0, s)
-                e = min(e - 1, pages)
-                for p in range(s, e, page_size):
+        try:
+            if r["type"] == FileType.PDF.value:
+                do_layout = r["parser_config"].get("layout_recognize", True)
+                pages = PdfParser.total_page_number(
+                        r["name"], MINIO.get(r["kb_id"], r["location"]))
+                page_size = r["parser_config"].get("task_page_size", 12)
+                if r["parser_id"] == "paper":
+                    page_size = r["parser_config"].get("task_page_size", 22)
+                if r["parser_id"] == "one":
+                    page_size = 1000000000
+                if not do_layout:
+                    page_size = 1000000000
+                page_ranges = r["parser_config"].get("pages")
+                if not page_ranges:
+                    page_ranges = [(1, 100000)]
+                for s, e in page_ranges:
+                    s -= 1
+                    s = max(0, s)
+                    e = min(e - 1, pages)
+                    for p in range(s, e, page_size):
+                        task = new_task()
+                        task["from_page"] = p
+                        task["to_page"] = min(p + page_size, e)
+                        tsks.append(task)
+
+            elif r["parser_id"] == "table":
+                rn = HuExcelParser.row_number(
+                    r["name"], MINIO.get(
+                        r["kb_id"], r["location"]))
+                for i in range(0, rn, 3000):
                     task = new_task()
-                    task["from_page"] = p
-                    task["to_page"] = min(p + page_size, e)
+                    task["from_page"] = i
+                    task["to_page"] = min(i + 3000, rn)
                     tsks.append(task)
+            else:
+                tsks.append(new_task())
+
+            bulk_insert_into_db(Task, tsks, True)
+            set_dispatching(r["id"])
+        except Exception as e:
+            cron_logger.exception(e)
 
-        elif r["parser_id"] == "table":
-            rn = HuExcelParser.row_number(
-                r["name"], MINIO.get(
-                    r["kb_id"], r["location"]))
-            for i in range(0, rn, 3000):
-                task = new_task()
-                task["from_page"] = i
-                task["to_page"] = min(i + 3000, rn)
-                tsks.append(task)
-        else:
-            tsks.append(new_task())
-
-        bulk_insert_into_db(Task, tsks, True)
-        set_dispatching(r["id"])
         tmf.write(str(r["update_time"]) + "\n")
     tmf.close()
 
-- 
GitLab