From 0499a3f6219da2d2ba70ec4ee72a55832c410390 Mon Sep 17 00:00:00 2001
From: KevinHuSh <kevinhu.sh@gmail.com>
Date: Thu, 18 Apr 2024 12:09:56 +0800
Subject: [PATCH] Fix out-of-range page number exception in pdf parser (#424)

### What problem does this PR solve?

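Fixes #423: when a box's page number points past the last entry in
`self.page_images`, the PDF parser now returns an empty string instead of
indexing `self.page_images` out of range.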

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 deepdoc/parser/pdf_parser.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 6c33245..c9a2003 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -830,6 +830,7 @@ class HuParser:
         pn = [bx["page_number"]]
         top = bx["top"] - self.page_cum_height[pn[0] - 1]
         bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
+        if pn[-1] - 1 >= len(self.page_images): return ""
         while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
             bott -= self.page_images[pn[-1] - 1].size[1] / ZM
             pn.append(pn[-1] + 1)
-- 
GitLab