diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index c9a200374ddfa3c27d501c0a7d2a5c3e251543a7..b3528581c64e3e117b89e83b668abd811ae0022f 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -830,10 +830,13 @@ class HuParser: pn = [bx["page_number"]] top = bx["top"] - self.page_cum_height[pn[0] - 1] bott = bx["bottom"] - self.page_cum_height[pn[0] - 1] - if pn[-1] - 1 >= len(self.page_images): return "" + page_images_cnt = len(self.page_images) + if pn[-1] - 1 >= page_images_cnt: return "" while bott * ZM > self.page_images[pn[-1] - 1].size[1]: bott -= self.page_images[pn[-1] - 1].size[1] / ZM pn.append(pn[-1] + 1) + if pn[-1] - 1 >= page_images_cnt: + return "" return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ .format("-".join([str(p) for p in pn]),