diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index b8a69245a35710dd806bae91cf9ba31f7d03f241..6bda36970a4919e34c09ebc87878dca2982a81b5 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -11,7 +11,7 @@ import pdfplumber
 import logging
 from PIL import Image, ImageDraw
 import numpy as np
-
+from timeit import default_timer as timer
 from PyPDF2 import PdfReader as pdf2_read
 
 from api.utils.file_utils import get_project_base_directory
@@ -936,6 +936,7 @@ class HuParser:
         self.page_cum_height = [0]
         self.page_layout = []
         self.page_from = page_from
+        st = timer()
         try:
             self.pdf = pdfplumber.open(fnm) if isinstance(
                 fnm, str) else pdfplumber.open(BytesIO(fnm))
@@ -989,7 +990,9 @@ class HuParser:
             self.is_english = True
         else:
             self.is_english = False
+        self.is_english = False
 
+        st = timer()
         for i, img in enumerate(self.page_images):
             chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(
@@ -1007,15 +1010,11 @@ class HuParser:
                                                                        chars[j]["width"]) / 2:
                     chars[j]["text"] += " "
                 j += 1
-            # if i > 0:
-            #     if not chars:
-            #         self.page_cum_height.append(img.size[1] / zoomin)
-            #     else:
-            #         self.page_cum_height.append(
-            #             np.max([c["bottom"] for c in chars]))
+
             self.__ocr(i + 1, img, chars, zoomin)
-            if callback:
-                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
+            #if callback:
+            #    callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
+            #print("OCR:", timer()-st)
 
         if not self.is_english and not any(
                 [c for c in self.page_chars]) and self.boxes:
diff --git a/docker/.env b/docker/.env
index ddb181d1036571bac6b6ac165f82c3f2e70e56f3..9f028c826fffef2e8109636a317fd04634e65d1a 100644
--- a/docker/.env
+++ b/docker/.env
@@ -11,7 +11,9 @@ ES_PORT=1200
 KIBANA_PORT=6601
 
 # Increase or decrease based on the available host memory (in bytes)
-MEM_LIMIT=12073741824
+
+MEM_LIMIT=8073741824
+
 
 MYSQL_PASSWORD=infini_rag_flow
 MYSQL_PORT=5455
diff --git a/docker/docker-compose-base.yml b/docker/docker-compose-base.yml
index b110ce95256a31cc39c33c5c799b0bc021b2be24..24519ebd4f73949b89972a603dc6cee7712a7f55 100644
--- a/docker/docker-compose-base.yml
+++ b/docker/docker-compose-base.yml
@@ -29,23 +29,23 @@ services:
       - ragflow
     restart: always
 
-  kibana:
-    depends_on:
-      es01:
-        condition: service_healthy
-    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
-    container_name: ragflow-kibana
-    volumes:
-      - kibanadata:/usr/share/kibana/data
-    ports:
-      - ${KIBANA_PORT}:5601
-    environment:
-      - SERVERNAME=kibana
-      - ELASTICSEARCH_HOSTS=http://es01:9200
-      - TZ=${TIMEZONE}
-    mem_limit: ${MEM_LIMIT}
-    networks:
-      - ragflow
+  #kibana:
+  #  depends_on:
+  #    es01:
+  #      condition: service_healthy
+  #  image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
+  #  container_name: ragflow-kibana
+  #  volumes:
+  #    - kibanadata:/usr/share/kibana/data
+  #  ports:
+  #    - ${KIBANA_PORT}:5601
+  #  environment:
+  #    - SERVERNAME=kibana
+  #    - ELASTICSEARCH_HOSTS=http://es01:9200
+  #    - TZ=${TIMEZONE}
+  #  mem_limit: ${MEM_LIMIT}
+  #  networks:
+  #    - ragflow
 
   mysql:
     image: mysql:5.7.18
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index 6da4a814ccac20e688c6b69df281ed1f0223be10..a1361618266ef097476288c07c11181a0be13fa3 100644
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -29,7 +29,7 @@ function task_bro(){
 
 task_bro &
 
-WS=2
+WS=1
 for ((i=0;i<WS;i++))
 do
   task_exe $i $WS &
diff --git a/rag/app/book.py b/rag/app/book.py
index a76513e2b605d512e6d23a188ea5ceec75b2edfc..3c46b68a4aba90a783c8d30ae7b43519b0d08f8d 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -37,7 +37,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.67, "Layout analysis finished")
-        print("paddle layouts:", timer() - start)
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
diff --git a/rag/app/laws.py b/rag/app/laws.py
index 9b77b4fb704a5d14729f4c57b3f15449786023d4..acb96692ff80a009e7ed8c9a3a3321906b020e35 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -71,7 +71,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.67, "Layout analysis finished")
-        cron_logger.info("paddle layouts:".format(
+        cron_logger.info("layouts:".format(
             (timer() - start) / (self.total_page + 0.1)))
         self._naive_vertical_merge()
 
diff --git a/rag/app/manual.py b/rag/app/manual.py
index bc8e0f17c4ca870cc4e5b581a6a4d961fbf4c7b1..a6fd653afd11a7377f53740df1f4a8de46efa777 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -32,7 +32,7 @@ class Pdf(PdfParser):
 
         self._layouts_rec(zoomin)
         callback(0.65, "Layout analysis finished.")
-        print("paddle layouts:", timer() - start)
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.67, "Table analysis finished.")
         self._text_merge()
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 0fcbd9fad72206b1d2a23787eacb99f9fa160e65..82618b1560c237de396ba345f11137d3cb2f4052 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -77,12 +77,12 @@ class Pdf(PdfParser):
             callback
         )
         callback(msg="OCR finished")
-        cron_logger.info("OCR: {}".format(timer() - start))
+        cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
 
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.63, "Layout analysis finished.")
-        print("paddle layouts:", timer() - start)
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.65, "Table analysis finished.")
         self._text_merge()
@@ -92,7 +92,7 @@ class Pdf(PdfParser):
         self._concat_downward()
         #self._filter_forpages()
 
-        cron_logger.info("paddle layouts: {}".format(
+        cron_logger.info("layouts: {}".format(
             (timer() - start) / (self.total_page + 0.1)))
         return [(b["text"], self._line_tag(b, zoomin))
                 for b in self.boxes], tbls
diff --git a/rag/app/one.py b/rag/app/one.py
index c56f121403ed99dd97db01b3ed79d18db685ecb0..430958d25e53df6133266e2a75389563cdd91fd4 100644
--- a/rag/app/one.py
+++ b/rag/app/one.py
@@ -33,7 +33,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin, drop=False)
         callback(0.63, "Layout analysis finished.")
-        print("paddle layouts:", timer() - start)
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.65, "Table analysis finished.")
         self._text_merge()
diff --git a/rag/app/paper.py b/rag/app/paper.py
index 9a75bec78815ce7ad853f192f17a92a3b5a1655c..2e707b527bdd87cfd9f702bb6b60856c83ab032e 100644
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -42,7 +42,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.63, "Layout analysis finished")
-        print("paddle layouts:", timer() - start)
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py
index f44ba3f86587f0b4cf1b1d2d08490c974e0df766..126d7e882ca02212822d8fd488dbfc1c8a71d0a9 100644
--- a/rag/svr/task_broker.py
+++ b/rag/svr/task_broker.py
@@ -33,6 +33,8 @@ from api.settings import database_logger
 from api.utils import get_format_time, get_uuid
 from api.utils.file_utils import get_project_base_directory
 from rag.utils.redis_conn import REDIS_CONN
+from api.db.db_models import init_database_tables as init_web_db
+from api.db.init_data import init_web_data
 
 
 def collect(tm):
@@ -181,6 +183,9 @@ if __name__ == "__main__":
     peewee_logger.propagate = False
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)
+    # init db
+    init_web_db()
+    init_web_data()
 
     while True:
         dispatch()
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 9ad044e87bdf269708611f01a5e310fcf387109e..7783a6308a744e62953160da6a4631f8d43d83d2 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -163,6 +163,7 @@ def build(row):
         "doc_id": row["doc_id"],
         "kb_id": [str(row["kb_id"])]
     }
+    el = 0
     for ck in cks:
         d = copy.deepcopy(doc)
         d.update(ck)
@@ -182,10 +183,13 @@ def build(row):
         else:
             d["image"].save(output_buffer, format='JPEG')
 
+        st = timer()
         MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
+        el += timer() - st
         d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
         del d["image"]
         docs.append(d)
+    cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
 
     return docs
 
@@ -258,7 +262,9 @@ def main(comm, mod):
             callback(prog=-1, msg=str(e))
             continue
 
+        st = timer()
         cks = build(r)
+        cron_logger.info("Build chunks({}): {}".format(r["name"], timer()-st))
         if cks is None:
             continue
         if not cks:
@@ -277,12 +283,14 @@ def main(comm, mod):
             callback(-1, "Embedding error:{}".format(str(e)))
             cron_logger.error(str(e))
             tk_count = 0
+        cron_logger.info("Embedding elapsed({}): {}".format(r["name"], timer()-st))
 
         callback(msg="Finished embedding({})! Start to build index!".format(timer()-st))
         init_kb(r)
         chunk_count = len(set([c["_id"] for c in cks]))
         st = timer()
         es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
+        cron_logger.info("Indexing elapsed({}): {}".format(r["name"], timer()-st))
         if es_r:
             callback(-1, "Index failure!")
             ELASTICSEARCH.deleteByQuery(