From 0dfc8ddc0fef83fbe6a553b7997dee6aca52de62 Mon Sep 17 00:00:00 2001 From: KevinHuSh <kevinhu.sh@gmail.com> Date: Tue, 23 Apr 2024 14:41:10 +0800 Subject: [PATCH] enlarge docker memory usage (#501) ### What problem does this PR solve? ### Type of change - [x] Refactoring --- deepdoc/parser/pdf_parser.py | 17 ++++++++--------- docker/.env | 4 +++- docker/docker-compose-base.yml | 34 +++++++++++++++++----------------- docker/entrypoint.sh | 2 +- rag/app/book.py | 2 +- rag/app/laws.py | 2 +- rag/app/manual.py | 2 +- rag/app/naive.py | 6 +++--- rag/app/one.py | 2 +- rag/app/paper.py | 2 +- rag/svr/task_broker.py | 5 +++++ rag/svr/task_executor.py | 8 ++++++++ 12 files changed, 50 insertions(+), 36 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index b8a6924..6bda369 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -11,7 +11,7 @@ import pdfplumber import logging from PIL import Image, ImageDraw import numpy as np - +from timeit import default_timer as timer from PyPDF2 import PdfReader as pdf2_read from api.utils.file_utils import get_project_base_directory @@ -936,6 +936,7 @@ class HuParser: self.page_cum_height = [0] self.page_layout = [] self.page_from = page_from + st = timer() try: self.pdf = pdfplumber.open(fnm) if isinstance( fnm, str) else pdfplumber.open(BytesIO(fnm)) @@ -989,7 +990,9 @@ class HuParser: self.is_english = True else: self.is_english = False + self.is_english = False + st = timer() for i, img in enumerate(self.page_images): chars = self.page_chars[i] if not self.is_english else [] self.mean_height.append( @@ -1007,15 +1010,11 @@ class HuParser: chars[j]["width"]) / 2: chars[j]["text"] += " " j += 1 - # if i > 0: - # if not chars: - # self.page_cum_height.append(img.size[1] / zoomin) - # else: - # self.page_cum_height.append( - # np.max([c["bottom"] for c in chars])) + self.__ocr(i + 1, img, chars, zoomin) - if callback: - callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") + #if callback: + # callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") + #print("OCR:", timer()-st) if not self.is_english and not any( [c for c in self.page_chars]) and self.boxes: diff --git a/docker/.env b/docker/.env index ddb181d..9f028c8 100644 --- a/docker/.env +++ b/docker/.env @@ -11,7 +11,9 @@ ES_PORT=1200 KIBANA_PORT=6601 # Increase or decrease based on the available host memory (in bytes) -MEM_LIMIT=12073741824 + +MEM_LIMIT=8073741824 + MYSQL_PASSWORD=infini_rag_flow MYSQL_PORT=5455 diff --git a/docker/docker-compose-base.yml b/docker/docker-compose-base.yml index b110ce9..24519eb 100644 --- a/docker/docker-compose-base.yml +++ b/docker/docker-compose-base.yml @@ -29,23 +29,23 @@ services: - ragflow restart: always - kibana: - depends_on: - es01: - condition: service_healthy - image: docker.elastic.co/kibana/kibana:${STACK_VERSION} - container_name: ragflow-kibana - volumes: - - kibanadata:/usr/share/kibana/data - ports: - - ${KIBANA_PORT}:5601 - environment: - - SERVERNAME=kibana - - ELASTICSEARCH_HOSTS=http://es01:9200 - - TZ=${TIMEZONE} - mem_limit: ${MEM_LIMIT} - networks: - - ragflow + #kibana: + # depends_on: + # es01: + # condition: service_healthy + # image: docker.elastic.co/kibana/kibana:${STACK_VERSION} + # container_name: ragflow-kibana + # volumes: + # - kibanadata:/usr/share/kibana/data + # ports: + # - ${KIBANA_PORT}:5601 + # environment: + # - SERVERNAME=kibana + # - ELASTICSEARCH_HOSTS=http://es01:9200 + # - TZ=${TIMEZONE} + # mem_limit: ${MEM_LIMIT} + # networks: + # - ragflow mysql: image: mysql:5.7.18 diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 6da4a81..a136161 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -29,7 +29,7 @@ function task_bro(){ task_bro & -WS=2 +WS=1 for ((i=0;i<WS;i++)) do task_exe $i $WS & diff --git a/rag/app/book.py b/rag/app/book.py index a76513e..3c46b68 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -37,7 +37,7 @@ class Pdf(PdfParser): start = timer() self._layouts_rec(zoomin) callback(0.67, "Layout analysis finished") - print("paddle layouts:", timer() - start) + print("layouts:", timer() - start) self._table_transformer_job(zoomin) callback(0.68, "Table analysis finished") self._text_merge() diff --git a/rag/app/laws.py b/rag/app/laws.py index 9b77b4f..acb9669 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -71,7 +71,7 @@ class Pdf(PdfParser): start = timer() self._layouts_rec(zoomin) callback(0.67, "Layout analysis finished") - cron_logger.info("paddle layouts:".format( + cron_logger.info("layouts:".format( (timer() - start) / (self.total_page + 0.1))) self._naive_vertical_merge() diff --git a/rag/app/manual.py b/rag/app/manual.py index bc8e0f1..a6fd653 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -32,7 +32,7 @@ class Pdf(PdfParser): self._layouts_rec(zoomin) callback(0.65, "Layout analysis finished.") - print("paddle layouts:", timer() - start) + print("layouts:", timer() - start) self._table_transformer_job(zoomin) callback(0.67, "Table analysis finished.") self._text_merge() diff --git a/rag/app/naive.py b/rag/app/naive.py index 0fcbd9f..82618b1 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -77,12 +77,12 @@ class Pdf(PdfParser): callback ) callback(msg="OCR finished") - cron_logger.info("OCR: {}".format(timer() - start)) + cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) start = timer() self._layouts_rec(zoomin) callback(0.63, "Layout analysis finished.") - print("paddle layouts:", timer() - start) + print("layouts:", timer() - start) self._table_transformer_job(zoomin) callback(0.65, "Table analysis finished.") self._text_merge() @@ -92,7 +92,7 @@ class Pdf(PdfParser): self._concat_downward() #self._filter_forpages() - cron_logger.info("paddle layouts: {}".format( + cron_logger.info("layouts: {}".format( (timer() - start) / (self.total_page + 0.1))) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls diff --git a/rag/app/one.py b/rag/app/one.py index c56f121..430958d 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -33,7 +33,7 @@ class Pdf(PdfParser): start = timer() self._layouts_rec(zoomin, drop=False) callback(0.63, "Layout analysis finished.") - print("paddle layouts:", timer() - start) + print("layouts:", timer() - start) self._table_transformer_job(zoomin) callback(0.65, "Table analysis finished.") self._text_merge() diff --git a/rag/app/paper.py b/rag/app/paper.py index 9a75bec..2e707b5 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -42,7 +42,7 @@ class Pdf(PdfParser): start = timer() self._layouts_rec(zoomin) callback(0.63, "Layout analysis finished") - print("paddle layouts:", timer() - start) + print("layouts:", timer() - start) self._table_transformer_job(zoomin) callback(0.68, "Table analysis finished") self._text_merge() diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py index f44ba3f..126d7e8 100644 --- a/rag/svr/task_broker.py +++ b/rag/svr/task_broker.py @@ -33,6 +33,8 @@ from api.settings import database_logger from api.utils import get_format_time, get_uuid from api.utils.file_utils import get_project_base_directory from rag.utils.redis_conn import REDIS_CONN +from api.db.db_models import init_database_tables as init_web_db +from api.db.init_data import init_web_data def collect(tm): @@ -181,6 +183,9 @@ if __name__ == "__main__": peewee_logger.propagate = False peewee_logger.addHandler(database_logger.handlers[0]) peewee_logger.setLevel(database_logger.level) + # init db + init_web_db() + init_web_data() while True: dispatch() diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 9ad044e..7783a63 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -163,6 +163,7 @@ def build(row): "doc_id": row["doc_id"], "kb_id": [str(row["kb_id"])] } + el = 0 for ck in cks: d = copy.deepcopy(doc) d.update(ck) @@ -182,10 +183,13 @@ def build(row): else: d["image"].save(output_buffer, format='JPEG') + st = timer() MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue()) + el += timer() - st d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"]) del d["image"] docs.append(d) + cron_logger.info("MINIO PUT({}):{}".format(row["name"], el)) return docs @@ -258,7 +262,9 @@ def main(comm, mod): callback(prog=-1, msg=str(e)) continue + st = timer() cks = build(r) + cron_logger.info("Build chunks({}): {}".format(r["name"], timer()-st)) if cks is None: continue if not cks: @@ -277,12 +283,14 @@ def main(comm, mod): callback(-1, "Embedding error:{}".format(str(e))) cron_logger.error(str(e)) tk_count = 0 + cron_logger.info("Embedding elapsed({}): {}".format(r["name"], timer()-st)) callback(msg="Finished embedding({})! Start to build index!".format(timer()-st)) init_kb(r) chunk_count = len(set([c["_id"] for c in cks])) st = timer() es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"])) + cron_logger.info("Indexing elapsed({}): {}".format(r["name"], timer()-st)) if es_r: callback(-1, "Index failure!") ELASTICSEARCH.deleteByQuery( -- GitLab