From 23b448cf960ed9ef8c20b24eac8ca4e71a04503c Mon Sep 17 00:00:00 2001 From: KevinHuSh <kevinhu.sh@gmail.com> Date: Sun, 7 Apr 2024 09:04:32 +0800 Subject: [PATCH] fix docker compose issue (#238) ### What problem does this PR solve? _Briefly describe what this PR aims to solve. Include background context that will help reviewers understand the purpose of the PR._ Issue link:#[[Link the issue here](https://github.com/infiniflow/ragflow/issues/226)] ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/document_app.py | 5 ++ api/apps/user_app.py | 6 +- api/db/init_data.py | 8 +-- api/settings.py | 2 + docker/docker-compose-CN.yml | 112 ++------------------------------- docker/docker-compose-base.yml | 110 ++++++++++++++++++++++++++++++++ docker/docker-compose.yml | 112 ++------------------------------- docker/entrypoint.sh | 2 +- docker/service_conf.yaml | 1 + rag/app/naive.py | 53 ++++++++++++++-- rag/app/table.py | 4 +- rag/llm/chat_model.py | 1 + rag/nlp/__init__.py | 5 +- rag/nlp/search.py | 2 +- 14 files changed, 192 insertions(+), 231 deletions(-) create mode 100644 docker/docker-compose-base.yml diff --git a/api/apps/document_app.py b/api/apps/document_app.py index ea06a3c..6b6715a 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -65,6 +65,11 @@ def upload(): DocumentService.query, name=file.filename, kb_id=kb.id) + filetype = filename_type(filename) + if not filetype: + return get_data_error_result( + retmsg="This type of file has not been supported yet!") + location = filename while MINIO.obj_exist(kb_id, location): location += "_" diff --git a/api/apps/user_app.py b/api/apps/user_app.py index 857c506..55f5d29 100644 --- a/api/apps/user_app.py +++ b/api/apps/user_app.py @@ -25,7 +25,7 @@ from api.utils.api_utils import server_error_response, validate_request from api.utils import get_uuid, get_format_time, decrypt, download_img from api.db import UserTenantRole, LLMType from api.settings import 
RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \ - LLM_FACTORY + LLM_FACTORY, LLM_BASE_URL from api.db.services.user_service import UserService, TenantService, UserTenantService from api.settings import stat_logger from api.utils.api_utils import get_json_result, cors_reponse @@ -220,7 +220,9 @@ def user_register(user_id, user): "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type": llm.model_type, - "api_key": API_KEY}) + "api_key": API_KEY, + "base_url": LLM_BASE_URL + }) if not UserService.save(**user): return diff --git a/api/db/init_data.py b/api/db/init_data.py index 9c735de..e2c3e88 100644 --- a/api/db/init_data.py +++ b/api/db/init_data.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import os import time import uuid @@ -21,7 +22,7 @@ from api.db.db_models import init_database_tables as init_web_db from api.db.services import UserService from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantLLMService, LLMBundle from api.db.services.user_service import TenantService, UserTenantService -from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY +from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL def init_superuser(): @@ -53,7 +54,7 @@ def init_superuser(): for llm in LLMService.query(fid=LLM_FACTORY): tenant_llm.append( {"tenant_id": user_info["id"], "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type": llm.model_type, - "api_key": API_KEY}) + "api_key": API_KEY, "base_url": LLM_BASE_URL}) if not UserService.save(**user_info): print("\033[93m【ERROR】\033[0mcan't init admin.") @@ -282,11 +283,8 @@ def init_llm_factory(): pass """ - modify service_config drop table llm; drop table llm_factories; - update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问'; 
- update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI'; update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One'; alter table knowledgebase modify avatar longtext; alter table user modify avatar longtext; diff --git a/api/settings.py b/api/settings.py index 0142fd0..4a0387c 100644 --- a/api/settings.py +++ b/api/settings.py @@ -91,6 +91,8 @@ default_llm = { } LLM = get_base_config("user_default_llm", {}) LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen") +LLM_BASE_URL = LLM.get("base_url") + if LLM_FACTORY not in default_llm: print( "\33[91m【ERROR】\33[0m:", diff --git a/docker/docker-compose-CN.yml b/docker/docker-compose-CN.yml index 67d4448..a4f3f77 100644 --- a/docker/docker-compose-CN.yml +++ b/docker/docker-compose-CN.yml @@ -1,99 +1,12 @@ version: '2.2' -services: - es01: - container_name: ragflow-es-01 - image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION} - volumes: - - esdata01:/usr/share/elasticsearch/data - ports: - - ${ES_PORT}:9200 - environment: - - node.name=es01 - - cluster.name=${CLUSTER_NAME} - - cluster.initial_master_nodes=es01 - - ELASTIC_PASSWORD=${ELASTIC_PASSWORD} - - bootstrap.memory_lock=false - - xpack.security.enabled=false - - TZ=${TIMEZONE} - mem_limit: ${MEM_LIMIT} - ulimits: - memlock: - soft: -1 - hard: -1 - healthcheck: - test: ["CMD-SHELL", "curl http://localhost:9200"] - interval: 10s - timeout: 10s - retries: 120 - networks: - - ragflow - restart: always - - kibana: - depends_on: - es01: - condition: service_healthy - image: docker.elastic.co/kibana/kibana:${STACK_VERSION} - container_name: ragflow-kibana - volumes: - - kibanadata:/usr/share/kibana/data - ports: - - ${KIBANA_PORT}:5601 - environment: - - SERVERNAME=kibana - - ELASTICSEARCH_HOSTS=http://es01:9200 - - TZ=${TIMEZONE} - mem_limit: ${MEM_LIMIT} - networks: - - ragflow - - mysql: - image: mysql:5.7.18 - container_name: 
ragflow-mysql - environment: - - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD} - - TZ=${TIMEZONE} - command: - --max_connections=1000 - --character-set-server=utf8mb4 - --collation-server=utf8mb4_general_ci - --default-authentication-plugin=mysql_native_password - --tls_version="TLSv1.2,TLSv1.3" - --init-file /data/application/init.sql - ports: - - ${MYSQL_PORT}:3306 - volumes: - - mysql_data:/var/lib/mysql - - ./init.sql:/data/application/init.sql - networks: - - ragflow - healthcheck: - test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"] - interval: 10s - timeout: 10s - retries: 3 - restart: always - - minio: - image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z - container_name: ragflow-minio - command: server --console-address ":9001" /data - ports: - - 9000:9000 - - 9001:9001 - environment: - - MINIO_ROOT_USER=${MINIO_USER} - - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD} - - TZ=${TIMEZONE} - volumes: - - minio_data:/data - networks: - - ragflow - restart: always +include: + - path: ./docker-compose-base.yml + env_file: ./.env - ragflow: +services: + ragflow: depends_on: mysql: condition: service_healthy @@ -116,18 +29,3 @@ services: networks: - ragflow restart: always - - -volumes: - esdata01: - driver: local - kibanadata: - driver: local - mysql_data: - driver: local - minio_data: - driver: local - -networks: - ragflow: - driver: bridge diff --git a/docker/docker-compose-base.yml b/docker/docker-compose-base.yml new file mode 100644 index 0000000..ae9006b --- /dev/null +++ b/docker/docker-compose-base.yml @@ -0,0 +1,110 @@ +version: '2.2' + +services: + es01: + container_name: ragflow-es-01 + image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION} + volumes: + - esdata01:/usr/share/elasticsearch/data + ports: + - ${ES_PORT}:9200 + environment: + - node.name=es01 + - cluster.name=${CLUSTER_NAME} + - cluster.initial_master_nodes=es01 + - ELASTIC_PASSWORD=${ELASTIC_PASSWORD} + - bootstrap.memory_lock=false + - xpack.security.enabled=false + 
- cluster.max_shards_per_node=4096 + - TZ=${TIMEZONE} + mem_limit: ${MEM_LIMIT} + ulimits: + memlock: + soft: -1 + hard: -1 + healthcheck: + test: ["CMD-SHELL", "curl http://localhost:9200"] + interval: 10s + timeout: 10s + retries: 120 + networks: + - ragflow + restart: always + + kibana: + depends_on: + es01: + condition: service_healthy + image: docker.elastic.co/kibana/kibana:${STACK_VERSION} + container_name: ragflow-kibana + volumes: + - kibanadata:/usr/share/kibana/data + ports: + - ${KIBANA_PORT}:5601 + environment: + - SERVERNAME=kibana + - ELASTICSEARCH_HOSTS=http://es01:9200 + - TZ=${TIMEZONE} + mem_limit: ${MEM_LIMIT} + networks: + - ragflow + + mysql: + image: mysql:5.7.18 + container_name: ragflow-mysql + environment: + - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD} + - TZ=${TIMEZONE} + command: + --max_connections=1000 + --character-set-server=utf8mb4 + --collation-server=utf8mb4_general_ci + --default-authentication-plugin=mysql_native_password + --tls_version="TLSv1.2,TLSv1.3" + --init-file /data/application/init.sql + ports: + - ${MYSQL_PORT}:3306 + volumes: + - mysql_data:/var/lib/mysql + - ./init.sql:/data/application/init.sql + networks: + - ragflow + healthcheck: + test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"] + interval: 10s + timeout: 10s + retries: 3 + restart: always + + + minio: + image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z + container_name: ragflow-minio + command: server --console-address ":9001" /data + ports: + - 9000:9000 + - 9001:9001 + environment: + - MINIO_ROOT_USER=${MINIO_USER} + - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD} + - TZ=${TIMEZONE} + volumes: + - minio_data:/data + networks: + - ragflow + restart: always + + +volumes: + esdata01: + driver: local + kibanadata: + driver: local + mysql_data: + driver: local + minio_data: + driver: local + +networks: + ragflow: + driver: bridge diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 7d6705b..f5ad8f8 100644 --- 
a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,98 +1,10 @@ version: '2.2' -services: - es01: - container_name: ragflow-es-01 - image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION} - volumes: - - esdata01:/usr/share/elasticsearch/data - ports: - - ${ES_PORT}:9200 - environment: - - node.name=es01 - - cluster.name=${CLUSTER_NAME} - - cluster.initial_master_nodes=es01 - - ELASTIC_PASSWORD=${ELASTIC_PASSWORD} - - bootstrap.memory_lock=false - - xpack.security.enabled=false - - TZ=${TIMEZONE} - mem_limit: ${MEM_LIMIT} - ulimits: - memlock: - soft: -1 - hard: -1 - healthcheck: - test: ["CMD-SHELL", "curl http://localhost:9200"] - interval: 10s - timeout: 10s - retries: 120 - networks: - - ragflow - restart: always - - kibana: - depends_on: - es01: - condition: service_healthy - image: docker.elastic.co/kibana/kibana:${STACK_VERSION} - container_name: ragflow-kibana - volumes: - - kibanadata:/usr/share/kibana/data - ports: - - ${KIBANA_PORT}:5601 - environment: - - SERVERNAME=kibana - - ELASTICSEARCH_HOSTS=http://es01:9200 - - TZ=${TIMEZONE} - mem_limit: ${MEM_LIMIT} - networks: - - ragflow - - mysql: - image: mysql:5.7.18 - container_name: ragflow-mysql - environment: - - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD} - - TZ=${TIMEZONE} - command: - --max_connections=1000 - --character-set-server=utf8mb4 - --collation-server=utf8mb4_general_ci - --default-authentication-plugin=mysql_native_password - --tls_version="TLSv1.2,TLSv1.3" - --init-file /data/application/init.sql - ports: - - ${MYSQL_PORT}:3306 - volumes: - - mysql_data:/var/lib/mysql - - ./init.sql:/data/application/init.sql - networks: - - ragflow - healthcheck: - test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"] - interval: 10s - timeout: 10s - retries: 3 - restart: always - - - minio: - image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z - container_name: ragflow-minio - command: server --console-address ":9001" /data - ports: - - 9000:9000 - - 9001:9001 - 
environment: - - MINIO_ROOT_USER=${MINIO_USER} - - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD} - - TZ=${TIMEZONE} - volumes: - - minio_data:/data - networks: - - ragflow - restart: always +include: + - path: ./docker-compose-base.yml + env_file: ./.env +services: ragflow: depends_on: mysql: @@ -107,6 +19,7 @@ services: - 443:443 volumes: - ./service_conf.yaml:/ragflow/conf/service_conf.yaml + - ./entrypoint.sh:/ragflow/entrypoint.sh - ./ragflow-logs:/ragflow/logs - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf - ./nginx/proxy.conf:/etc/nginx/proxy.conf @@ -116,18 +29,3 @@ services: networks: - ragflow restart: always - - -volumes: - esdata01: - driver: local - kibanadata: - driver: local - mysql_data: - driver: local - minio_data: - driver: local - -networks: - ragflow: - driver: bridge diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 1e76a13..6da4a81 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -23,7 +23,7 @@ function watch_broker(){ } function task_bro(){ - sleep 60; + sleep 160; watch_broker; } diff --git a/docker/service_conf.yaml b/docker/service_conf.yaml index 661d181..a277c72 100644 --- a/docker/service_conf.yaml +++ b/docker/service_conf.yaml @@ -18,6 +18,7 @@ es: user_default_llm: factory: 'Tongyi-Qianwen' api_key: 'sk-xxxxxxxxxxxxx' + base_url: '' oauth: github: client_id: xxxxxxxxxxxxxxxxxxxxxxxxx diff --git a/rag/app/naive.py b/rag/app/naive.py index dc8726f..3e43cce 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -10,14 +10,59 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -import copy +from io import BytesIO +from docx import Document import re from deepdoc.parser.pdf_parser import PlainParser from rag.app import laws from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks -from deepdoc.parser import PdfParser, ExcelParser +from deepdoc.parser import PdfParser, ExcelParser, DocxParser from rag.settings import cron_logger +class Docx(DocxParser): + def __init__(self): + pass + + def __clean(self, line): + line = re.sub(r"\u3000", " ", line).strip() + return line + + def __call__(self, filename, binary=None, from_page=0, to_page=100000): + self.doc = Document( + filename) if not binary else Document(BytesIO(binary)) + pn = 0 + lines = [] + for p in self.doc.paragraphs: + if pn > to_page: + break + if from_page <= pn < to_page and p.text.strip(): + lines.append(self.__clean(p.text)) + for run in p.runs: + if 'lastRenderedPageBreak' in run._element.xml: + pn += 1 + continue + if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: + pn += 1 + tbls = [] + for tb in self.doc.tables: + html= "<table>" + for r in tb.rows: + html += "<tr>" + i = 0 + while i < len(r.cells): + span = 1 + c = r.cells[i] + for j in range(i+1, len(r.cells)): + if c.text == r.cells[j].text: + span += 1 + i = j + i += 1 + html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>" + html += "</tr>" + html += "</table>" + tbls.append(((None, html), "")) + return [(l, "") for l in lines if l], tbls + class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, @@ -75,8 +120,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = [] if re.search(r"\.docx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - for txt in laws.Docx()(filename, binary): - sections.append((txt, "")) + sections, tbls = Docx()(filename, binary) + res = tokenize_table(tbls, doc, eng) callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, 
re.IGNORECASE): diff --git a/rag/app/table.py b/rag/app/table.py index 29fa254..aca1bbf 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -223,8 +223,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, continue if not str(row[clmns[j]]): continue - if pd.isna(row[clmns[j]]): - continue + #if pd.isna(row[clmns[j]]): + # continue fld = clmns_map[j][0] d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie( row[clmns[j]]) diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py index 623246f..c0379a8 100644 --- a/rag/llm/chat_model.py +++ b/rag/llm/chat_model.py @@ -170,3 +170,4 @@ class LocalLLM(Base): return ans, num_tokens_from_string(ans) except Exception as e: return "**ERROR**: " + str(e), 0 + diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index d0a9cf9..8aac725 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -68,6 +68,7 @@ def bullets_category(sections): def is_english(texts): eng = 0 + if not texts: return False for t in texts: if re.match(r"[a-zA-Z]{2,}", t.strip()): eng += 1 @@ -112,8 +113,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10): d = copy.deepcopy(doc) tokenize(d, rows, eng) d["content_with_weight"] = rows - d["image"] = img - add_positions(d, poss) + if img: d["image"] = img + if poss: add_positions(d, poss) res.append(d) continue de = "; " if eng else "; " diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 01564bb..178466a 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -46,7 +46,7 @@ class Dealer: "k": topk, "similarity": sim, "num_candidates": topk * 2, - "query_vector": qv + "query_vector": list(qv) } def search(self, req, idxnm, emb_mdl=None): -- GitLab