From 23b448cf960ed9ef8c20b24eac8ca4e71a04503c Mon Sep 17 00:00:00 2001
From: KevinHuSh <kevinhu.sh@gmail.com>
Date: Sun, 7 Apr 2024 09:04:32 +0800
Subject: [PATCH] fix docker compose issue (#238)

### What problem does this PR solve?

Extract the shared infrastructure services (Elasticsearch, Kibana, MySQL,
MinIO) from `docker-compose.yml` and `docker-compose-CN.yml` into a new
`docker-compose-base.yml` pulled in via Compose `include`, so the two
top-level files no longer duplicate ~100 lines each. Also adds a
configurable `base_url` for the default LLM factory (service config,
settings, user registration, superuser init), rejects uploads of
unsupported file types early, adds DOCX table parsing to the naive
chunker, and raises the task-broker startup delay in `entrypoint.sh`.

Issue link: https://github.com/infiniflow/ragflow/issues/226

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 api/apps/document_app.py       |   5 ++
 api/apps/user_app.py           |   6 +-
 api/db/init_data.py            |   8 +--
 api/settings.py                |   2 +
 docker/docker-compose-CN.yml   | 112 ++-------------------------------
 docker/docker-compose-base.yml | 110 ++++++++++++++++++++++++++++++++
 docker/docker-compose.yml      | 112 ++-------------------------------
 docker/entrypoint.sh           |   2 +-
 docker/service_conf.yaml       |   1 +
 rag/app/naive.py               |  53 ++++++++++++++--
 rag/app/table.py               |   4 +-
 rag/llm/chat_model.py          |   1 +
 rag/nlp/__init__.py            |   5 +-
 rag/nlp/search.py              |   2 +-
 14 files changed, 192 insertions(+), 231 deletions(-)
 create mode 100644 docker/docker-compose-base.yml

diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index ea06a3c..6b6715a 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -65,6 +65,11 @@ def upload():
             DocumentService.query,
             name=file.filename,
             kb_id=kb.id)
+        filetype = filename_type(filename)
+        if not filetype:
+            return get_data_error_result(
+                retmsg="This type of file has not been supported yet!")
+
         location = filename
         while MINIO.obj_exist(kb_id, location):
             location += "_"
diff --git a/api/apps/user_app.py b/api/apps/user_app.py
index 857c506..55f5d29 100644
--- a/api/apps/user_app.py
+++ b/api/apps/user_app.py
@@ -25,7 +25,7 @@ from api.utils.api_utils import server_error_response, validate_request
 from api.utils import get_uuid, get_format_time, decrypt, download_img
 from api.db import UserTenantRole, LLMType
 from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
-    LLM_FACTORY
+    LLM_FACTORY, LLM_BASE_URL
 from api.db.services.user_service import UserService, TenantService, UserTenantService
 from api.settings import stat_logger
 from api.utils.api_utils import get_json_result, cors_reponse
@@ -220,7 +220,9 @@ def user_register(user_id, user):
                            "llm_factory": LLM_FACTORY,
                            "llm_name": llm.llm_name,
                            "model_type": llm.model_type,
-                           "api_key": API_KEY})
+                           "api_key": API_KEY,
+                           "base_url": LLM_BASE_URL
+                           })
 
     if not UserService.save(**user):
         return
diff --git a/api/db/init_data.py b/api/db/init_data.py
index 9c735de..e2c3e88 100644
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@@ -13,6 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import os
 import time
 import uuid
 
@@ -21,7 +22,7 @@ from api.db.db_models import init_database_tables as init_web_db
 from api.db.services import UserService
 from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantLLMService, LLMBundle
 from api.db.services.user_service import TenantService, UserTenantService
-from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY
+from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL
 
 
 def init_superuser():
@@ -53,7 +54,7 @@ def init_superuser():
     for llm in LLMService.query(fid=LLM_FACTORY):
         tenant_llm.append(
             {"tenant_id": user_info["id"], "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type": llm.model_type,
-             "api_key": API_KEY})
+             "api_key": API_KEY, "base_url": LLM_BASE_URL})
 
     if not UserService.save(**user_info):
         print("\033[93m【ERROR】\033[0mcan't init admin.")
@@ -282,11 +283,8 @@ def init_llm_factory():
             pass
 
     """
-    modify service_config
     drop table llm;
     drop table llm_factories;
-    update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
-    update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
     update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
     alter table knowledgebase modify avatar longtext;
     alter table user modify avatar longtext;
diff --git a/api/settings.py b/api/settings.py
index 0142fd0..4a0387c 100644
--- a/api/settings.py
+++ b/api/settings.py
@@ -91,6 +91,8 @@ default_llm = {
 }
 LLM = get_base_config("user_default_llm", {})
 LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen")
+LLM_BASE_URL = LLM.get("base_url")
+
 if LLM_FACTORY not in default_llm:
     print(
         "\33[91m【ERROR】\33[0m:",
diff --git a/docker/docker-compose-CN.yml b/docker/docker-compose-CN.yml
index 67d4448..a4f3f77 100644
--- a/docker/docker-compose-CN.yml
+++ b/docker/docker-compose-CN.yml
@@ -1,99 +1,12 @@
 version: '2.2'
-services:
-  es01:
-    container_name: ragflow-es-01
-    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
-    volumes:
-      - esdata01:/usr/share/elasticsearch/data
-    ports:
-      - ${ES_PORT}:9200
-    environment:
-      - node.name=es01
-      - cluster.name=${CLUSTER_NAME}
-      - cluster.initial_master_nodes=es01
-      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
-      - bootstrap.memory_lock=false
-      - xpack.security.enabled=false
-      - TZ=${TIMEZONE}
-    mem_limit: ${MEM_LIMIT}
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-    healthcheck:
-      test: ["CMD-SHELL", "curl http://localhost:9200"]
-      interval: 10s
-      timeout: 10s
-      retries: 120
-    networks:
-      - ragflow
-    restart: always
-
-  kibana:
-    depends_on:
-        es01:
-          condition: service_healthy
-    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
-    container_name: ragflow-kibana
-    volumes:
-      - kibanadata:/usr/share/kibana/data
-    ports:
-      - ${KIBANA_PORT}:5601
-    environment:
-      - SERVERNAME=kibana
-      - ELASTICSEARCH_HOSTS=http://es01:9200
-      - TZ=${TIMEZONE}
-    mem_limit: ${MEM_LIMIT}
-    networks:
-      - ragflow
-
-  mysql:
-    image: mysql:5.7.18
-    container_name: ragflow-mysql
-    environment:
-      - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
-      - TZ=${TIMEZONE}
-    command:
-      --max_connections=1000
-      --character-set-server=utf8mb4
-      --collation-server=utf8mb4_general_ci
-      --default-authentication-plugin=mysql_native_password
-      --tls_version="TLSv1.2,TLSv1.3"
-      --init-file /data/application/init.sql
-    ports:
-      - ${MYSQL_PORT}:3306
-    volumes:
-      - mysql_data:/var/lib/mysql
-      - ./init.sql:/data/application/init.sql
-    networks:
-      - ragflow
-    healthcheck:
-      test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"]
-      interval: 10s
-      timeout: 10s
-      retries: 3
-    restart: always
-
 
-  minio:
-    image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
-    container_name: ragflow-minio
-    command: server --console-address ":9001" /data
-    ports:
-      - 9000:9000
-      - 9001:9001
-    environment:
-      - MINIO_ROOT_USER=${MINIO_USER}
-      - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
-      - TZ=${TIMEZONE}
-    volumes:
-      - minio_data:/data
-    networks:
-      - ragflow
-    restart: always
 
+include:
+  - path: ./docker-compose-base.yml
+    env_file: ./.env
 
-  ragflow:
+services:
+   ragflow:
     depends_on:
       mysql:
         condition: service_healthy
@@ -116,18 +29,3 @@ services:
     networks:
       - ragflow
     restart: always
-
-
-volumes:
-  esdata01:
-    driver: local
-  kibanadata:
-    driver: local
-  mysql_data:
-    driver: local
-  minio_data:
-    driver: local
-
-networks:
-  ragflow:
-    driver: bridge
diff --git a/docker/docker-compose-base.yml b/docker/docker-compose-base.yml
new file mode 100644
index 0000000..ae9006b
--- /dev/null
+++ b/docker/docker-compose-base.yml
@@ -0,0 +1,110 @@
+version: '2.2'
+
+services:
+  es01:
+    container_name: ragflow-es-01
+    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
+    volumes:
+      - esdata01:/usr/share/elasticsearch/data
+    ports:
+      - ${ES_PORT}:9200
+    environment:
+      - node.name=es01
+      - cluster.name=${CLUSTER_NAME}
+      - cluster.initial_master_nodes=es01
+      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
+      - bootstrap.memory_lock=false
+      - xpack.security.enabled=false
+      - cluster.max_shards_per_node=4096
+      - TZ=${TIMEZONE}
+    mem_limit: ${MEM_LIMIT}
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    healthcheck:
+      test: ["CMD-SHELL", "curl http://localhost:9200"]
+      interval: 10s
+      timeout: 10s
+      retries: 120
+    networks:
+      - ragflow
+    restart: always
+
+  kibana:
+    depends_on:
+        es01:
+          condition: service_healthy
+    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
+    container_name: ragflow-kibana
+    volumes:
+      - kibanadata:/usr/share/kibana/data
+    ports:
+      - ${KIBANA_PORT}:5601
+    environment:
+      - SERVERNAME=kibana
+      - ELASTICSEARCH_HOSTS=http://es01:9200
+      - TZ=${TIMEZONE}
+    mem_limit: ${MEM_LIMIT}
+    networks:
+      - ragflow
+
+  mysql:
+    image: mysql:5.7.18
+    container_name: ragflow-mysql
+    environment:
+      - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
+      - TZ=${TIMEZONE}
+    command:
+      --max_connections=1000
+      --character-set-server=utf8mb4
+      --collation-server=utf8mb4_general_ci
+      --default-authentication-plugin=mysql_native_password
+      --tls_version="TLSv1.2,TLSv1.3"
+      --init-file /data/application/init.sql
+    ports:
+      - ${MYSQL_PORT}:3306
+    volumes:
+      - mysql_data:/var/lib/mysql
+      - ./init.sql:/data/application/init.sql
+    networks:
+      - ragflow
+    healthcheck:
+      test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"]
+      interval: 10s
+      timeout: 10s
+      retries: 3
+    restart: always
+
+
+  minio:
+    image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
+    container_name: ragflow-minio
+    command: server --console-address ":9001" /data
+    ports:
+      - 9000:9000
+      - 9001:9001
+    environment:
+      - MINIO_ROOT_USER=${MINIO_USER}
+      - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
+      - TZ=${TIMEZONE}
+    volumes:
+      - minio_data:/data
+    networks:
+      - ragflow
+    restart: always
+
+
+volumes:
+  esdata01:
+    driver: local
+  kibanadata:
+    driver: local
+  mysql_data:
+    driver: local
+  minio_data:
+    driver: local
+
+networks:
+  ragflow:
+    driver: bridge
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 7d6705b..f5ad8f8 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -1,98 +1,10 @@
 version: '2.2'
-services:
-  es01:
-    container_name: ragflow-es-01
-    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
-    volumes:
-      - esdata01:/usr/share/elasticsearch/data
-    ports:
-      - ${ES_PORT}:9200
-    environment:
-      - node.name=es01
-      - cluster.name=${CLUSTER_NAME}
-      - cluster.initial_master_nodes=es01
-      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
-      - bootstrap.memory_lock=false
-      - xpack.security.enabled=false
-      - TZ=${TIMEZONE}
-    mem_limit: ${MEM_LIMIT}
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-    healthcheck:
-      test: ["CMD-SHELL", "curl http://localhost:9200"]
-      interval: 10s
-      timeout: 10s
-      retries: 120
-    networks:
-      - ragflow
-    restart: always
-
-  kibana:
-    depends_on:
-        es01:
-          condition: service_healthy
-    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
-    container_name: ragflow-kibana
-    volumes:
-      - kibanadata:/usr/share/kibana/data
-    ports:
-      - ${KIBANA_PORT}:5601
-    environment:
-      - SERVERNAME=kibana
-      - ELASTICSEARCH_HOSTS=http://es01:9200
-      - TZ=${TIMEZONE}
-    mem_limit: ${MEM_LIMIT}
-    networks:
-      - ragflow
-
-  mysql:
-    image: mysql:5.7.18
-    container_name: ragflow-mysql
-    environment:
-      - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
-      - TZ=${TIMEZONE}
-    command:
-      --max_connections=1000
-      --character-set-server=utf8mb4
-      --collation-server=utf8mb4_general_ci
-      --default-authentication-plugin=mysql_native_password
-      --tls_version="TLSv1.2,TLSv1.3"
-      --init-file /data/application/init.sql
-    ports:
-      - ${MYSQL_PORT}:3306
-    volumes:
-      - mysql_data:/var/lib/mysql
-      - ./init.sql:/data/application/init.sql
-    networks:
-      - ragflow
-    healthcheck:
-      test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"]
-      interval: 10s
-      timeout: 10s
-      retries: 3
-    restart: always
-
-
-  minio:
-    image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
-    container_name: ragflow-minio
-    command: server --console-address ":9001" /data
-    ports:
-      - 9000:9000
-      - 9001:9001
-    environment:
-      - MINIO_ROOT_USER=${MINIO_USER}
-      - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
-      - TZ=${TIMEZONE}
-    volumes:
-      - minio_data:/data
-    networks:
-      - ragflow
-    restart: always
 
+include:
+  - path: ./docker-compose-base.yml
+    env_file: ./.env
 
+services:
   ragflow:
     depends_on:
       mysql:
@@ -107,6 +19,7 @@ services:
       - 443:443
     volumes:
       - ./service_conf.yaml:/ragflow/conf/service_conf.yaml
+      - ./entrypoint.sh:/ragflow/entrypoint.sh
       - ./ragflow-logs:/ragflow/logs
       - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf
       - ./nginx/proxy.conf:/etc/nginx/proxy.conf
@@ -116,18 +29,3 @@ services:
     networks:
       - ragflow
     restart: always
-
-
-volumes:
-  esdata01:
-    driver: local
-  kibanadata:
-    driver: local
-  mysql_data:
-    driver: local
-  minio_data:
-    driver: local
-
-networks:
-  ragflow:
-    driver: bridge
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index 1e76a13..6da4a81 100644
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -23,7 +23,7 @@ function watch_broker(){
 }
 
 function task_bro(){
-    sleep 60;
+    sleep 160;
     watch_broker;
 }
 
diff --git a/docker/service_conf.yaml b/docker/service_conf.yaml
index 661d181..a277c72 100644
--- a/docker/service_conf.yaml
+++ b/docker/service_conf.yaml
@@ -18,6 +18,7 @@ es:
 user_default_llm:
   factory: 'Tongyi-Qianwen'
   api_key: 'sk-xxxxxxxxxxxxx'
+  base_url: ''
 oauth:
   github:
     client_id: xxxxxxxxxxxxxxxxxxxxxxxxx
diff --git a/rag/app/naive.py b/rag/app/naive.py
index dc8726f..3e43cce 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -10,14 +10,59 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import copy
+from io import BytesIO
+from docx import Document
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.app import laws
 from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
-from deepdoc.parser import PdfParser, ExcelParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from rag.settings import cron_logger
 
+class Docx(DocxParser):
+    def __init__(self):
+        pass
+
+    def __clean(self, line):
+        line = re.sub(r"\u3000", " ", line).strip()
+        return line
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        lines = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            if from_page <= pn < to_page and p.text.strip():
+                lines.append(self.__clean(p.text))
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        tbls = []
+        for tb in self.doc.tables:
+            html= "<table>"
+            for r in tb.rows:
+                html += "<tr>"
+                i = 0
+                while i < len(r.cells):
+                    span = 1
+                    c = r.cells[i]
+                    for j in range(i+1, len(r.cells)):
+                        if c.text == r.cells[j].text:
+                            span += 1
+                            i = j
+                    i += 1
+                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
+                html += "</tr>"
+            html += "</table>"
+        tbls.append(((None, html), ""))
+        return [(l, "") for l in lines if l], tbls
+
 
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
@@ -75,8 +120,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     sections = []
     if re.search(r"\.docx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        for txt in laws.Docx()(filename, binary):
-            sections.append((txt, ""))
+        sections, tbls = Docx()(filename, binary)
+        res = tokenize_table(tbls, doc, eng)
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
diff --git a/rag/app/table.py b/rag/app/table.py
index 29fa254..aca1bbf 100644
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -223,8 +223,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                     continue
                 if not str(row[clmns[j]]):
                     continue
-                if pd.isna(row[clmns[j]]):
-                    continue
+                #if pd.isna(row[clmns[j]]):
+                #    continue
                 fld = clmns_map[j][0]
                 d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
                     row[clmns[j]])
diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py
index 623246f..c0379a8 100644
--- a/rag/llm/chat_model.py
+++ b/rag/llm/chat_model.py
@@ -170,3 +170,4 @@ class LocalLLM(Base):
             return ans, num_tokens_from_string(ans)
         except Exception as e:
             return "**ERROR**: " + str(e), 0
+
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index d0a9cf9..8aac725 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -68,6 +68,7 @@ def bullets_category(sections):
 
 def is_english(texts):
     eng = 0
+    if not texts: return False
     for t in texts:
         if re.match(r"[a-zA-Z]{2,}", t.strip()):
             eng += 1
@@ -112,8 +113,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
             d = copy.deepcopy(doc)
             tokenize(d, rows, eng)
             d["content_with_weight"] = rows
-            d["image"] = img
-            add_positions(d, poss)
+            if img: d["image"] = img
+            if poss: add_positions(d, poss)
             res.append(d)
             continue
         de = "; " if eng else "; "
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index 01564bb..178466a 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -46,7 +46,7 @@ class Dealer:
             "k": topk,
             "similarity": sim,
             "num_candidates": topk * 2,
-            "query_vector": qv
+            "query_vector": list(qv)
         }
 
     def search(self, req, idxnm, emb_mdl=None):
-- 
GitLab