From 3d4315c42a590c8e9951e612097a69cf7c7d356b Mon Sep 17 00:00:00 2001
From: KevinHuSh <kevinhu.sh@gmail.com>
Date: Thu, 29 Feb 2024 18:53:02 +0800
Subject: [PATCH] resolve the issue of naive parser (#87)

---
 README.md                    | 40 +++++++++++++++++++++++++++---------
 api/apps/user_app.py         |  8 ++++----
 api/db/init_data.py          |  2 +-
 api/ragflow_server.py        |  1 -
 api/settings.py              |  2 +-
 deepdoc/parser/pdf_parser.py |  7 ++++---
 rag/app/naive.py             |  3 +--
 7 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 7f1e884..871c6fc 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,7 @@ English | [简体中文](./README_zh.md)
 If your machine doesn't have *Docker* installed, please refer to [Install Docker Engine](https://docs.docker.com/engine/install/)
 
 ### OS Setups
-Inorder to run [ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html), 
-you need to check the following command:
+First, check the current setting by running the following command:
 ```bash
 121:/ragflow# sysctl vm.max_map_count
 vm.max_map_count = 262144
@@ -25,23 +24,44 @@ Add or update the following line in the file:
 vm.max_map_count=262144
 ```
 
-### Here we go!
+## Here we go!
 > If you want to change the basic setups, like port, password .etc., please refer to [.env](./docker/.env) before starting the system.
 
-> If you change anything in [.env](./docker/.env), please check [service_conf.yaml](./conf/service_conf.yaml) which is a 
+> If you change anything in [.env](./docker/.env), please check [service_conf.yaml](./docker/service_conf.yaml) which is a 
 > configuration of the back-end service and should be consistent with [.env](./docker/.env).
 
-> - In [service_conf.yaml](./conf/service_conf.yaml), configuration of *LLM* in **user_default_llm** is strongly recommended. 
-> In **user_default_llm** of [service_conf.yaml](./conf/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_.
+> - In [service_conf.yaml](./docker/service_conf.yaml), configuration of *LLM* in **user_default_llm** is strongly recommended. 
+> In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_.
 > It's O.K if you don't have _API_KEY_ at the moment, you can specify it later at the setting part after starting and logging in the system.
 > - We have supported the flowing LLM factory, and the others is coming soon: 
 > [OpenAI](https://platform.openai.com/login?launch), [é€šäą‰ĺŤé—®/QWen](https://dashscope.console.aliyun.com/model), 
-> [ć™şć™®AI/ZhipuAI](https://open.bigmodel.cn/)
+> [ć™şč°±AI/ZhipuAI](https://open.bigmodel.cn/)
 ```bash
 121:/ragflow# cd docker
-121:/ragflow/docker# docker compose up 
+121:/ragflow/docker# docker compose up -d
 ```
-If after a few minutes, it stops screening and halts like following picture, _**Hallelujah!**_ You have successfully launched the system.
+After about half a minute, use the following command to check the server status. If you see the following output, 
+_**Hallelujah!**_ You have successfully launched the system.
+```bash
+121:/ragflow# docker logs -f  ragflow-server
+
+    ____                 ______ __               
+   / __ \ ____ _ ____ _ / ____// /____  _      __
+  / /_/ // __ `// __ `// /_   / // __ \| | /| / /
+ / _, _// /_/ // /_/ // __/  / // /_/ /| |/ |/ / 
+/_/ |_| \__,_/ \__, //_/    /_/ \____/ |__/|__/  
+              /____/                             
+
+ * Running on all addresses (0.0.0.0)
+ * Running on http://127.0.0.1:9380
+ * Running on http://172.22.0.5:9380
+INFO:werkzeug:Press CTRL+C to quit
+
+```
+Open your browser and enter the IP address of your server. If you see the following page, _**Hallelujah**_ again!
+> The default serving port is 80. If you want to change it, please refer to [ragflow.conf](./nginx/ragflow.conf) 
+> and change the *listen* value.
+    
 <div align="center" style="margin-top:20px;margin-bottom:20px;">
-<img src="https://github.com/infiniflow/ragflow/assets/12318111/7dc8b73f-7890-41b4-aa09-97a417cfd20b" width="1000"/>
+<img src="https://github.com/infiniflow/ragflow/assets/12318111/b24a7a5f-4d1d-4a30-90b1-7b0ec558b79d" width="1000"/>
 </div>
\ No newline at end of file
diff --git a/api/apps/user_app.py b/api/apps/user_app.py
index da352fa..2c1c814 100644
--- a/api/apps/user_app.py
+++ b/api/apps/user_app.py
@@ -70,11 +70,10 @@ def github_callback():
     }, headers={"Accept": "application/json"})
     res = res.json()
     if "error" in res:
-        return get_json_result(data=False, retcode=RetCode.AUTHENTICATION_ERROR,
-                               retmsg=res["error_description"])
+        return redirect("/?error=%s" % res["error_description"])
 
     if "user:email" not in res["scope"].split(","):
-        return get_json_result(data=False, retcode=RetCode.AUTHENTICATION_ERROR, retmsg='user:email not in scope')
+        return redirect("/?error=user:email not in scope")
 
     session["access_token"] = res["access_token"]
     session["access_token_from"] = "github"
@@ -104,8 +103,9 @@ def github_callback():
         except Exception as e:
             rollback_user_registration(user_id)
             stat_logger.exception(e)
+            return redirect("/?error=%s"%str(e))
 
-    return redirect("/knowledge")
+    return redirect("/?auth=%s"%user_id)
 
 
 def user_info_from_github(access_token):
diff --git a/api/db/init_data.py b/api/db/init_data.py
index 92dce90..b3ef43c 100644
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@@ -85,7 +85,7 @@ def init_llm_factory():
             "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
             "status": "1",
         },{
-            "name": "ć™şć™®AI",
+            "name": "ć™şč°±AI",
             "logo": "",
             "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
             "status": "1",
diff --git a/api/ragflow_server.py b/api/ragflow_server.py
index 44b4896..a0d7a71 100644
--- a/api/ragflow_server.py
+++ b/api/ragflow_server.py
@@ -13,7 +13,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-# init env. must be the first import
 
 import logging
 import os
diff --git a/api/settings.py b/api/settings.py
index e0076ed..98863db 100644
--- a/api/settings.py
+++ b/api/settings.py
@@ -58,7 +58,7 @@ default_llm = {
         "image2text_model": "gpt-4-vision-preview",
         "asr_model": "whisper-1",
     },
-    "ć™şć™®AI": {
+    "ć™şč°±AI": {
         "chat_model": "glm-3-turbo",
         "embedding_model": "embedding-2",
         "image2text_model": "glm-4v",
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index f99aa91..7f7c919 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -24,9 +24,10 @@ logging.getLogger("pdfminer").setLevel(logging.WARNING)
 class HuParser:
     def __init__(self):
         self.ocr = OCR()
-        if not hasattr(self, "model_speciess"):
-            self.model_speciess = ParserType.NAIVE.value
-        self.layouter = LayoutRecognizer("layout."+self.model_speciess)
+        if hasattr(self, "model_speciess"):
+            self.layouter = LayoutRecognizer("layout."+self.model_speciess)
+        else:
+            self.layouter = LayoutRecognizer("layout")
         self.tbl_det = TableStructureRecognizer()
 
         self.updown_cnt_mdl = xgb.Booster()
diff --git a/rag/app/naive.py b/rag/app/naive.py
index b14e7bf..c40d854 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -30,7 +30,6 @@ class Pdf(PdfParser):
 
         from timeit import default_timer as timer
         start = timer()
-        start = timer()
         self._layouts_rec(zoomin)
         callback(0.5, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
@@ -102,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
 
     parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?ă€‚ďĽ›ďĽďĽź"})
-    cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
+    cks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?ă€‚ďĽ›ďĽďĽź"))
 
     # wrap up to es documents
     for ck in cks:
-- 
GitLab