# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import json
import logging
import os
import hashlib
import copy
import re
import sys
import time
import traceback
from functools import partial

from api.db.services.file2document_service import File2DocumentService
from rag.settings import database_logger, SVR_QUEUE_NAME
from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
from multiprocessing import Pool
import numpy as np
from elasticsearch_dsl import Q
from multiprocessing.context import TimeoutError
from api.db.services.task_service import TaskService
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.minio_conn import MINIO
from rag.utils import rmSpace
from timeit import default_timer as timer
from rag.nlp import search
from io import BytesIO
import pandas as pd
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService
from api.db.services.llm_service import LLMBundle
from api.utils.file_utils import get_project_base_directory
from rag.utils.redis_conn import REDIS_CONN
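
# Route each parser_id to the chunking module that implements that document
# flavor; "general" and the NAIVE parser both fall back to the naive chunker.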
FACTORY = {
    "general": naive,
    ParserType.NAIVE.value: naive,
    ParserType.PAPER.value: paper,
    ParserType.BOOK.value: book,
    ParserType.PRESENTATION.value: presentation,
    ParserType.MANUAL.value: manual,
    ParserType.LAWS.value: laws,
    ParserType.QA.value: qa,
    ParserType.TABLE.value: table,
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
}
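
# set_progress is bound with functools.partial and handed to chunkers as a
# progress callback; a negative prog marks an error, and a canceled task
# terminates the worker via sys.exit().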
def set_progress(task_id, from_page=0, to_page=-1,
                 prog=None, msg="Processing..."):
    cancel = False
    # A negative progress value signals an error; check whether the task has
    # been canceled so the worker can stop early.
    if prog is not None and prog < 0:
        cancel = TaskService.do_cancel(task_id)
        if cancel:
            msg += " [Canceled]"
            prog = -1
    d = {"progress_msg": msg}
    if prog is not None:
        d["progress"] = prog
    try:
        TaskService.update_progress(task_id, d)
    except Exception as e:
        cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
    if cancel:
        sys.exit()
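
# collect() pops one task message from the Redis queue, acknowledges it, and
# expands it into a DataFrame of task rows; an empty frame means nothing to do.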
def collect():
    try:
        payload = REDIS_CONN.queue_consumer(
            SVR_QUEUE_NAME, "rag_flow_svr_task_broker", "rag_flow_svr_task_consumer")
        if not payload:
            time.sleep(1)
            return pd.DataFrame()
    except Exception as e:
        cron_logger.error("Get task event from queue exception:" + str(e))
        return pd.DataFrame()

    msg = payload.get_message()
    payload.ack()
    if not msg:
        return pd.DataFrame()
    if TaskService.do_cancel(msg["id"]):
        return pd.DataFrame()

    tasks = TaskService.get_tasks(msg["id"])
    assert tasks, "{} empty task!".format(msg["id"])
    tasks = pd.DataFrame(tasks)
    return tasks
def get_minio_binary(bucket, name):
    return MINIO.get(bucket, name)
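
# build() turns one task row into a list of indexable chunk dicts: fetch the
# source file from MinIO, run the parser-specific chunker, stamp each chunk
# with an MD5-derived id and creation timestamps, and offload any chunk image
# back to MinIO so only its id travels with the document.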
def build(row):
    if row["size"] > DOC_MAXIMUM_SIZE:
        set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
                     (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
        return []

    callback = partial(
        set_progress,
        row["id"],
        row["from_page"],
        row["to_page"])
    chunker = FACTORY[row["parser_id"].lower()]

    st = timer()
    try:
        bucket, name = File2DocumentService.get_minio_address(doc_id=row["doc_id"])
        binary = get_minio_binary(bucket, name)
        cron_logger.info(
            "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
        cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
                            to_page=row["to_page"], lang=row["language"], callback=callback,
                            kb_id=row["kb_id"], parser_config=row["parser_config"],
                            tenant_id=row["tenant_id"])
        cron_logger.info(
            "Chunking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
    except TimeoutError:
        callback(-1, "Internal server error: Fetch file timeout. Could you try it again?")
        cron_logger.error(
            "Chunking {}/{}: Fetch file timeout.".format(row["location"], row["name"]))
        return
    except Exception as e:
        if re.search("(No such file|not found)", str(e)):
            callback(-1, "Can not find file <%s>" % row["name"])
        else:
            callback(-1, "Internal server error: %s" %
                     str(e).replace("'", ""))
        traceback.print_exc()
        cron_logger.error(
            "Chunking {}/{}: {}".format(row["location"], row["name"], str(e)))
        return
"doc_id": row["doc_id"],
"kb_id": [str(row["kb_id"])]
md5.update((ck["content_with_weight"] +
str(d["doc_id"])).encode("utf-8"))
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
if not d.get("image"):
output_buffer = BytesIO()
if isinstance(d["image"], bytes):
output_buffer = BytesIO(d["image"])
d["image"].save(output_buffer, format='JPEG')
MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
KevinHuSh
committed
del d["image"]
cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
return docs
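
# init_kb() lazily creates the tenant's Elasticsearch index from
# conf/mapping.json the first time chunks are indexed for that tenant.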
def init_kb(row):
    idxnm = search.index_name(row["tenant_id"])
    if ELASTICSEARCH.indexExist(idxnm):
        return
    with open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r") as f:
        return ELASTICSEARCH.createIdx(idxnm, json.load(f))
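
# embedding() encodes chunk titles and bodies in batches and mixes them into
# one vector per chunk:
#     vec = title_w * title_vec + (1 - title_w) * content_vec
# where title_w defaults to 0.1 ("filename_embd_weight" in parser_config).
# Each vector is stored under a dimension-suffixed key, e.g. "q_1024_vec" for
# a 1024-dimensional model, and the token count consumed is returned.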
def embedding(docs, mdl, parser_config={}, callback=None):
    batch_size = 32
    tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
        re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
    tk_count = 0
    # Only embed titles when every doc has one; otherwise contents alone are used.
    if len(tts) == len(cnts):
        tts_ = np.array([])
        for i in range(0, len(tts), batch_size):
            vts, c = mdl.encode(tts[i: i + batch_size])
            if len(tts_) == 0:
                tts_ = vts
            else:
                tts_ = np.concatenate((tts_, vts), axis=0)
            tk_count += c
            callback(prog=0.6 + 0.1 * (i + 1) / len(tts), msg="")
        tts = tts_

    cnts_ = np.array([])
    for i in range(0, len(cnts), batch_size):
        vts, c = mdl.encode(cnts[i: i + batch_size])
        if len(cnts_) == 0:
            cnts_ = vts
        else:
            cnts_ = np.concatenate((cnts_, vts), axis=0)
        tk_count += c
        callback(prog=0.7 + 0.2 * (i + 1) / len(cnts), msg="")
    cnts = cnts_

    title_w = float(parser_config.get("filename_embd_weight", 0.1))
    vects = (title_w * tts + (1 - title_w) *
             cnts) if len(tts) == len(cnts) else cnts

    assert len(vects) == len(docs)
    for i, d in enumerate(docs):
        v = vects[i].tolist()
        d["q_%d_vec" % len(v)] = v
    return tk_count
def main():
    rows = collect()
    if len(rows) == 0:
        return

    for _, r in rows.iterrows():
        callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
        try:
            embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING,
                                 llm_name=r["embd_id"], lang=r["language"])
        except Exception as e:
            callback(-1, msg=str(e))
            cron_logger.error(str(e))
            continue

        st = timer()
        cks = build(r)
        cron_logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
        if cks is None:
            continue
        if not cks:
            callback(1., "No chunk! Done!")
            continue
        callback(
            msg="Finished slicing files(%d). Start to embed the content." %
            len(cks))

        st = timer()
        try:
            tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
        except Exception as e:
            callback(-1, "Embedding error:{}".format(str(e)))
            cron_logger.error(str(e))
            tk_count = 0
        cron_logger.info("Embedding elapsed({}): {}".format(r["name"], timer() - st))

        callback(msg="Finished embedding({})! Start to build index!".format(timer() - st))
        init_kb(r)
        chunk_count = len(set([c["_id"] for c in cks]))
        st = timer()
        es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
        cron_logger.info("Indexing elapsed({}): {}".format(r["name"], timer() - st))
        if es_r:
            # Bulk insert reported errors: surface the failure and roll back.
            callback(-1, "Index failure!")
            ELASTICSEARCH.deleteByQuery(
                Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
            cron_logger.error(str(es_r))
            continue
        if TaskService.do_cancel(r["id"]):
            ELASTICSEARCH.deleteByQuery(
                Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
            continue
        callback(1., "Done!")
        DocumentService.increment_chunk_num(
            r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
        cron_logger.info(
            "Chunk doc({}), token({}), chunks({}), elapsed:{}".format(
                r["id"], tk_count, len(cks), timer() - st))
if __name__ == "__main__":
    peewee_logger = logging.getLogger('peewee')
    peewee_logger.propagate = False
    peewee_logger.addHandler(database_logger.handlers[0])
    peewee_logger.setLevel(database_logger.level)

    while True:
        main()