Skip to content
Snippets Groups Projects
Unverified commit 436c52bb, authored by KevinHuSh and committed by GitHub
Browse files

refine presentation parser (#110)

parent 2d7c9080
No related branches found
No related tags found
No related merge requests found
...@@ -212,14 +212,17 @@ def chat(dialog, messages, **kwargs): ...@@ -212,14 +212,17 @@ def chat(dialog, messages, **kwargs):
if "max_tokens" in gen_conf: if "max_tokens" in gen_conf:
gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count) gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count)
answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf) answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf)
stat_logger.info("User: {}|Assistant: {}".format(msg[-1]["content"], answer))
if knowledges: if knowledges:
answer = retrievaler.insert_citations(answer, answer, idx = retrievaler.insert_citations(answer,
[ck["content_ltks"] for ck in kbinfos["chunks"]], [ck["content_ltks"] for ck in kbinfos["chunks"]],
[ck["vector"] for ck in kbinfos["chunks"]], [ck["vector"] for ck in kbinfos["chunks"]],
embd_mdl, embd_mdl,
tkweight=1 - dialog.vector_similarity_weight, tkweight=1 - dialog.vector_similarity_weight,
vtweight=dialog.vector_similarity_weight) vtweight=dialog.vector_similarity_weight)
idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
kbinfos["doc_aggs"] = [d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
for c in kbinfos["chunks"]: for c in kbinfos["chunks"]:
if c.get("vector"): del c["vector"] if c.get("vector"): del c["vector"]
return {"answer": answer, "reference": kbinfos} return {"answer": answer, "reference": kbinfos}
......
...@@ -88,20 +88,25 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca ...@@ -88,20 +88,25 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
res = [] res = []
if re.search(r"\.pptx?$", filename, re.IGNORECASE): if re.search(r"\.pptx?$", filename, re.IGNORECASE):
ppt_parser = Ppt() ppt_parser = Ppt()
for txt,img in ppt_parser(filename if not binary else binary, from_page, 1000000, callback): for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
d = copy.deepcopy(doc) d = copy.deepcopy(doc)
pn += from_page
d["image"] = img d["image"] = img
tokenize(d, txt, ppt_parser.is_english) d["page_num_int"] = [pn+1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, eng)
res.append(d) res.append(d)
return res return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() pdf_parser = Pdf()
for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)): for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
d = copy.deepcopy(doc) d = copy.deepcopy(doc)
pn += from_page
d["image"] = img d["image"] = img
d["page_num_int"] = [pn+1] d["page_num_int"] = [pn+1]
d["top_int"] = [0] d["top_int"] = [0]
d["position_int"].append((pn + 1, 0, img.size[0], 0, img.size[1])) d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, eng) tokenize(d, txt, eng)
res.append(d) res.append(d)
return res return res
......
...@@ -243,7 +243,7 @@ class Dealer: ...@@ -243,7 +243,7 @@ class Dealer:
res += f" ##{c}$$" res += f" ##{c}$$"
seted.add(c) seted.add(c)
return res return res, seted
def rerank(self, sres, query, tkweight=0.3, def rerank(self, sres, query, tkweight=0.3,
vtweight=0.7, cfield="content_ltks"): vtweight=0.7, cfield="content_ltks"):
...@@ -290,7 +290,7 @@ class Dealer: ...@@ -290,7 +290,7 @@ class Dealer:
start_idx -= 1 start_idx -= 1
if start_idx >= 0: if start_idx >= 0:
continue continue
if len(ranks["chunks"]) == page_size: if len(ranks["chunks"]) >= page_size:
if aggs: if aggs:
continue continue
break break
...@@ -322,7 +322,7 @@ class Dealer: ...@@ -322,7 +322,7 @@ class Dealer:
if dnm not in ranks["doc_aggs"]: if dnm not in ranks["doc_aggs"]:
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0} ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
ranks["doc_aggs"][dnm]["count"] += 1 ranks["doc_aggs"][dnm]["count"] += 1
ranks["doc_aggs"] = []#[{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)] ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
return ranks return ranks
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment