Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
sqad2zeo_db
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
nlp
sqad2zeo_db
Commits
bcf2beb9
There was an error fetching the commit references. Please try again later.
Commit
bcf2beb9
authored
4 years ago
by
Marek Medved
Browse files
Options
Downloads
Patches
Plain Diff
s_bert and cls_bert for KB
parent
08881ebb
No related branches found
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
Makefile
+33
-29
33 additions, 29 deletions
Makefile
add_bert_emberdings.py
+2
-1
2 additions, 1 deletion
add_bert_emberdings.py
query_database.py
+16
-5
16 additions, 5 deletions
query_database.py
sqad_db.py
+8
-2
8 additions, 2 deletions
sqad_db.py
with
59 additions
and
37 deletions
Makefile
+
33
−
29
View file @
bcf2beb9
DB_NAME
=
sqad_db/devel/sqad_v3_
$(
shell
date
+
"%d-%m-%Y_%H-%M-%S"
)
_base
VERSION
=
$(
shell
cat
./sqad_db/version
)
NEW_VERSION
=
$$(
(
$(
VERSION
)
+1
)
)
UPDB
=
$(
DB
)
_with_updates_
$(
shell
date
+
"%d-%m-%Y_%H-%M-%S"
)
#
UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
# Need to specify bash in order for conda activate to work.
SHELL
=
/bin/bash
...
...
@@ -10,47 +10,51 @@ CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activ
# create: build a fresh SQAD database named $(DB_NAME).
#
# Steps, in order:
#   1. Append a header and a copy of this Makefile to $(DB_NAME).log.
#   2. Write $(NEW_VERSION) to ./sqad_db/version.
#   3. Run ./sqad2database.py in the conda "base" environment, appending its
#      stderr to $(DB_NAME).log.
#   4. Mail the name of the build host as a completion notice.
#
# Fix: the notification used "$(hostname)", which make expands as an
# (undefined) make variable, so the mail body was always empty. It is now
# "$$(hostname)", which reaches the shell as "$(hostname)" and runs the command.
#
# NOTE(review): DB_NAME is assigned with '=' around $(shell date ...), so each
# reference below re-runs date; the references could disagree if expansion
# straddles a second boundary. Consider ':=' in the definition -- TODO confirm.
create:
	printf "SQAD to DB\n=======================\n" >> $(DB_NAME).log
	cat ./Makefile >> $(DB_NAME).log
	echo $(NEW_VERSION) > ./sqad_db/version
	($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log)
	echo "$$(hostname)" | mail -s "Done sqad_db created" "xmedved1@fi.muni.cz"
updates
:
@
echo
"creating update
$(
DB
)
->
$(
UPDB
)
"
cp
$(
DB
)
$(
UPDB
)
cp
$(
DB
)
.index
$(
UPDB
)
.index
cp
$(
DB
)
.lock
$(
UPDB
)
.lock
cp
$(
DB
)
.tmp
$(
UPDB
)
.tmp
cat
./Makefile
>>
$(
UPDB
)
.log
@
echo
"creating updates
$(
DB
)
"
# Word Bert embeddings
printf
"add bert embeddings
\n
=======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
bert
;
./add_bert_emberdings.py
-d
$(
UPDB
)
2>>
$(
UPDB
)
.log
)
./make_copy.sh
$(
DB
)
$(
DB
)
_Vbert
printf
"add bert embeddings
\n
=======================
\n
"
>>
$(
DB
)
_Vbert.log
(
$(
CONDA_ACTIVATE
)
bert
;
./add_bert_emberdings.py
-d
$(
DB
)
_Vbert 2>>
$(
DB
)
_Vbert.log
)
# Contains answer sentence
printf
"Contains answer
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
base
;
./add_contains_answer_sentences.py
-d
$(
UPDB
)
2>>
$(
UPDB
)
.log
)
./make_copy.sh
$(
DB
)
_Vbert
$(
DB
)
_Vbert_addAS
printf
"Contains answer
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS.log
(
$(
CONDA_ACTIVATE
)
base
;
./add_contains_answer_sentences.py
-d
$(
DB
)
_Vbert_addAS 2>>
$(
DB
)
_Vbert_addAS.log
)
# Similar sentences
printf
"Similar answers
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
base
;
./add_similar_senteces.py
-d
$(
UPDB
)
-n
0 2>>
$(
UPDB
)
.log
)
./make_copy.sh
$(
DB
)
_Vbert_addAS
$(
DB
)
_Vbert_addAS_simS
printf
"Similar answers
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS_simS.log
(
$(
CONDA_ACTIVATE
)
base
;
./add_similar_senteces.py
-n
0
-d
$(
DB
)
_Vbert_addAS_simS 2>>
$(
DB
)
_Vbert_addAS_simS.log
)
# Context NP
printf
"Contex NP phrases context_window 3
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
base
;
./context_np.py
-d
$(
UPDB
)
--context_window
3
--phr_per_sent
"longest"
2>>
$(
UPDB
)
.log
)
printf
"Contex NP phrases context_window 2
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
base
;
./context_np.py
-d
$(
UPDB
)
--context_window
2
--phr_per_sent
"longest"
2>>
$(
UPDB
)
.log
)
./make_copy.sh
$(
DB
)
_Vbert_addAS_simS
$(
DB
)
_Vbert_addAS_simS_cNP
printf
"Contex NP phrases context_window 3
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS_simS_cNP.log
(
$(
CONDA_ACTIVATE
)
base
;
./context_np.py
--context_window
3
--phr_per_sent
"longest"
-d
$(
DB
)
_Vbert_addAS_simS_cNP 2>>
$(
DB
)
_Vbert_addAS_simS_cNP.log
)
printf
"Contex NP phrases context_window 2
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS_simS_cNP.log
(
$(
CONDA_ACTIVATE
)
base
;
./context_np.py
--context_window
2
--phr_per_sent
"longest"
-d
$(
DB
)
_Vbert_addAS_simS_cNP 2>>
$(
DB
)
_Vbert_addAS_simS_cNP.log
)
# Context Previous sentences
printf
"Context previous sentece 1
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
base
;
./context_previous_senteces.py
-d
$(
UPDB
)
--number
1 2>>
$(
UPDB
)
.log
)
printf
"Context previous sentece 2
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
base
;
./context_previous_senteces.py
-d
$(
UPDB
)
--number
2 2>>
$(
UPDB
)
.log
)
./make_copy.sh
$(
DB
)
_Vbert_addAS_simS_cNP
$(
DB
)
_Vbert_addAS_simS_cNP_cPS
printf
"Context previous sentece 1
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS.log
(
$(
CONDA_ACTIVATE
)
base
;
./context_previous_senteces.py
--number
1
-d
$(
DB
)
_Vbert_addAS_simS_cNP_cPS 2>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS.log
)
printf
"Context previous sentece 2
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS.log
(
$(
CONDA_ACTIVATE
)
base
;
./context_previous_senteces.py
--number
2
-d
$(
DB
)
_Vbert_addAS_simS_cNP_cPS 2>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS.log
)
# Context NER
printf
"Context wiki entity context_window 5
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
mypy3
;
python ./context_ner.py
--context_window
5
-d
$(
UPDB
)
-m
named_entity_recognition/BERT-NER/ner_model_cz/ 2>>
$(
UPDB
)
.log
)
printf
"Context wiki entity context_window 2
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
mypy3
;
python ./context_ner.py
--context_window
2
-d
$(
UPDB
)
-m
named_entity_recognition/BERT-NER/ner_model_cz/ 2>>
$(
UPDB
)
.log
)
./make_copy.sh
$(
DB
)
_Vbert_addAS_simS_cNP_cPS
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER
printf
"Context wiki entity context_window 5
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER.log
(
$(
CONDA_ACTIVATE
)
mypy3
;
python ./context_ner.py
--context_window
5
-m
named_entity_recognition/BERT-NER/ner_model_cz/
-d
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER 2>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER.log
)
printf
"Context wiki entity context_window 2
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER.log
(
$(
CONDA_ACTIVATE
)
mypy3
;
python ./context_ner.py
--context_window
2
-m
named_entity_recognition/BERT-NER/ner_model_cz/
-d
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER 2>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER.log
)
# Sentence Bert
printf
"Sentece to sentece bert embedding
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
base
;
python ./sentece2s_bert.py
-d
$(
UPDB
)
2>>
$(
UPDB
)
.log
)
./make_copy.sh
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER_sBert
printf
"Sentece to sentece bert embedding
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER_sBert.log
(
$(
CONDA_ACTIVATE
)
base
;
python ./sentece2s_bert.py
-d
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER_sBert 2>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER_sBert.log
)
# CLS Bert
printf
"Sentece to cls bert embedding
\n
======================
\n
"
>>
$(
UPDB
)
.log
(
$(
CONDA_ACTIVATE
)
bert
;
python ./sentece2cls_bert.py
-d
$(
UPDB
)
2>>
$(
UPDB
)
.log
)
./make_copy.sh
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER_sBert
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert
printf
"Sentece to cls bert embedding
\n
======================
\n
"
>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log
(
$(
CONDA_ACTIVATE
)
bert
;
python ./sentece2cls_bert.py
-d
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert 2>>
$(
DB
)
_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log
)
echo
"
$(
hostname
)
"
| mail
-s
"Done AQA job"
"xmedved1@fi.muni.cz"
run_ZODB_server
:
...
...
This diff is collapsed.
Click to expand it.
add_bert_emberdings.py
+
2
−
1
View file @
bcf2beb9
...
...
@@ -22,7 +22,8 @@ class Bert_Embeddings:
def
word2embedding
(
self
,
word
):
input_ids
=
self
.
tokenizer
.
encode
([
"
[CLS]
"
,
word
],
return_tensors
=
"
pt
"
,
add_special_tokens
=
True
)
output
=
self
.
model
(
input_ids
)
return
output
[
0
][
0
][
0
]
return
output
[
0
][
0
][
0
].
detach
().
numpy
()
def
add_bert_word_embeddings_word
(
vocabulary
,
model
,
db
):
...
...
This diff is collapsed.
Click to expand it.
query_database.py
+
16
−
5
View file @
bcf2beb9
...
...
@@ -52,7 +52,9 @@ def get_content_ctx(url, kb, vocabulary, part='', context_type='', preloaded=Fal
for
sentence
in
kb
.
url2doc
.
get
(
url
)[
'
text
'
]:
result
.
append
({
'
sent
'
:
get_senence
(
sentence
[
'
sent
'
],
vocabulary
,
part
=
part
,
preloaded
=
preloaded
),
'
ctx
'
:
get_ctx
(
sentence
[
'
ctx
'
],
vocabulary
,
part
=
part
,
context_type
=
context_type
,
preloaded
=
preloaded
)})
preloaded
=
preloaded
),
'
sbert
'
:
sentence
[
'
sbert
'
],
'
cls_bert
'
:
sentence
[
'
cls_bert
'
]})
return
result
...
...
@@ -87,8 +89,11 @@ def get_record(db, record_id, word_parts='', context_type='', vocabulary=None, q
preloaded
=
preloaded
)
data
[
'
text
'
]
=
get_content_ctx
(
record
.
text
,
kb
,
vocabulary
,
part
=
word_parts
,
context_type
=
context_type
,
preloaded
=
preloaded
)
data
[
'
contain_answer
'
]
=
len
(
record
.
similar_answers
[
"
sents_containing_ans_ext
"
])
data
[
'
not_contain_answer
'
]
=
len
(
data
[
'
text
'
])
-
len
(
record
.
similar_answers
[
"
sents_containing_ans_ext
"
])
try
:
data
[
'
contain_answer
'
]
=
len
(
record
.
similar_answers
[
"
sents_containing_ans_ext
"
])
data
[
'
not_contain_answer
'
]
=
len
(
data
[
'
text
'
])
-
len
(
record
.
similar_answers
[
"
sents_containing_ans_ext
"
])
except
KeyError
:
sys
.
stderr
.
write
(
'
No sents_containing_ans_ext
\n
'
)
return
data
...
...
@@ -142,9 +147,15 @@ def print_record(db, record_id, context_type=''):
context_previous_senteces
.
print_ctx
(
phrs
)
print
(
'
No. text sentences that contain answer
'
)
print
(
f
'
\t
{
len
(
record
.
similar_answers
[
"
sents_containing_ans_ext
"
])
}
'
)
try
:
print
(
f
'
\t
{
len
(
record
.
similar_answers
[
"
sents_containing_ans_ext
"
])
}
'
)
except
KeyError
:
print
(
'
\t
No info
'
)
print
(
'
No. text sentences that does NOT contain answer
'
)
print
(
f
'
\t
{
text_sents_total
-
len
(
record
.
similar_answers
[
"
sents_containing_ans_ext
"
])
}
'
)
try
:
print
(
f
'
\t
{
text_sents_total
-
len
(
record
.
similar_answers
[
"
sents_containing_ans_ext
"
])
}
'
)
except
KeyError
:
print
(
'
\t
No info
'
)
def
main
():
...
...
This diff is collapsed.
Click to expand it.
sqad_db.py
+
8
−
2
View file @
bcf2beb9
...
...
@@ -43,7 +43,10 @@ def id2word(vocabulary, key, parts='', preloaded=False):
if
'
v500
'
in
word_parts
or
not
parts
:
result
[
'
v500
'
]
=
vocabulary
[
'
vectors
'
][
key
][
'
v500
'
]
if
'
v_bert
'
in
word_parts
or
not
parts
:
result
[
'
v_bert
'
]
=
vocabulary
[
'
vectors
'
][
key
][
'
v_bert
'
]
try
:
result
[
'
v_bert
'
]
=
vocabulary
[
'
vectors
'
][
key
][
'
v_bert
'
]
except
KeyError
:
sys
.
stderr
.
write
(
f
'
ERROR: not
"
v_bert
"
for:
{
vocabulary
[
"
id2wlt
"
][
key
][
"
word
"
]
}
\n
'
)
if
'
id
'
in
word_parts
or
not
parts
:
result
[
'
id
'
]
=
key
else
:
...
...
@@ -60,7 +63,10 @@ def id2word(vocabulary, key, parts='', preloaded=False):
if
'
v500
'
in
word_parts
or
not
parts
:
result
[
'
v500
'
]
=
vocabulary
.
vectors
[
key
][
'
v500
'
]
if
'
v_bert
'
in
word_parts
or
not
parts
:
result
[
'
v_bert
'
]
=
vocabulary
.
vectors
[
key
][
'
v_bert
'
]
try
:
result
[
'
v_bert
'
]
=
vocabulary
.
vectors
[
key
][
'
v_bert
'
]
except
KeyError
:
sys
.
stderr
.
write
(
f
'
ERROR: not
"
v_bert
"
for:
{
vocabulary
.
id2wlt
[
key
][
"
word
"
]
}
\n
'
)
if
'
id
'
in
word_parts
or
not
parts
:
result
[
'
id
'
]
=
key
return
result
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment