Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Michal Štefánik
ARQMath-eval
Commits
de8752c0
Commit
de8752c0
authored
Jul 11, 2020
by
Vít Novotný
Browse files
Support computing NDCG' score of an ARQMath submission from the CLI
parent
9273b3ea
Pipeline
#62558
failed with stage
Changes
2
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
README.md
View file @
de8752c0
...
...
@@ -33,10 +33,10 @@ all relevance judgements. Use these to evaluate a system that has not been
trained using subsets of the
`task1`
and
`task2`
tasks.
### Examples
#### Using the `train` set to train your supervised system
#### Using the `train`
sub
set to train your supervised system
```
sh
$
pip
install
--force-reinstall
git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.
8
$
pip
install
--force-reinstall
git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.
13
$
python
>>>
from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>>
...
...
@@ -61,10 +61,10 @@ Here is the documentation of the available evaluation functions:
-
[
`get_ndcg(parsed_run, task, subset, topn)`
][
get_ndcg
]
, and
-
[
`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`
][
get_random_normalized_ndcg
]
.
#### Using the `validation` set to compare various parameters of your system
#### Using the `validation`
sub
set to compare various parameters of your system
```
sh
$
pip
install
--force-reinstall
git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.
8
$
pip
install
--force-reinstall
git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.
13
$
python
>>>
from arqmath_eval import get_topics, get_judged_documents
>>>
...
...
@@ -92,7 +92,12 @@ $ git add -u # add the updated leaderboard to Git
$
git push
# publish your new result and the updated leaderboard
```
#### Using the `all` subset to compute the NDCG' score of an ARQMath submission
```
sh
$
pip
install
--force-reinstall
git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
$
python
-m
arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv
0.238
```
[
arqmath-task1
]:
https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html
(Task 1: Find Answers)
...
...
scripts/evaluate.py
View file @
de8752c0
...
...
@@ -4,6 +4,7 @@ from glob import glob
from
multiprocessing
import
Pool
import
os.path
import
re
import
sys
from
pytrec_eval
import
parse_run
from
tqdm
import
tqdm
...
...
@@ -20,7 +21,7 @@ def evaluate_worker(result_filename):
return
(
result_name
,
ndcg
)
if
__name__
==
'__main__'
:
def
produce_leaderboards
()
:
for
task
in
TASKS
:
if
not
os
.
path
.
exists
(
task
):
continue
...
...
@@ -60,3 +61,38 @@ if __name__ == '__main__':
f_readme
.
write
(
'| *%.4f* | *%s* | *%s* |
\n
'
%
(
ndcg
,
result_name
,
user_name
))
else
:
f_readme
.
write
(
'| %.4f | %s | %s |
\n
'
%
(
ndcg
,
result_name
,
user_name
))
def evaluate_run(filename):
    """Print the NDCG' score of a single ARQMath submission file.

    The task is detected from the number of whitespace-separated columns on
    the first non-empty line: 5 columns (Query_Id, Post_Id, Rank, Score,
    Run_Number) select ``'task1'`` and 6 columns (Query_Id, Formula_Id,
    Post_Id, Rank, Score, Run_Number) select ``'task2'``. Every result is
    scored as the reciprocal of its rank, keyed by the first two columns, and
    the resulting run is passed to ``get_ndcg`` with the ``'all'`` subset.
    The score is printed to standard output with three decimal places.

    :param filename: path to the submission file to evaluate.
    :raises ValueError: if the file contains no results, if the rows are not
        5-tuples or 6-tuples, if a row has a different number of columns than
        the first row, or if a rank is not a positive integer.
    """
    with open(filename, 'rt') as f:
        # Drop blank lines (e.g. trailing newlines) so that they are not
        # mistaken for malformed rows.
        lines = [fields for fields in (line.split() for line in f) if fields]
    if not lines:
        raise ValueError('Expected at least one result in %s, received none' % filename)

    first_line = lines[0]
    n = len(first_line)
    if n == 5:
        task = 'task1'
    elif n == 6:
        task = 'task2'
    else:
        raise ValueError(
            'Expected lines as 5-tuples (Query_Id, Post_Id, Rank, Score, Run_Number) for task 1, '
            'or 6-tuples (Query_Id, Formula_Id, Post_Id, Rank, Score, Run_Number) for task 2, '
            'received %d-tuples: %s' % (n, first_line)
        )

    parsed_result = dict()
    for line in lines:
        # The star-unpacking below accepts any row with five or more columns,
        # so rows of a different width would otherwise be misparsed silently.
        if len(line) != n:
            raise ValueError(
                'Expected all lines to be %d-tuples, received %d-tuple: %s' % (n, len(line), line)
            )
        topic_id, result_id, *_, rank, __, ___ = line
        rank = int(rank)
        if rank < 1:
            raise ValueError('Expected a positive rank, received %d: %s' % (rank, line))
        # Reciprocal rank: better-ranked results receive higher scores.
        parsed_result.setdefault(topic_id, dict())[result_id] = 1.0 / rank

    ndcg = get_ndcg(parsed_result, task, 'all')
    print('%.3f' % ndcg)
if __name__ == '__main__':
    # Dispatch on the number of command-line arguments after the script name:
    # none regenerates the leaderboards, one evaluates a single result file.
    argument_count = len(sys.argv) - 1
    if argument_count == 0:
        produce_leaderboards()
    elif argument_count == 1:
        evaluate_run(sys.argv[1])
    else:
        raise ValueError(
            "Expected either zero (produce leaderboards) or one (produce NDCG' score for a file with task 1 or 2 result) arguments"
        )
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment