Commit de8752c0 authored by Vít Novotný

Support computing NDCG' score of an ARQMath submission from the CLI

parent 9273b3ea
Pipeline #62558 failed
@@ -33,10 +33,10 @@ all relevance judgements. Use these to evaluate a system that has not been
 trained using subsets of the `task1` and `task2` tasks.
 
 ### Examples
 
-#### Using the `train` set to train your supervised system
+#### Using the `train` subset to train your supervised system
 ``` sh
-$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.8
+$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
 $ python
 >>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
 >>>
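The REPL session above is cut off by the diff. Below is a minimal sketch of how such a training session might continue; the keyword parameters of `get_topics` and `get_judged_documents` and the `my_system_score` scorer are assumptions, while the run format (a dict mapping topic IDs to dicts of document scores) and the positional `get_ndcg` call follow the `evaluate_run` code later in this commit.

``` python
# A sketch only: get_topics/get_judged_documents parameter names are assumptions.
run = {}
for topic_id in get_topics(task='task1', subset='train'):
    run[topic_id] = {}
    for document_id in get_judged_documents(task='task1', subset='train', topic=topic_id):
        # my_system_score is a hypothetical stand-in for your system's similarity score.
        run[topic_id][document_id] = my_system_score(topic_id, document_id)
ndcg = get_ndcg(run, 'task1', 'train')  # a float; higher is better
```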
@@ -61,10 +61,10 @@ Here is the documentation of the available evaluation functions:
 - [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
 - [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
 
-#### Using the `validation` set to compare various parameters of your system
+#### Using the `validation` subset to compare various parameters of your system
 ``` sh
-$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.8
+$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
 $ python
 >>> from arqmath_eval import get_topics, get_judged_documents
 >>>
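This example is likewise truncated in the diff. The point of the `validation` subset is model selection, so a hedged sketch of picking the best-scoring parameter setting might look as follows; `configurations` and `build_run` are hypothetical stand-ins for your system.

``` python
# Sketch: choose the configuration with the best validation NDCG'.
best_config = max(
    configurations,
    key=lambda config: get_ndcg(build_run(config), 'task1', 'validation'))
```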
@@ -92,7 +92,12 @@ $ git add -u # add the updated leaderboard to Git
 $ git push # publish your new result and the updated leaderboard
 ```
 
+#### Using the `all` subset to compute the NDCG' score of an ARQMath submission
+``` sh
+$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
+$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv
+0.238
+```
 
 [arqmath-task1]: https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
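As the `evaluate_run` code below shows, the submission is a whitespace-separated run file: 5 columns (Query_Id, Post_Id, Rank, Score, Run_Number) for task 1, or 6 columns (Query_Id, Formula_Id, Post_Id, Rank, Score, Run_Number) for task 2. The evaluator autodetects the task from the column count and scores each hit by its reciprocal rank, ignoring the Score column. A few made-up task 1 lines for illustration:

```
A.1 1234 1 15.7 MIRMU-task1-Ensemble-auto-both-A
A.1 5678 2 12.3 MIRMU-task1-Ensemble-auto-both-A
A.2 4242 1 18.1 MIRMU-task1-Ensemble-auto-both-A
```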
@@ -4,6 +4,7 @@ from glob import glob
 from multiprocessing import Pool
 import os.path
 import re
+import sys
 
 from pytrec_eval import parse_run
 from tqdm import tqdm
@@ -20,7 +21,7 @@ def evaluate_worker(result_filename):
     return (result_name, ndcg)
 
 
-if __name__ == '__main__':
+def produce_leaderboards():
     for task in TASKS:
         if not os.path.exists(task):
             continue
@@ -60,3 +61,38 @@ if __name__ == '__main__':
             f_readme.write('| *%.4f* | *%s* | *%s* |\n' % (ndcg, result_name, user_name))
         else:
             f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name))
+
+
+def evaluate_run(filename):
+    with open(filename, 'rt') as f:
+        lines = [line.strip().split() for line in f]
+    first_line = lines[0]
+    n = len(first_line)
+    if n == 5:
+        task = 'task1'
+    elif n == 6:
+        task = 'task2'
+    else:
+        raise ValueError(
+            'Expected lines as 5-tuples (Query_Id, Post_Id, Rank, Score, Run_Number) for task 1, '
+            'or 6-tuples (Query_Id, Formula_Id, Post_Id, Rank, Score, Run_Number) for task 2, '
+            'received %d-tuples: %s' % (n, first_line)
+        )
+    parsed_result = dict()
+    for line in lines:
+        topic_id, result_id, *_, rank, __, ___ = line
+        if topic_id not in parsed_result:
+            parsed_result[topic_id] = dict()
+        parsed_result[topic_id][result_id] = 1.0 / int(rank)
+    ndcg = get_ndcg(parsed_result, task, 'all')
+    print('%.3f' % ndcg)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 1:
+        produce_leaderboards()
+    elif len(sys.argv) == 2:
+        filename = sys.argv[1]
+        evaluate_run(filename)
+    else:
+        raise ValueError("Expected either zero arguments (produce the leaderboards) or one argument (a task 1 or task 2 result file whose NDCG' score will be printed)")