Commit de8752c0 authored by Vít Novotný's avatar Vít Novotný
Browse files

Support computing NDCG' score of an ARQMath submission from the CLI

parent 9273b3ea
Loading
Loading
Loading
Loading
Loading
+9 −4
Original line number Original line Diff line number Diff line
@@ -33,10 +33,10 @@ all relevance judgements. Use these to evaluate a system that has not been
trained using subsets of the `task1` and `task2` tasks.
trained using subsets of the `task1` and `task2` tasks.


### Examples
### Examples
#### Using the `train` set to train your supervised system
#### Using the `train` subset to train your supervised system


``` sh
``` sh
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.8
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
$ python
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>>
>>>
@@ -61,10 +61,10 @@ Here is the documentation of the available evaluation functions:
- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].


#### Using the `validation` set to compare various parameters of your system
#### Using the `validation` subset to compare various parameters of your system


``` sh
``` sh
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.8
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
$ python
$ python
>>> from arqmath_eval import get_topics, get_judged_documents
>>> from arqmath_eval import get_topics, get_judged_documents
>>>
>>>
@@ -92,7 +92,12 @@ $ git add -u # add the updated leaderboard to Git
$ git push                                 # publish your new result and the updated leaderboard
$ git push                                 # publish your new result and the updated leaderboard
```
```


#### Using the `all` subset to compute the NDCG' score of an ARQMath submission

``` sh
``` sh
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv
0.238
```
```


 [arqmath-task1]:              https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
 [arqmath-task1]:              https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
+37 −1
Original line number Original line Diff line number Diff line
@@ -4,6 +4,7 @@ from glob import glob
from multiprocessing import Pool
from multiprocessing import Pool
import os.path
import os.path
import re
import re
import sys


from pytrec_eval import parse_run
from pytrec_eval import parse_run
from tqdm import tqdm
from tqdm import tqdm
@@ -20,7 +21,7 @@ def evaluate_worker(result_filename):
    return (result_name, ndcg)
    return (result_name, ndcg)




if __name__ == '__main__':
def produce_leaderboards():
    for task in TASKS:
    for task in TASKS:
        if not os.path.exists(task):
        if not os.path.exists(task):
            continue
            continue
@@ -60,3 +61,38 @@ if __name__ == '__main__':
                    f_readme.write('| *%.4f* | *%s* | *%s* |\n' % (ndcg, result_name, user_name))
                    f_readme.write('| *%.4f* | *%s* | *%s* |\n' % (ndcg, result_name, user_name))
                else:
                else:
                    f_readme.write('|  %.4f  |  %s  |  %s  |\n' % (ndcg, result_name, user_name))
                    f_readme.write('|  %.4f  |  %s  |  %s  |\n' % (ndcg, result_name, user_name))


def evaluate_run(filename):
    """Compute and print the NDCG' score of an ARQMath run file on the `all` subset.

    The task is auto-detected from the number of whitespace-separated
    columns per line: 5-tuples (Query_Id, Post_Id, Rank, Score, Run_Number)
    are a task 1 run, 6-tuples (Query_Id, Formula_Id, Post_Id, Rank, Score,
    Run_Number) are a task 2 run.

    Parameters
    ----------
    filename : str
        Path of the TSV run file to evaluate.

    Raises
    ------
    ValueError
        If the file contains no lines, or if its lines are neither
        5-tuples nor 6-tuples.
    """
    with open(filename, 'rt') as f:
        # Skip blank lines so that a trailing newline does not produce an
        # empty tuple and crash the unpacking below.
        lines = [line.split() for line in f if line.strip()]
    if not lines:
        raise ValueError('Expected a non-empty run file, received an empty file: %s' % filename)
    first_line = lines[0]
    n = len(first_line)
    if n == 5:
        task = 'task1'
    elif n == 6:
        task = 'task2'
    else:
        raise ValueError(
            'Expected lines as 5-tuples (Query_Id, Post_Id, Rank, Score, Run_Number) for task 1, '
            'or 6-tuples (Query_Id, Formula_Id, Post_Id, Rank, Score, Run_Number) for task 2, '
            'received %d-tuples: %s' % (n, first_line)
        )
    parsed_result = dict()
    for line in lines:
        # The judged result identifier is the second column (Post_Id for
        # task 1, Formula_Id for task 2); rank is third from the end.
        topic_id, result_id, *_, rank, __, ___ = line
        # Score results by reciprocal rank rather than the raw Score column,
        # so runs with incomparable score scales are evaluated consistently.
        parsed_result.setdefault(topic_id, dict())
        parsed_result[topic_id][result_id] = 1.0 / int(rank)
    ndcg = get_ndcg(parsed_result, task, 'all')
    print('%.3f' % ndcg)


if __name__ == '__main__':
    # Zero CLI arguments: regenerate the leaderboards.
    # One CLI argument: evaluate the given run file and print its NDCG' score.
    argument_count = len(sys.argv) - 1
    if argument_count == 0:
        produce_leaderboards()
    elif argument_count == 1:
        evaluate_run(sys.argv[1])
    else:
        raise ValueError("Expected either zero (produce leaderboards) or one (produce NDCG' score for a file with task 1 or 2 result) arguments")