Commit 394a85c8 authored by Vít Novotný's avatar Vít Novotný
Browse files

Support specifying subset from the CLI

parent b29cc72b
Pipeline #62901 failed with stage
......@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks.
#### Using the `train` subset to train your supervised system
``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.15
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.16
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>>
......@@ -65,7 +65,7 @@ Here is the documentation of the available evaluation functions:
#### Using the `validation` subset to compare various parameters of your system
``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.15
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.16
$ python
>>> from arqmath_eval import get_topics, get_judged_documents
>>>
......@@ -96,8 +96,8 @@ $ git push # publish your new result and the upd
#### Using the `all` subset to compute the NDCG' score of an ARQMath submission
``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.15
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.16
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
0.238
```
......
......@@ -63,7 +63,7 @@ def produce_leaderboards():
f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name))
def evaluate_run(filename):
def evaluate_run(filename, subset):
with open(filename, 'rt') as f:
lines = [line.strip().split() for line in f]
first_line = lines[0]
......@@ -84,7 +84,7 @@ def evaluate_run(filename):
if topic_id not in parsed_result:
parsed_result[topic_id] = dict()
parsed_result[topic_id][result_id] = 1.0 / int(rank)
ndcg = get_ndcg(parsed_result, task, 'all')
ndcg = get_ndcg(parsed_result, task, subset)
print('%.3f' % ndcg)
......@@ -92,7 +92,8 @@ if __name__ == '__main__':
if len(sys.argv) == 1:
produce_leaderboards()
elif len(sys.argv) == 2:
filename = sys.argv[1]
evaluate_run(filename)
evaluate_run(sys.argv[1], 'all')
elif len(sys.argv) == 3:
evaluate_run(sys.argv[1], sys.argv[2])
else:
raise ValueError("Expected either zero (produce leaderboards) or one (produce NDCG' score for a file with task 1 or 2 result) arguments")
raise ValueError("Usage: {} [TSV_FILE [SUBSET]]".format(sys.argv[0]))
......@@ -5,7 +5,7 @@ from setuptools import setup
setup(
name='arqmath_eval',
version='0.0.15',
version='0.0.16',
description='Evaluation of ARQMath systems',
packages=['arqmath_eval'],
package_dir={'arqmath_eval': 'scripts'},
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment