README.md +4 −4

@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks.
 #### Using the `train` subset to train your supervised system
 
 ``` sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.15
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.16
 $ python
 >>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
 >>>
@@ -65,7 +65,7 @@ Here is the documentation of the available evaluation functions:
 #### Using the `validation` subset to compare various parameters of your system
 
 ``` sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.15
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.16
 $ python
 >>> from arqmath_eval import get_topics, get_judged_documents
 >>>
@@ -96,8 +96,8 @@ $ git push # publish your new result and the upd
 #### Using the `all` subset to compute the NDCG' score of an ARQMath submission
 
 ``` sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.15
-$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.16
+$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
 0.238
 ```
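The updated examples pin the install to the new 0.0.16 release and pass an explicit subset name to the evaluator. As a rough illustration of the `get_ndcg` call these examples build up to, here is a minimal sketch in the README's own REPL style, assuming the `get_ndcg(parsed_result, task, subset)` signature shown in the `scripts/evaluate.py` diff below; the topic and answer identifiers are hypothetical placeholders:

``` sh
$ python
>>> from arqmath_eval import get_ndcg
>>> # Hypothetical ranked results for a single topic, keyed as
>>> # {topic_id: {result_id: score}}; scores follow the 1.0 / rank
>>> # convention used by scripts/evaluate.py.
>>> parsed_result = {'A.1': {'123456': 1.0, '654321': 0.5}}
>>> ndcg = get_ndcg(parsed_result, 'task1', 'train')  # score against the train subset
>>> print('%.3f' % ndcg)
```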
scripts/evaluate.py +6 −5

@@ -63,7 +63,7 @@ def produce_leaderboards():
         f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name))
 
 
-def evaluate_run(filename):
+def evaluate_run(filename, subset):
     with open(filename, 'rt') as f:
         lines = [line.strip().split() for line in f]
     first_line = lines[0]
@@ -84,7 +84,7 @@ def evaluate_run(filename):
         if topic_id not in parsed_result:
             parsed_result[topic_id] = dict()
         parsed_result[topic_id][result_id] = 1.0 / int(rank)
 
-    ndcg = get_ndcg(parsed_result, task, 'all')
+    ndcg = get_ndcg(parsed_result, task, subset)
     print('%.3f' % ndcg)
@@ -92,7 +92,8 @@ if __name__ == '__main__':
     if len(sys.argv) == 1:
         produce_leaderboards()
     elif len(sys.argv) == 2:
-        filename = sys.argv[1]
-        evaluate_run(filename)
+        evaluate_run(sys.argv[1], 'all')
+    elif len(sys.argv) == 3:
+        evaluate_run(sys.argv[1], sys.argv[2])
     else:
-        raise ValueError("Expected either zero (produce leaderboards) or one (produce NDCG' score for a file with task 1 or 2 result) arguments")
+        raise ValueError("Usage: {} [TSV_FILE [SUBSET]]".format(sys.argv[0]))
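With the new optional positional argument, the command-line evaluator can score a run against any judged subset instead of only `all`, which remains the default when the argument is omitted. A usage sketch (the result file comes from the README example above; `validation` is one of the subset names described in the README's section headings):

``` sh
$ # No subset given: scores against the 'all' subset, as before.
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv
$ # Explicit subset: score the same run against the validation judgements.
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv validation
```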
setup.py +1 −1

@@ -5,7 +5,7 @@ from setuptools import setup
 
 setup(
     name='arqmath_eval',
-    version='0.0.15',
+    version='0.0.16',
     description='Evaluation of ARQMath systems',
     packages=['arqmath_eval'],
     package_dir={'arqmath_eval': 'scripts'},