README.md

all relevance judgements. Use these to evaluate a system that has not been
trained using subsets of the `task1` and `task2` tasks.

### Examples

#### Using the `train` subset to train your supervised system

``` sh
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>>
```

Here is the documentation of the available evaluation functions:

- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
#### Using the `validation` set to compare various parameters of your system #### Using the `validation` subset to compare various parameters of your system ``` sh ``` sh $ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.8 $ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13 $ python $ python >>> from arqmath_eval import get_topics, get_judged_documents >>> from arqmath_eval import get_topics, get_judged_documents >>> >>> Loading Loading @@ -92,7 +92,12 @@ $ git add -u # add the updated leaderboard to Git $ git push # publish your new result and the updated leaderboard $ git push # publish your new result and the updated leaderboard ``` ``` #### Using the `all` subset to compute the NDCG' score of an ARQMath submission ``` sh ``` sh $ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13 $ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv 0.238 ``` ``` [arqmath-task1]: https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers) [arqmath-task1]: https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers) Loading scripts/evaluate.py +37 −1 Original line number Original line Diff line number Diff line Loading @@ -4,6 +4,7 @@ from glob import glob from multiprocessing import Pool from multiprocessing import Pool import os.path import os.path import re import re import sys from pytrec_eval import parse_run from pytrec_eval import parse_run from tqdm import tqdm from tqdm import tqdm Loading @@ -20,7 +21,7 @@ def evaluate_worker(result_filename): return (result_name, ndcg) return (result_name, ndcg) if __name__ == '__main__': def produce_leaderboards(): for task in TASKS: for task in TASKS: if not os.path.exists(task): if not os.path.exists(task): continue continue Loading Loading @@ -60,3 +61,38 @@ if __name__ == '__main__': f_readme.write('| *%.4f* | *%s* | *%s* |\n' % (ndcg, result_name, user_name)) 
def evaluate_run(filename):
    """Compute and print the NDCG' score of an ARQMath run file.

    The task is detected from the number of whitespace-separated columns on
    the first non-blank line: 5-tuples (Query_Id, Post_Id, Rank, Score,
    Run_Number) are task 1, 6-tuples (Query_Id, Formula_Id, Post_Id, Rank,
    Score, Run_Number) are task 2.

    Parameters
    ----------
    filename : str
        Path of a TSV run file for ARQMath task 1 or task 2.

    Raises
    ------
    ValueError
        When the file contains no results, or when its lines are neither
        5-tuples nor 6-tuples.
    """
    with open(filename, 'rt') as f:
        # Skip blank lines so that a trailing newline does not break the
        # tuple unpacking below.
        lines = [columns for columns in (line.strip().split() for line in f) if columns]
    if not lines:
        # Without this guard, an empty run file would die with an unhelpful
        # IndexError on lines[0].
        raise ValueError('Run file %s contains no results' % filename)
    first_line = lines[0]
    n = len(first_line)
    if n == 5:
        task = 'task1'
    elif n == 6:
        task = 'task2'
    else:
        raise ValueError(
            'Expected lines as 5-tuples (Query_Id, Post_Id, Rank, Score, Run_Number) for task 1, '
            'or 6-tuples (Query_Id, Formula_Id, Post_Id, Rank, Score, Run_Number) for task 2, '
            'received %d-tuples: %s' % (n, first_line)
        )
    parsed_result = dict()
    for line in lines:
        # Score results by reciprocal rank; the first column is the topic,
        # the second the result (Post_Id for task 1, Formula_Id for task 2).
        topic_id, result_id, *_, rank, __, ___ = line
        if topic_id not in parsed_result:
            parsed_result[topic_id] = dict()
        parsed_result[topic_id][result_id] = 1.0 / int(rank)
    ndcg = get_ndcg(parsed_result, task, 'all')
    print('%.3f' % ndcg)


if __name__ == '__main__':
    if len(sys.argv) == 1:
        produce_leaderboards()
    elif len(sys.argv) == 2:
        filename = sys.argv[1]
        evaluate_run(filename)
    else:
        raise ValueError(
            "Expected either zero (produce leaderboards) or one "
            "(produce NDCG' score for a file with task 1 or 2 result) arguments"
        )
README.md

all relevance judgements. Use these to evaluate a system that has not been
trained using subsets of the `task1` and `task2` tasks.

### Examples

#### Using the `train` subset to train your supervised system

``` sh
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>>
```

Here is the documentation of the available evaluation functions:

- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].

#### Using the `validation` subset to compare various parameters of your system

``` sh
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
$ python
>>> from arqmath_eval import get_topics, get_judged_documents
>>>
```

``` sh
$ git add -u  # add the updated leaderboard to Git
$ git push    # publish your new result and the updated leaderboard
```

#### Using the `all` subset to compute the NDCG' score of an ARQMath submission

``` sh
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.13
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv
0.238
```

[arqmath-task1]: https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
def evaluate_run(filename):
    """Print the NDCG' score of an ARQMath task 1 or task 2 run file.

    The task is inferred from the column count of the first line: five
    columns (Query_Id, Post_Id, Rank, Score, Run_Number) mean task 1, six
    columns (Query_Id, Formula_Id, Post_Id, Rank, Score, Run_Number) mean
    task 2.
    """
    with open(filename, 'rt') as f:
        records = [record.strip().split() for record in f]
    # Dispatch on the width of the first record to pick the task.
    tasks_by_width = {5: 'task1', 6: 'task2'}
    num_columns = len(records[0])
    if num_columns not in tasks_by_width:
        raise ValueError(
            'Expected lines as 5-tuples (Query_Id, Post_Id, Rank, Score, Run_Number) for task 1, '
            'or 6-tuples (Query_Id, Formula_Id, Post_Id, Rank, Score, Run_Number) for task 2, '
            'received %d-tuples: %s' % (num_columns, records[0])
        )
    task = tasks_by_width[num_columns]
    parsed_result = {}
    for record in records:
        # The first two columns identify the topic and the result; the
        # third-from-last column is the rank, used as a reciprocal score.
        topic_id, result_id, *_middle, rank, _score, _run_number = record
        parsed_result.setdefault(topic_id, {})[result_id] = 1.0 / int(rank)
    ndcg = get_ndcg(parsed_result, task, 'all')
    print('%.3f' % ndcg)


if __name__ == '__main__':
    argument_count = len(sys.argv)
    if argument_count == 1:
        produce_leaderboards()
    elif argument_count == 2:
        evaluate_run(sys.argv[1])
    else:
        raise ValueError("Expected either zero (produce leaderboards) or one (produce NDCG' score for a file with task 1 or 2 result) arguments")