README.md (+11 −11)

````diff
@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks.
 #### Using the `train` subset to train your supervised system
 
 ``` sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
 $ python
 >>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
 >>>
@@ -58,14 +58,14 @@ Here is the documentation of the available evaluation functions:
 - [`get_topics(task, subset=None)`][get_topics],
 - [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
 - [`get_random_ndcg(task, subset, topn)`][get_random_ndcg],
-- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
+- [`get_ndcg(parsed_run, task, subset, topn, confidence)`][get_ndcg], and
 - [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
 - [`get_judgement(task, subset, topic, judged_document)`][get_judgement].
 
 #### Using the `validation` subset to compare various parameters of your system
 
 ``` sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
 $ python
 >>> from arqmath_eval import get_topics, get_judged_documents
 >>>
@@ -96,19 +96,19 @@ $ git push # publish your new result and the upd
 #### Using the `all` subset to compute the NDCG' score of an ARQMath submission
 
 ``` sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
 $ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
-0.238
+0.238, 95% CI: [0.198; 0.278]
 ```
 
 [arqmath-task1]: https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
 [arqmath-task2]: https://www.cs.rit.edu/~dprl/ARQMath/task2-formulas.html (Task 2: Formula Search)
-[get_judged_documents]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L61
-[get_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L94
-[get_random_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L129
-[get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L174
-[get_judgement]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L213
-[get_topics]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L34
+[get_topics]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L35
+[get_judged_documents]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L62
+[get_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L95
+[get_random_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L140
+[get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L185
+[get_judgement]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L224
 [ntcir-11-math-2]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.686.444&rep=rep1&type=pdf (NTCIR-11 Math-2 Task Overview)
 [ntcir-12-mathir]: https://www.cs.rit.edu/~rlaz/files/ntcir12-mathir.pdf (NTCIR-12 MathIR Task Overview)
 [treceval-format]: https://stackoverflow.com/a/8175382/657401 (How to evaluate a search/retrieval engine using trec_eval?)
````
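The diff view truncates the README's interactive examples, so here is a hedged sketch of what the new `confidence` parameter changes for callers. The run dictionary, topic ID, and document ID are hypothetical, and the printed numbers are only illustrative; the return shape, a bare float without `confidence` and an `(ndcg, (lower, upper))` pair with it, follows from the patched `get_ndcg` shown next.

``` python
>>> from arqmath_eval import get_ndcg
>>> # Hypothetical run: topic -> judged document -> retrieval score
>>> parsed_run = {'A.1': {'doc-123': 0.9}, 'A.2': {'doc-456': 0.7}}
>>> get_ndcg(parsed_run, 'task1', 'train')  # as before: a single NDCG' float
0.238
>>> get_ndcg(parsed_run, 'task1', 'train', confidence=95.0)  # new: NDCG' plus interval
(0.238, (0.198, 0.278))
```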
scripts/common.py (+14 −3)

````diff
@@ -5,6 +5,7 @@ from itertools import chain
 from math import log2
 
 import numpy as np
+import scipy.stats as st
 
 from .configuration import EVALUATORS, PARSED_RELEVANCE_JUDGEMENTS
 
@@ -91,7 +92,7 @@ def get_judged_documents(task, subset=None, topic=None):
     return judged_documents
 
 
-def get_ndcg(parsed_run, task, subset, topn=1000):
+def get_ndcg(parsed_run, task, subset, topn=1000, confidence=None):
     """Returns the NDCG' of a system's run on a subset of a task.
 
     NDCG' is the same as NDCG (Normalized Discounted Cumulative Gain), but all
@@ -109,11 +110,16 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
     topn : int, optional
         The top N results, which will be considered in computing the NDCG.
         Default is 1000.
+    confidence : float or None, optional
+        The confidence level used to construct a confidence interval.
+        If None, then no confidence interval is constructed. Default is None.
 
     Returns
     -------
     ndcg : float
         The NDCG' of the system's run on the subset of the task.
+    interval : (float, float), optional
+        The confidence interval for the NDCG'. Only produced when confidence is not None.
     """
 
     evaluator = EVALUATORS[subset][task]
@@ -122,7 +128,12 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
     if not parsed_run:
         return 0.0
     evaluation = evaluator.evaluate(parsed_run)
-    ndcg = np.mean([measures['ndcg'] for topic, measures in evaluation.items()])
-    return ndcg
+    sample = [measures['ndcg'] for topic, measures in evaluation.items()]
+    ndcg = np.mean(sample)
+    if confidence is not None:
+        interval = st.t.interval(confidence / 100.0, len(sample) - 1, loc=ndcg, scale=st.sem(sample))
+        return (ndcg, interval)
+    else:
+        return ndcg
````
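The new interval is a Student's t confidence interval for the mean of the per-topic NDCG' scores: the point estimate is the sample mean, the scale is the standard error of the mean, and there are n - 1 degrees of freedom for n judged topics. Note that `confidence` is given in percent (hence the division by 100.0), and that the early `return 0.0` for an empty run still yields a bare float even when `confidence` is set, so callers that unpack two values should guard against empty runs. Below is a standalone sketch of the same computation with hypothetical per-topic scores; the scipy calls mirror the patch.

``` python
import numpy as np
import scipy.stats as st

sample = [0.21, 0.35, 0.18, 0.27, 0.30]  # hypothetical per-topic NDCG' scores
ndcg = np.mean(sample)                   # point estimate: mean over topics
confidence = 95.0                        # in percent, as evaluate_run passes it

# Student's t interval with n - 1 degrees of freedom, centered on the mean
# and scaled by the standard error of the mean.
interval = st.t.interval(confidence / 100.0, len(sample) - 1,
                         loc=ndcg, scale=st.sem(sample))
print('%.3f, %g%% CI: [%.3f; %.3f]' % (ndcg, confidence, *interval))
```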
scripts/evaluate.py (+3 −3)

````diff
@@ -63,7 +63,7 @@ def produce_leaderboards():
             f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name))
 
 
-def evaluate_run(filename, subset):
+def evaluate_run(filename, subset, confidence=95.0):
     with open(filename, 'rt') as f:
         lines = [line.strip().split() for line in f]
         first_line = lines[0]
@@ -88,8 +88,8 @@ def evaluate_run(filename, subset):
         if topic_id not in parsed_result:
             parsed_result[topic_id] = dict()
         parsed_result[topic_id][result_id] = 1.0 / (int(rank) + rank_offset)
-    ndcg = get_ndcg(parsed_result, task, subset)
-    print('%.3f' % ndcg)
+    ndcg, interval = get_ndcg(parsed_result, task, subset, confidence=confidence)
+    print('%.3f, %g%% CI: [%.3f; %.3f]' % (ndcg, confidence, *interval))
 
 
 if __name__ == '__main__':
````
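`evaluate_run` now always requests an interval (its `confidence` default is 95.0), which is why the CLI output in the README gains the `95% CI: [...]` suffix. Here is a sketch of driving it from Python rather than through `python -m arqmath_eval.evaluate`; the filename is the README's example submission and is assumed to sit in the working directory.

``` python
from arqmath_eval.evaluate import evaluate_run

# Prints, e.g., "0.238, 95% CI: [0.198; 0.278]" at the default level,
# and a wider interval at 99%.
evaluate_run('MIRMU-task1-Ensemble-auto-both-A.tsv', 'all')
evaluate_run('MIRMU-task1-Ensemble-auto-both-A.tsv', 'all', confidence=99.0)
```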
setup.py (+2 −1)

````diff
@@ -5,13 +5,14 @@ from setuptools import setup
 
 setup(
     name='arqmath_eval',
-    version='0.0.17',
+    version='0.0.18',
     description='Evaluation of ARQMath systems',
     packages=['arqmath_eval'],
     package_dir={'arqmath_eval': 'scripts'},
     install_requires=[
         'numpy~=1.18.2',
         'pytrec-eval~=0.4',
+        'scipy~=1.5.2',
         'tqdm~=4.46.0',
     ],
     package_data={
````
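The version bump to 0.0.18 matches the installation commands in the README, and the compatible-release specifier `scipy~=1.5.2` resolves to scipy >= 1.5.2 and < 1.6, covering the `scipy.stats` calls introduced in `common.py`. A quick post-install sanity check:

``` python
# Confirm the new dependency resolved within the pinned range (a 1.5.x release).
import scipy
print(scipy.__version__)
```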