Commit c3aab4ef authored by Vít Novotný's avatar Vít Novotný
Browse files

Make get_ndcg produce confidence intervals

parent f777489d
Pipeline #62984 failed with stage
...@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks. ...@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks.
#### Using the `train` subset to train your supervised system #### Using the `train` subset to train your supervised system
``` sh ``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17 $ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
$ python $ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg >>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>> >>>
...@@ -58,14 +58,14 @@ Here is the documentation of the available evaluation functions: ...@@ -58,14 +58,14 @@ Here is the documentation of the available evaluation functions:
- [`get_topics(task, subset=None)`][get_topics], - [`get_topics(task, subset=None)`][get_topics],
- [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents], - [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
- [`get_random_ndcg(task, subset, topn)`][get_random_ndcg], - [`get_random_ndcg(task, subset, topn)`][get_random_ndcg],
- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and - [`get_ndcg(parsed_run, task, subset, topn, confidence)`][get_ndcg], and
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg]. - [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
- [`get_judgement(task, subset, topic, judged_document)`][get_judgement]. - [`get_judgement(task, subset, topic, judged_document)`][get_judgement].
#### Using the `validation` subset to compare various parameters of your system #### Using the `validation` subset to compare various parameters of your system
``` sh ``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17 $ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
$ python $ python
>>> from arqmath_eval import get_topics, get_judged_documents >>> from arqmath_eval import get_topics, get_judged_documents
>>> >>>
...@@ -96,19 +96,19 @@ $ git push # publish your new result and the upd ...@@ -96,19 +96,19 @@ $ git push # publish your new result and the upd
#### Using the `all` subset to compute the NDCG' score of an ARQMath submission #### Using the `all` subset to compute the NDCG' score of an ARQMath submission
``` sh ``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17 $ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all $ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
0.238 0.238, 95% CI: [0.198; 0.278]
``` ```
[arqmath-task1]: https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers) [arqmath-task1]: https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
[arqmath-task2]: https://www.cs.rit.edu/~dprl/ARQMath/task2-formulas.html (Task 2: Formula Search) [arqmath-task2]: https://www.cs.rit.edu/~dprl/ARQMath/task2-formulas.html (Task 2: Formula Search)
[get_judged_documents]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L61 [get_topics]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L35
[get_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L94 [get_judged_documents]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L62
[get_random_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L129 [get_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L95
[get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L174 [get_random_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L140
[get_judgement]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L213 [get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L185
[get_topics]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L34 [get_judgement]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L224
[ntcir-11-math-2]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.686.444&rep=rep1&type=pdf (NTCIR-11 Math-2 Task Overview) [ntcir-11-math-2]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.686.444&rep=rep1&type=pdf (NTCIR-11 Math-2 Task Overview)
[ntcir-12-mathir]: https://www.cs.rit.edu/~rlaz/files/ntcir12-mathir.pdf (NTCIR-12 MathIR Task Overview) [ntcir-12-mathir]: https://www.cs.rit.edu/~rlaz/files/ntcir12-mathir.pdf (NTCIR-12 MathIR Task Overview)
[treceval-format]: https://stackoverflow.com/a/8175382/657401 (How to evaluate a search/retrieval engine using trec_eval?) [treceval-format]: https://stackoverflow.com/a/8175382/657401 (How to evaluate a search/retrieval engine using trec_eval?)
......
...@@ -5,6 +5,7 @@ from itertools import chain ...@@ -5,6 +5,7 @@ from itertools import chain
from math import log2 from math import log2
import numpy as np import numpy as np
import scipy.stats as st
from .configuration import EVALUATORS, PARSED_RELEVANCE_JUDGEMENTS from .configuration import EVALUATORS, PARSED_RELEVANCE_JUDGEMENTS
...@@ -91,7 +92,7 @@ def get_judged_documents(task, subset=None, topic=None): ...@@ -91,7 +92,7 @@ def get_judged_documents(task, subset=None, topic=None):
return judged_documents return judged_documents
def get_ndcg(parsed_run, task, subset, topn=1000): def get_ndcg(parsed_run, task, subset, topn=1000, confidence=None):
"""Returns the NDCG' of a system's run on a subset of a task. """Returns the NDCG' of a system's run on a subset of a task.
NDCG' is the same as NDCG (Normalized Discounted Cumulative Gain), but all NDCG' is the same as NDCG (Normalized Discounted Cumulative Gain), but all
...@@ -109,11 +110,16 @@ def get_ndcg(parsed_run, task, subset, topn=1000): ...@@ -109,11 +110,16 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
topn : int, optional topn : int, optional
The top N results, which will be considered in computing the NDCG. The top N results, which will be considered in computing the NDCG.
Default is 1000. Default is 1000.
confidence : float or None, optional
The confidence level used to construct a confidence interval.
If None, then no confidence interval is constructed. Default is None.
Returns Returns
------- -------
ndcg : float ndcg : float
The NDCG' of the system's run on the subset of the task. The NDCG' of the system's run on the subset of the task.
interval : (float, float), optional
The confidence interval for the NDCG'. Only produced when confidence is not None.
""" """
evaluator = EVALUATORS[subset][task] evaluator = EVALUATORS[subset][task]
...@@ -122,8 +128,13 @@ def get_ndcg(parsed_run, task, subset, topn=1000): ...@@ -122,8 +128,13 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
if not parsed_run: if not parsed_run:
return 0.0 return 0.0
evaluation = evaluator.evaluate(parsed_run) evaluation = evaluator.evaluate(parsed_run)
ndcg = np.mean([measures['ndcg'] for topic, measures in evaluation.items()]) sample = [measures['ndcg'] for topic, measures in evaluation.items()]
return ndcg ndcg = np.mean(sample)
if confidence is not None:
interval = st.t.interval(confidence / 100.0, len(sample) - 1, loc=ndcg, scale=st.sem(sample))
return (ndcg, interval)
else:
return ndcg
def get_random_ndcg(task, subset, topn=1000): def get_random_ndcg(task, subset, topn=1000):
......
...@@ -63,7 +63,7 @@ def produce_leaderboards(): ...@@ -63,7 +63,7 @@ def produce_leaderboards():
f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name)) f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name))
def evaluate_run(filename, subset): def evaluate_run(filename, subset, confidence=95.0):
with open(filename, 'rt') as f: with open(filename, 'rt') as f:
lines = [line.strip().split() for line in f] lines = [line.strip().split() for line in f]
first_line = lines[0] first_line = lines[0]
...@@ -88,8 +88,8 @@ def evaluate_run(filename, subset): ...@@ -88,8 +88,8 @@ def evaluate_run(filename, subset):
if topic_id not in parsed_result: if topic_id not in parsed_result:
parsed_result[topic_id] = dict() parsed_result[topic_id] = dict()
parsed_result[topic_id][result_id] = 1.0 / (int(rank) + rank_offset) parsed_result[topic_id][result_id] = 1.0 / (int(rank) + rank_offset)
ndcg = get_ndcg(parsed_result, task, subset) ndcg, interval = get_ndcg(parsed_result, task, subset, confidence=confidence)
print('%.3f' % ndcg) print('%.3f, %g%% CI: [%.3f; %.3f]' % (ndcg, confidence, *interval))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -5,13 +5,14 @@ from setuptools import setup ...@@ -5,13 +5,14 @@ from setuptools import setup
setup( setup(
name='arqmath_eval', name='arqmath_eval',
version='0.0.17', version='0.0.18',
description='Evaluation of ARQMath systems', description='Evaluation of ARQMath systems',
packages=['arqmath_eval'], packages=['arqmath_eval'],
package_dir={'arqmath_eval': 'scripts'}, package_dir={'arqmath_eval': 'scripts'},
install_requires=[ install_requires=[
'numpy~=1.18.2', 'numpy~=1.18.2',
'pytrec-eval~=0.4', 'pytrec-eval~=0.4',
'scipy~=1.5.2',
'tqdm~=4.46.0', 'tqdm~=4.46.0',
], ],
package_data={ package_data={
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment