Commit c3aab4ef authored by Vít Novotný's avatar Vít Novotný

Make get_ndcg produce confidence intervals

parent f777489d
Pipeline #62984 failed with stage
......@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks.
#### Using the `train` subset to train your supervised system
``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>>
......@@ -58,14 +58,14 @@ Here is the documentation of the available evaluation functions:
- [`get_topics(task, subset=None)`][get_topics],
- [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
- [`get_random_ndcg(task, subset, topn)`][get_random_ndcg],
- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
- [`get_ndcg(parsed_run, task, subset, topn, confidence)`][get_ndcg], and
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
- [`get_judgement(task, subset, topic, judged_document)`][get_judgement].
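For illustration, here is a minimal sketch of calling the updated `get_ndcg` with the new `confidence` parameter. It builds a trivial run that gives every judged document the same score, so the resulting NDCG' is not meaningful, but it shows that a `(ndcg, interval)` tuple is returned when `confidence` is given:
``` sh
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>> parsed_run = {
...     topic: {document: 1.0 for document in get_judged_documents('task1', 'train', topic)}
...     for topic in get_topics('task1', 'train')
... }
>>> ndcg, (lower, upper) = get_ndcg(parsed_run, 'task1', 'train', confidence=95.0)
```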
#### Using the `validation` subset to compare various parameters of your system
``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
$ python
>>> from arqmath_eval import get_topics, get_judged_documents
>>>
......@@ -96,19 +96,19 @@ $ git push # publish your new result and the upd
#### Using the `all` subset to compute the NDCG' score of an ARQMath submission
``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
0.238
0.238, 95% CI: [0.198; 0.278]
```
[arqmath-task1]: https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
[arqmath-task2]: https://www.cs.rit.edu/~dprl/ARQMath/task2-formulas.html (Task 2: Formula Search)
[get_judged_documents]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L61
[get_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L94
[get_random_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L129
[get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L174
[get_judgement]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L213
[get_topics]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L34
[get_topics]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L35
[get_judged_documents]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L62
[get_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L95
[get_random_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L140
[get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L185
[get_judgement]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L224
[ntcir-11-math-2]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.686.444&rep=rep1&type=pdf (NTCIR-11 Math-2 Task Overview)
[ntcir-12-mathir]: https://www.cs.rit.edu/~rlaz/files/ntcir12-mathir.pdf (NTCIR-12 MathIR Task Overview)
[treceval-format]: https://stackoverflow.com/a/8175382/657401 (How to evaluate a search/retrieval engine using trec_eval?)
......
......@@ -5,6 +5,7 @@ from itertools import chain
from math import log2
import numpy as np
import scipy.stats as st
from .configuration import EVALUATORS, PARSED_RELEVANCE_JUDGEMENTS
......@@ -91,7 +92,7 @@ def get_judged_documents(task, subset=None, topic=None):
return judged_documents
def get_ndcg(parsed_run, task, subset, topn=1000):
def get_ndcg(parsed_run, task, subset, topn=1000, confidence=None):
"""Returns the NDCG' of a system's run on a subset of a task.
NDCG' is the same as NDCG (Normalized Discounted Cumulative Gain), but all
......@@ -109,11 +110,16 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
topn : int, optional
The top N results, which will be considered in computing the NDCG.
Default is 1000.
confidence : float or None, optional
The confidence level in percent (e.g. 95.0) used to construct a confidence interval.
If None, then no confidence interval is constructed. Default is None.
Returns
-------
ndcg : float
The NDCG' of the system's run on the subset of the task.
interval : (float, float), optional
The confidence interval for the NDCG'. Only produced when confidence is not None.
"""
evaluator = EVALUATORS[subset][task]
......@@ -122,8 +128,13 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
if not parsed_run:
return 0.0
evaluation = evaluator.evaluate(parsed_run)
ndcg = np.mean([measures['ndcg'] for topic, measures in evaluation.items()])
return ndcg
sample = [measures['ndcg'] for topic, measures in evaluation.items()]
ndcg = np.mean(sample)
if confidence is not None:
interval = st.t.interval(confidence / 100.0, len(sample) - 1, loc=ndcg, scale=st.sem(sample))
return (ndcg, interval)
else:
return ndcg
def get_random_ndcg(task, subset, topn=1000):
......
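For clarity, the interval returned by `get_ndcg` above is a two-sided Student t confidence interval around the mean of the per-topic NDCG' scores. Below is a minimal standalone sketch of the same computation; the helper name and the sample values are illustrative, not part of the package:
``` python
import numpy as np
import scipy.stats as st

def t_confidence_interval(sample, confidence=95.0):
    # Two-sided Student t confidence interval around the sample mean,
    # mirroring st.t.interval(confidence / 100.0, df, loc=mean, scale=sem).
    mean = np.mean(sample)
    sem = st.sem(sample)                                  # standard error of the mean
    df = len(sample) - 1                                  # degrees of freedom
    t_critical = st.t.ppf(0.5 + confidence / 200.0, df)  # two-sided critical value
    return (mean - t_critical * sem, mean + t_critical * sem)

per_topic_ndcg = [0.2, 0.3, 0.25, 0.4]  # placeholder per-topic NDCG' scores
print(t_confidence_interval(per_topic_ndcg))
```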
......@@ -63,7 +63,7 @@ def produce_leaderboards():
f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name))
def evaluate_run(filename, subset):
def evaluate_run(filename, subset, confidence=95.0):
with open(filename, 'rt') as f:
lines = [line.strip().split() for line in f]
first_line = lines[0]
......@@ -88,8 +88,8 @@ def evaluate_run(filename, subset):
if topic_id not in parsed_result:
parsed_result[topic_id] = dict()
parsed_result[topic_id][result_id] = 1.0 / (int(rank) + rank_offset)
ndcg = get_ndcg(parsed_result, task, subset)
print('%.3f' % ndcg)
ndcg, interval = get_ndcg(parsed_result, task, subset, confidence=confidence)
print('%.3f, %g%% CI: [%.3f; %.3f]' % (ndcg, confidence, *interval))
if __name__ == '__main__':
......
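The `parsed_result` dictionary assembled above, and expected by `get_ndcg`, maps topic identifiers to judged document identifiers and scores; the reciprocal-rank scores in this sketch mirror the `1.0 / (rank + rank_offset)` conversion in `evaluate_run`, and the identifiers are placeholders rather than real ARQMath topics or posts:
``` python
from arqmath_eval import get_ndcg

# topic id -> document id -> score (a higher score means ranked better);
# the topic and document identifiers below are placeholders.
parsed_result = {
    'A.1': {
        'doc-123': 1.0 / 1,  # ranked first
        'doc-456': 1.0 / 2,  # ranked second
    },
    'A.2': {
        'doc-789': 1.0 / 1,
    },
}
ndcg, interval = get_ndcg(parsed_result, 'task1', 'all', confidence=95.0)
```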
......@@ -5,13 +5,14 @@ from setuptools import setup
setup(
name='arqmath_eval',
version='0.0.17',
version='0.0.18',
description='Evaluation of ARQMath systems',
packages=['arqmath_eval'],
package_dir={'arqmath_eval': 'scripts'},
install_requires=[
'numpy~=1.18.2',
'pytrec-eval~=0.4',
'scipy~=1.5.2',
'tqdm~=4.46.0',
],
package_data={
......