Commit c3aab4ef authored by Vít Novotný
Browse files

Make get_ndcg produce confidence intervals

parent f777489d
Pipeline #62984 failed with stage
......@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks.
#### Using the `train` subset to train your supervised system
``` sh
$ pip install --force-reinstall git+
$ pip install --force-reinstall git+
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
......@@ -58,14 +58,14 @@ Here is the documentation of the available evaluation functions:
- [`get_topics(task, subset=None)`][get_topics],
- [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
- [`get_random_ndcg(task, subset, topn)`][get_random_ndcg],
- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
- [`get_ndcg(parsed_run, task, subset, topn, confidence)`][get_ndcg], and
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
- [`get_judgement(task, subset, topic, judged_document)`][get_judgement].
#### Using the `validation` subset to compare various parameters of your system
``` sh
$ pip install --force-reinstall git+
$ pip install --force-reinstall git+
$ python
>>> from arqmath_eval import get_topics, get_judged_documents
......@@ -96,19 +96,19 @@ $ git push # publish your new result and the upd
#### Using the `all` subset to compute the NDCG' score of an ARQMath submission
``` sh
$ pip install --force-reinstall git+
$ pip install --force-reinstall git+
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
0.238, 95% CI: [0.198; 0.278]
[arqmath-task1]: (Task 1: Find Answers)
[arqmath-task2]: (Task 2: Formula Search)
[ntcir-11-math-2]: (NTCIR-11 Math-2 Task Overview)
[ntcir-12-mathir]: (NTCIR-12 MathIR Task Overview)
[treceval-format]: (How to evaluate a search/retrieval engine using trec_eval?)
......@@ -5,6 +5,7 @@ from itertools import chain
from math import log2
import numpy as np
import scipy.stats as st
......@@ -91,7 +92,7 @@ def get_judged_documents(task, subset=None, topic=None):
return judged_documents
def get_ndcg(parsed_run, task, subset, topn=1000):
def get_ndcg(parsed_run, task, subset, topn=1000, confidence=None):
"""Returns the NDCG' of a system's run on a subset of a task.
NDCG' is the same as NDCG (Normalized Discounted Cumulative Gain), but all
......@@ -109,11 +110,16 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
topn : int, optional
The top N results, which will be considered in computing the NDCG.
Default is 1000.
confidence : float or None, optional
The confidence level, in percent (e.g. 95.0 for a 95% confidence interval),
used to construct a confidence interval for the NDCG'.
If None, then no confidence interval is constructed. Default is None.
ndcg : float
The NDCG' of the system's run on the subset of the task.
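interval : (float, float), optional
The confidence interval for the NDCG'. Only produced when confidence is not
None, in which case the tuple (ndcg, interval) is returned. Note that for an
empty run, a bare 0.0 is returned without an interval.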
evaluator = EVALUATORS[subset][task]
......@@ -122,8 +128,13 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
if not parsed_run:
return 0.0
evaluation = evaluator.evaluate(parsed_run)
ndcg = np.mean([measures['ndcg'] for topic, measures in evaluation.items()])
return ndcg
sample = [measures['ndcg'] for topic, measures in evaluation.items()]
ndcg = np.mean(sample)
if confidence is not None:
interval = st.t.interval(confidence / 100.0, len(sample) - 1, loc=ndcg, scale=st.sem(sample))
return (ndcg, interval)
return ndcg
def get_random_ndcg(task, subset, topn=1000):
......@@ -63,7 +63,7 @@ def produce_leaderboards():
f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name))
def evaluate_run(filename, subset):
def evaluate_run(filename, subset, confidence=95.0):
with open(filename, 'rt') as f:
lines = [line.strip().split() for line in f]
first_line = lines[0]
......@@ -88,8 +88,8 @@ def evaluate_run(filename, subset):
if topic_id not in parsed_result:
parsed_result[topic_id] = dict()
parsed_result[topic_id][result_id] = 1.0 / (int(rank) + rank_offset)
ndcg = get_ndcg(parsed_result, task, subset)
print('%.3f' % ndcg)
ndcg, interval = get_ndcg(parsed_result, task, subset, confidence=confidence)
print('%.3f, %g%% CI: [%.3f; %.3f]' % (ndcg, confidence, *interval))
if __name__ == '__main__':
......@@ -5,13 +5,14 @@ from setuptools import setup
description='Evaluation of ARQMath systems',
package_dir={'arqmath_eval': 'scripts'},
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment