Commit c3aab4ef authored by Vít Novotný's avatar Vít Novotný
Browse files

Make get_ndcg produce confidence intervals

parent f777489d
Loading
Loading
Loading
Loading
Loading
+11 −11
Original line number Original line Diff line number Diff line
@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks.
#### Using the `train` subset to train your supervised system
#### Using the `train` subset to train your supervised system


``` sh
``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
$ python
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>>
>>>
@@ -58,14 +58,14 @@ Here is the documentation of the available evaluation functions:
- [`get_topics(task, subset=None)`][get_topics],
- [`get_topics(task, subset=None)`][get_topics],
- [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
- [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
- [`get_random_ndcg(task, subset, topn)`][get_random_ndcg],
- [`get_random_ndcg(task, subset, topn)`][get_random_ndcg],
- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
- [`get_ndcg(parsed_run, task, subset, topn, confidence)`][get_ndcg], and
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
- [`get_judgement(task, subset, topic, judged_document)`][get_judgement].
- [`get_judgement(task, subset, topic, judged_document)`][get_judgement].


#### Using the `validation` subset to compare various parameters of your system
#### Using the `validation` subset to compare various parameters of your system


``` sh
``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
$ python
$ python
>>> from arqmath_eval import get_topics, get_judged_documents
>>> from arqmath_eval import get_topics, get_judged_documents
>>>
>>>
@@ -96,19 +96,19 @@ $ git push # publish your new result and the upd
#### Using the `all` subset to compute the NDCG' score of an ARQMath submission
#### Using the `all` subset to compute the NDCG' score of an ARQMath submission


``` sh
``` sh
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
0.238
0.238, 95% CI: [0.198; 0.278]
```
```


 [arqmath-task1]:              https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
 [arqmath-task1]:              https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
 [arqmath-task2]:              https://www.cs.rit.edu/~dprl/ARQMath/task2-formulas.html (Task 2: Formula Search)
 [arqmath-task2]:              https://www.cs.rit.edu/~dprl/ARQMath/task2-formulas.html (Task 2: Formula Search)
 [get_judged_documents]:       https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L61
 [get_topics]:                 https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L35
 [get_ndcg]:                   https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L94
 [get_judged_documents]:       https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L62
 [get_random_ndcg]:            https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L129
 [get_ndcg]:                   https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L95
 [get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L174
 [get_random_ndcg]:            https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L140
 [get_judgement]:              https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L213
 [get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L185
 [get_topics]:                 https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L34
 [get_judgement]:              https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L224
 [ntcir-11-math-2]:            http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.686.444&rep=rep1&type=pdf (NTCIR-11 Math-2 Task Overview)
 [ntcir-11-math-2]:            http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.686.444&rep=rep1&type=pdf (NTCIR-11 Math-2 Task Overview)
 [ntcir-12-mathir]:            https://www.cs.rit.edu/~rlaz/files/ntcir12-mathir.pdf (NTCIR-12 MathIR Task Overview)
 [ntcir-12-mathir]:            https://www.cs.rit.edu/~rlaz/files/ntcir12-mathir.pdf (NTCIR-12 MathIR Task Overview)
 [treceval-format]:            https://stackoverflow.com/a/8175382/657401 (How to evaluate a search/retrieval engine using trec_eval?)
 [treceval-format]:            https://stackoverflow.com/a/8175382/657401 (How to evaluate a search/retrieval engine using trec_eval?)
+14 −3
Original line number Original line Diff line number Diff line
@@ -5,6 +5,7 @@ from itertools import chain
from math import log2
from math import log2


import numpy as np
import numpy as np
import scipy.stats as st


from .configuration import EVALUATORS, PARSED_RELEVANCE_JUDGEMENTS
from .configuration import EVALUATORS, PARSED_RELEVANCE_JUDGEMENTS


@@ -91,7 +92,7 @@ def get_judged_documents(task, subset=None, topic=None):
    return judged_documents
    return judged_documents




def get_ndcg(parsed_run, task, subset, topn=1000):
def get_ndcg(parsed_run, task, subset, topn=1000, confidence=None):
    """Returns the NDCG' of a system's run on a subset of a task.
    """Returns the NDCG' of a system's run on a subset of a task.


    NDCG' is the same as NDCG (Normalized Discounted Cumulative Gain), but all
    NDCG' is the same as NDCG (Normalized Discounted Cumulative Gain), but all
@@ -109,11 +110,16 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
    topn : int, optional
    topn : int, optional
        The top N results, which will be considered in computing the NDCG.
        The top N results, which will be considered in computing the NDCG.
        Default is 1000.
        Default is 1000.
    confidence : float or None, optional
        The confidence level, expressed as a percentage (e.g. 95.0, not 0.95),
        used to construct a confidence interval.
        If None, then no confidence interval is constructed. Default is None.


    Returns
    Returns
    -------
    -------
    ndcg : float
    ndcg : float
        The NDCG' of the system's run on the subset of the task.
        The NDCG' of the system's run on the subset of the task.
    interval : (float, float), optional
        The confidence interval for the NDCG'. Only produced when confidence is not None.


    """
    """
    evaluator = EVALUATORS[subset][task]
    evaluator = EVALUATORS[subset][task]
@@ -122,7 +128,12 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
    if not parsed_run:
    if not parsed_run:
        return 0.0
        return 0.0
    evaluation = evaluator.evaluate(parsed_run)
    evaluation = evaluator.evaluate(parsed_run)
    ndcg = np.mean([measures['ndcg'] for topic, measures in evaluation.items()])
    sample = [measures['ndcg'] for topic, measures in evaluation.items()]
    ndcg = np.mean(sample)
    if confidence is not None:
        interval = st.t.interval(confidence / 100.0, len(sample) - 1, loc=ndcg, scale=st.sem(sample))
        return (ndcg, interval)
    else:
        return ndcg
        return ndcg




+3 −3
Original line number Original line Diff line number Diff line
@@ -63,7 +63,7 @@ def produce_leaderboards():
                    f_readme.write('|  %.4f  |  %s  |  %s  |\n' % (ndcg, result_name, user_name))
                    f_readme.write('|  %.4f  |  %s  |  %s  |\n' % (ndcg, result_name, user_name))




def evaluate_run(filename, subset):
def evaluate_run(filename, subset, confidence=95.0):
    with open(filename, 'rt') as f:
    with open(filename, 'rt') as f:
        lines = [line.strip().split() for line in f]
        lines = [line.strip().split() for line in f]
    first_line = lines[0]
    first_line = lines[0]
@@ -88,8 +88,8 @@ def evaluate_run(filename, subset):
        if topic_id not in parsed_result:
        if topic_id not in parsed_result:
            parsed_result[topic_id] = dict()
            parsed_result[topic_id] = dict()
        parsed_result[topic_id][result_id] = 1.0 / (int(rank) + rank_offset)
        parsed_result[topic_id][result_id] = 1.0 / (int(rank) + rank_offset)
    ndcg = get_ndcg(parsed_result, task, subset)
    ndcg, interval = get_ndcg(parsed_result, task, subset, confidence=confidence)
    print('%.3f' % ndcg)
    print('%.3f, %g%% CI: [%.3f; %.3f]' % (ndcg, confidence, *interval))




if __name__ == '__main__':
if __name__ == '__main__':
+2 −1
Original line number Original line Diff line number Diff line
@@ -5,13 +5,14 @@ from setuptools import setup


setup(
setup(
    name='arqmath_eval',
    name='arqmath_eval',
    version='0.0.17',
    version='0.0.18',
    description='Evaluation of ARQMath systems',
    description='Evaluation of ARQMath systems',
    packages=['arqmath_eval'],
    packages=['arqmath_eval'],
    package_dir={'arqmath_eval': 'scripts'},
    package_dir={'arqmath_eval': 'scripts'},
    install_requires=[
    install_requires=[
        'numpy~=1.18.2',
        'numpy~=1.18.2',
        'pytrec-eval~=0.4',
        'pytrec-eval~=0.4',
        'scipy~=1.5.2',
        'tqdm~=4.46.0',
        'tqdm~=4.46.0',
    ],
    ],
    package_data={
    package_data={