Commit 2d2d8d87 authored by Vít Novotný

Add common.get_judgement

parent 0b94ae2e
Pipeline #62622 failed
README.md
@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks.
#### Using the `train` subset to train your supervised system
``` sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.14
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.15
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>>
@@ -60,11 +60,12 @@ Here is the documentation of the available evaluation functions:
- [`get_random_ndcg(task, subset, topn)`][get_random_ndcg],
- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg],
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg], and
+- [`get_judgement(task, subset, topic, judged_document)`][get_judgement].
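For example, the new `get_judgement` function looks up the human relevance judgement for a single topic–document pair. A quick sketch, using values from the `task1-example` fixture exercised by the unit tests added in this commit:

``` sh
$ python
>>> from arqmath_eval import get_judgement
>>> get_judgement('task1-example', 'test', 'A.78', '493764')
3
>>> get_judgement('task1-example', 'test', 'A.78', 'nonexistent') is None
True
```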
#### Using the `validation` subset to compare various parameters of your system
``` sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.14
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.15
$ python
>>> from arqmath_eval import get_topics, get_judged_documents
>>>
@@ -95,7 +96,7 @@ $ git push # publish your new result and the upd
#### Using the `all` subset to compute the NDCG' score of an ARQMath submission
``` sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.14
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.15
$ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv
0.238
```
@@ -106,6 +107,7 @@ $ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv
[get_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L94
[get_random_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L129
[get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L174
+[get_judgement]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L213
[get_topics]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L34
[ntcir-11-math-2]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.686.444&rep=rep1&type=pdf (NTCIR-11 Math-2 Task Overview)
[ntcir-12-mathir]: https://www.cs.rit.edu/~rlaz/files/ntcir12-mathir.pdf (NTCIR-12 MathIR Task Overview)
scripts/__init__.py
-from .common import get_topics, get_judged_documents, get_ndcg as ndcg, get_ndcg, get_random_ndcg, get_random_normalized_ndcg
+from .common import get_topics, get_judged_documents, get_ndcg as ndcg, get_ndcg, get_random_ndcg, get_random_normalized_ndcg, get_judgement
scripts/common.py
@@ -208,3 +208,41 @@ def get_random_normalized_ndcg(parsed_run, task, subset, topn=1000, ndcg=None):
    random_ndcg = get_random_ndcg(task, subset, topn)
    random_normalized_ndcg = (ndcg - random_ndcg) / (1.0 - random_ndcg)
    return random_normalized_ndcg
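NDCG' is the same as NDCG (Normalized Discounted Cumulative Gain), but all non-judged documents in the run are disregarded, see https://www.cs.rit.edu/~dprl/ARQMath/, section Ranking metrics. The random-normalized NDCG' takes the expected NDCG' of a random system into account: an NDCG' of 1.0 is normalized to 1.0, the NDCG' of a random system is normalized to 0.0, and an NDCG' worse than that of a random system is also normalized to 0.0. A worked example with illustrative values (not real results):

``` python
>>> ndcg, random_ndcg = 0.75, 0.5  # illustrative scores only
>>> (ndcg - random_ndcg) / (1.0 - random_ndcg)
0.5
```

That is, such a run lands halfway between a random system (0.0) and a perfect system (1.0).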
def get_judgement(task, subset, topic, judged_document):
    """Returns the judgement of a document in a topic from a subset of a task, or None if none exists.

    Parameters
    ----------
    task : str
        A task.
    subset : str
        A subset of the task.
    topic : str
        A topic from the subset.
    judged_document : str
        A document in the topic.

    Returns
    -------
    judgement : int or None
        The judgement of the document in the topic from the subset of the task, or None if none exists.
    """
    if subset not in PARSED_RELEVANCE_JUDGEMENTS \
            or task not in PARSED_RELEVANCE_JUDGEMENTS[subset] \
            or topic not in PARSED_RELEVANCE_JUDGEMENTS[subset][task] \
            or judged_document not in PARSED_RELEVANCE_JUDGEMENTS[subset][task][topic]:
        judgement = None
    else:
        judgement = PARSED_RELEVANCE_JUDGEMENTS[subset][task][topic][judged_document]
    return judgement
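For orientation, the lookup above implies that `PARSED_RELEVANCE_JUDGEMENTS` is a nested mapping keyed as subset → task → topic → judged document. A minimal sketch of that shape, with the key order taken from the code and the values from the `task1-example` test fixture below (the literal structure is an assumption, not shown in the source):

``` python
# Hypothetical shape of PARSED_RELEVANCE_JUDGEMENTS (assumption):
PARSED_RELEVANCE_JUDGEMENTS = {
    'test': {                 # subset
        'task1-example': {    # task
            'A.78': {         # topic
                '493764': 3,  # judged document -> relevance judgement
            },
        },
    },
}
```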
setup.py
@@ -5,7 +5,7 @@ from setuptools import setup

setup(
    name='arqmath_eval',
-    version='0.0.14',
+    version='0.0.15',
    description='Evaluation of ARQMath systems',
    packages=['arqmath_eval'],
    package_dir={'arqmath_eval': 'scripts'},
scripts/test_common.py
import unittest

from arqmath_eval import get_judgement


class TestGetJudgement(unittest.TestCase):
    def test_existent(self):
        judgement = get_judgement('task1-example', 'test', 'A.78', '493764')
        expected_judgement = 3
        self.assertEqual(expected_judgement, judgement)

    def test_nonexistent_judged_document(self):
        judgement = get_judgement('task1-example', 'test', 'A.78', 'nonexistent')
        expected_judgement = None
        self.assertEqual(expected_judgement, judgement)

    def test_nonexistent_topic(self):
        judgement = get_judgement('task1-example', 'test', 'nonexistent', '493764')
        expected_judgement = None
        self.assertEqual(expected_judgement, judgement)

    def test_nonexistent_subset(self):
        judgement = get_judgement('task1-example', 'nonexistent', 'A.79', '493764')
        expected_judgement = None
        self.assertEqual(expected_judgement, judgement)

    def test_nonexistent_task(self):
        judgement = get_judgement('nonexistent', 'test', 'A.78', '493764')
        expected_judgement = None
        self.assertEqual(expected_judgement, judgement)
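Assuming standard unittest discovery picks these tests up (an assumption; the project's actual test runner is not shown here), they can be run with:

``` sh
$ python -m unittest discover
```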