Unverified Commit 53d55b45 authored by Vít Starý Novotný's avatar Vít Starý Novotný
Browse files

Use only top 1000 results in the runs (cont.)

parent 1046b511
Loading
Loading
Loading
Loading
Loading
+3 −3
Original line number Original line Diff line number Diff line
@@ -51,9 +51,9 @@ Here is the documentation of the available evaluation functions:


- [`get_topics(task, subset=None)`][get_topics],
- [`get_topics(task, subset=None)`][get_topics],
- [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
- [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
- [`get_random_ndcg(task, subset)`][get_random_ndcg],
- [`get_random_ndcg(task, subset, topn=1000)`][get_random_ndcg],
- [`get_ndcg(parsed_run, task, subset)`][get_ndcg], and
- [`get_ndcg(parsed_run, task, subset, topn=1000)`][get_ndcg], and
- [`get_random_normalized_ndcg(parsed_run, task, subset)`][get_random_normalized_ndcg].
- [`get_random_normalized_ndcg(parsed_run, task, subset, topn=1000)`][get_random_normalized_ndcg].


#### Using the `validation` set to compare various parameters of your system
#### Using the `validation` set to compare various parameters of your system


+6 −3
Original line number Original line Diff line number Diff line
@@ -102,8 +102,9 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
        A task.
        A task.
    subset : str
    subset : str
        A subset of the task.
        A subset of the task.
    topn : int
    topn : int, optional
        The top N results, which will be considered in computing the NDCG.
        The top N results, which will be considered in computing the NDCG.
        Default is 1000.


    Returns
    Returns
    -------
    -------
@@ -134,8 +135,9 @@ def get_random_ndcg(task, subset, topn=1000):
        A task.
        A task.
    subset : str
    subset : str
        A subset of the task.
        A subset of the task.
    topn : int
    topn : int, optional
        The top N results, which will be considered in computing the NDCG.
        The top N results, which will be considered in computing the NDCG.
        Default is 1000.


    Returns
    Returns
    -------
    -------
@@ -185,8 +187,9 @@ def get_random_normalized_ndcg(parsed_run, task, subset, topn=1000):
        A task.
        A task.
    subset : str
    subset : str
        A subset of the task.
        A subset of the task.
    topn : int
    topn : int, optional
        The top N results, which will be considered in computing the NDCG.
        The top N results, which will be considered in computing the NDCG.
        Default is 1000.


    Returns
    Returns
    -------
    -------
+1 −1
Original line number Original line Diff line number Diff line
@@ -5,7 +5,7 @@ from setuptools import setup


setup(
setup(
    name='arqmath_eval',
    name='arqmath_eval',
    version='0.0.2',
    version='0.0.3',
    description='Evaluation of ARQMath systems',
    description='Evaluation of ARQMath systems',
    packages=['arqmath_eval'],
    packages=['arqmath_eval'],
    package_dir={'arqmath_eval': 'scripts'},
    package_dir={'arqmath_eval': 'scripts'},
+86 −0
Original line number Original line Diff line number Diff line
@@ -172,3 +172,89 @@ class TestGetNDCG(unittest.TestCase):


        expected_ndcg = expected_dcg / expected_idcg
        expected_ndcg = expected_dcg / expected_idcg
        self.assertEqual(expected_ndcg, ndcg)
        self.assertEqual(expected_ndcg, ndcg)

    def test_best_with_topn(self):
        """Check ``get_ndcg`` with ``topn=4`` on a run with descending scores.

        The run scores 18 documents of topic ``A.78`` from 1.00 down to 0.15.
        The expected DCG counts a gain of 3.0 at each of the ranks 1-4, and
        the expected ideal DCG spans all 18 ranks (4 x 3.0, 4 x 2.0, 8 x 1.0,
        2 x 0.0).  The gains presumably mirror the relevance judgements of
        the ``task1`` ``test`` subset -- verify against the judgement data.
        """
        def discounted_sum(segments):
            # Accumulate rank by rank, in order: the final assertion is an
            # exact float comparison, so the order of additions matters.
            total = 0.0
            for gain, first_rank, last_rank in segments:
                for rank in range(first_rank, last_rank + 1):
                    total += gain / log2(rank + 1)
            return total

        scores_by_document = {
            '493782':  1.00,
            '493764':  0.95,
            '2008712': 0.90,
            '1282166': 0.85,
            '2008631': 0.80,
            '2008628': 0.75,
            '2008609': 0.70,
            '1116378': 0.65,
            '2008650': 0.60,
            '2008616': 0.55,
            '2008449': 0.50,
            '1282180': 0.45,
            '1282116': 0.40,
            '1282112': 0.35,
            '1116370': 0.30,
            '1116368': 0.25,
            '1282155': 0.20,
            '1282114': 0.15,
        }
        parsed_run = {'A.78': scores_by_document}
        ndcg = get_ndcg(parsed_run, 'task1', 'test', 4)

        # Only the first four ranks contribute to the expected DCG.
        expected_dcg = discounted_sum([(3.0, 1, 4)])

        # The expected ideal DCG is not truncated to four ranks.
        expected_idcg = discounted_sum([
            (3.0, 1, 4),
            (2.0, 5, 8),
            (1.0, 9, 16),
            (0.0, 17, 18),
        ])

        expected_ndcg = expected_dcg / expected_idcg
        self.assertEqual(expected_ndcg, ndcg)

    def test_worst_with_topn(self):
        """Check ``get_ndcg`` with ``topn=4`` on a run with ascending scores.

        The run scores the same 18 documents of topic ``A.78`` as the
        descending-score test, but from 0.15 up to 1.00.  The expected DCG
        counts a gain of 0.0 at ranks 1-2 and 1.0 at ranks 3-4, and the
        expected ideal DCG spans all 18 ranks (4 x 3.0, 4 x 2.0, 8 x 1.0,
        2 x 0.0).  The gains presumably mirror the relevance judgements of
        the ``task1`` ``test`` subset -- verify against the judgement data.
        """
        def discounted_sum(segments):
            # Accumulate rank by rank, in order: the final assertion is an
            # exact float comparison, so the order of additions matters.
            total = 0.0
            for gain, first_rank, last_rank in segments:
                for rank in range(first_rank, last_rank + 1):
                    total += gain / log2(rank + 1)
            return total

        scores_by_document = {
            '493782':  0.15,
            '493764':  0.20,
            '2008712': 0.25,
            '1282166': 0.30,
            '2008631': 0.35,
            '2008628': 0.40,
            '2008609': 0.45,
            '1116378': 0.50,
            '2008650': 0.55,
            '2008616': 0.60,
            '2008449': 0.65,
            '1282180': 0.70,
            '1282116': 0.75,
            '1282112': 0.80,
            '1116370': 0.85,
            '1116368': 0.90,
            '1282155': 0.95,
            '1282114': 1.00,
        }
        parsed_run = {'A.78': scores_by_document}
        ndcg = get_ndcg(parsed_run, 'task1', 'test', 4)

        # Expected gains at the four retained ranks: 0, 0, 1, 1.
        expected_dcg = discounted_sum([
            (0.0, 1, 2),
            (1.0, 3, 4),
        ])

        # The expected ideal DCG is not truncated to four ranks.
        expected_idcg = discounted_sum([
            (3.0, 1, 4),
            (2.0, 5, 8),
            (1.0, 9, 16),
            (0.0, 17, 18),
        ])

        expected_ndcg = expected_dcg / expected_idcg
        self.assertEqual(expected_ndcg, ndcg)
+20 −0
Original line number Original line Diff line number Diff line
@@ -30,3 +30,23 @@ class TestGetRandomNDCG(unittest.TestCase):


        expected_ndcg = expected_dcg / expected_idcg
        expected_ndcg = expected_dcg / expected_idcg
        self.assertEqual(expected_ndcg, ndcg)
        self.assertEqual(expected_ndcg, ndcg)

    def test_with_topn(self):
        """Check ``get_random_ndcg`` with ``topn=4``.

        The expected DCG places the mean gain of 18 judgements (4 x 3.0,
        4 x 2.0, 8 x 1.0, 2 x 0.0) at each of the ranks 1-4, and the expected
        ideal DCG places a gain of 3.0 at each of those four ranks.  The
        counts presumably mirror the relevance judgements of the ``task1``
        ``test`` subset -- verify against the judgement data.
        """
        ndcg = get_random_ndcg('task1', 'test', 4)

        # (number of judgements, gain) pairs; every partial sum is an
        # exactly representable float, so the mean matches a direct formula.
        judgement_counts = ((4, 3.0), (4, 2.0), (8, 1.0), (2, 0.0))
        gain_total = 0.0
        for count, gain in judgement_counts:
            gain_total += count * gain
        expected_judgement = gain_total / 18

        # Build both sums rank by rank, each in its own accumulator, so the
        # order of float additions stays fixed for the exact comparison.
        expected_dcg = 0.0
        expected_idcg = 0.0
        for rank in range(1, 5):
            discount = log2(rank + 1)
            expected_dcg += expected_judgement / discount
            expected_idcg += 3.0 / discount

        expected_ndcg = expected_dcg / expected_idcg
        self.assertEqual(expected_ndcg, ndcg)
Loading