Loading README.md +3 −3 Original line number Original line Diff line number Diff line Loading @@ -51,9 +51,9 @@ Here is the documentation of the available evaluation functions: - [`get_topics(task, subset=None)`][get_topics], - [`get_topics(task, subset=None)`][get_topics], - [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents], - [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents], - [`get_random_ndcg(task, subset)`][get_random_ndcg], - [`get_random_ndcg(task, subset, topn=1000)`][get_random_ndcg], - [`get_ndcg(parsed_run, task, subset)`][get_ndcg], and - [`get_ndcg(parsed_run, task, subset, topn=1000)`][get_ndcg], and - [`get_random_normalized_ndcg(parsed_run, task, subset)`][get_random_normalized_ndcg]. - [`get_random_normalized_ndcg(parsed_run, task, subset, topn=1000)`][get_random_normalized_ndcg]. #### Using the `validation` set to compare various parameters of your system #### Using the `validation` set to compare various parameters of your system Loading scripts/common.py +6 −3 Original line number Original line Diff line number Diff line Loading @@ -102,8 +102,9 @@ def get_ndcg(parsed_run, task, subset, topn=1000): A task. A task. subset : str subset : str A subset of the task. A subset of the task. topn : int topn : int, optional The top N results, which will be considered in computing the NDCG. The top N results, which will be considered in computing the NDCG. Default is 1000. Returns Returns ------- ------- Loading Loading @@ -134,8 +135,9 @@ def get_random_ndcg(task, subset, topn=1000): A task. A task. subset : str subset : str A subset of the task. A subset of the task. topn : int topn : int, optional The top N results, which will be considered in computing the NDCG. The top N results, which will be considered in computing the NDCG. Default is 1000. Returns Returns ------- ------- Loading Loading @@ -185,8 +187,9 @@ def get_random_normalized_ndcg(parsed_run, task, subset, topn=1000): A task. A task. 
subset : str subset : str A subset of the task. A subset of the task. topn : int topn : int, optional The top N results, which will be considered in computing the NDCG. The top N results, which will be considered in computing the NDCG. Default is 1000. Returns Returns ------- ------- Loading setup.py +1 −1 Original line number Original line Diff line number Diff line Loading @@ -5,7 +5,7 @@ from setuptools import setup setup( setup( name='arqmath_eval', name='arqmath_eval', version='0.0.2', version='0.0.3', description='Evaluation of ARQMath systems', description='Evaluation of ARQMath systems', packages=['arqmath_eval'], packages=['arqmath_eval'], package_dir={'arqmath_eval': 'scripts'}, package_dir={'arqmath_eval': 'scripts'}, Loading test/test_get_ndcg.py +86 −0 Original line number Original line Diff line number Diff line Loading @@ -172,3 +172,89 @@ class TestGetNDCG(unittest.TestCase): expected_ndcg = expected_dcg / expected_idcg expected_ndcg = expected_dcg / expected_idcg self.assertEqual(expected_ndcg, ndcg) self.assertEqual(expected_ndcg, ndcg) def test_best_with_topn(self): parsed_run = { 'A.78': { '493782': 1.00, '493764': 0.95, '2008712': 0.90, '1282166': 0.85, '2008631': 0.80, '2008628': 0.75, '2008609': 0.70, '1116378': 0.65, '2008650': 0.60, '2008616': 0.55, '2008449': 0.50, '1282180': 0.45, '1282116': 0.40, '1282112': 0.35, '1116370': 0.30, '1116368': 0.25, '1282155': 0.20, '1282114': 0.15, } } ndcg = get_ndcg(parsed_run, 'task1', 'test', 4) expected_dcg = 0.0 for i in range(1, 5): expected_dcg += 3.0 / log2(i + 1) expected_idcg = 0.0 for i in range(1, 5): expected_idcg += 3.0 / log2(i + 1) for i in range(5, 9): expected_idcg += 2.0 / log2(i + 1) for i in range(9, 17): expected_idcg += 1.0 / log2(i + 1) for i in range(17, 19): expected_idcg += 0.0 / log2(i + 1) expected_ndcg = expected_dcg / expected_idcg self.assertEqual(expected_ndcg, ndcg) def test_worst_with_topn(self): parsed_run = { 'A.78': { '493782': 0.15, '493764': 0.20, '2008712': 0.25, 
'1282166': 0.30, '2008631': 0.35, '2008628': 0.40, '2008609': 0.45, '1116378': 0.50, '2008650': 0.55, '2008616': 0.60, '2008449': 0.65, '1282180': 0.70, '1282116': 0.75, '1282112': 0.80, '1116370': 0.85, '1116368': 0.90, '1282155': 0.95, '1282114': 1.00, } } ndcg = get_ndcg(parsed_run, 'task1', 'test', 4) expected_dcg = 0.0 for i in range(1, 3): expected_dcg += 0.0 / log2(i + 1) for i in range(3, 5): expected_dcg += 1.0 / log2(i + 1) expected_idcg = 0.0 for i in range(1, 5): expected_idcg += 3.0 / log2(i + 1) for i in range(5, 9): expected_idcg += 2.0 / log2(i + 1) for i in range(9, 17): expected_idcg += 1.0 / log2(i + 1) for i in range(17, 19): expected_idcg += 0.0 / log2(i + 1) expected_ndcg = expected_dcg / expected_idcg self.assertEqual(expected_ndcg, ndcg) test/test_get_random_ndcg.py +20 −0 Original line number Original line Diff line number Diff line Loading @@ -30,3 +30,23 @@ class TestGetRandomNDCG(unittest.TestCase): expected_ndcg = expected_dcg / expected_idcg expected_ndcg = expected_dcg / expected_idcg self.assertEqual(expected_ndcg, ndcg) self.assertEqual(expected_ndcg, ndcg) def test_with_topn(self): ndcg = get_random_ndcg('task1', 'test', 4) expected_judgement = ( 4 * 3.0 + 4 * 2.0 + 8 * 1.0 + 2 * 0.0 ) / 18 expected_dcg = 0.0 for i in range(1, 5): expected_dcg += expected_judgement / log2(i + 1) expected_idcg = 0.0 for i in range(1, 5): expected_idcg += 3.0 / log2(i + 1) expected_ndcg = expected_dcg / expected_idcg self.assertEqual(expected_ndcg, ndcg) Loading
README.md +3 −3
@@ -51,9 +51,9 @@ Here is the documentation of the available evaluation functions:
 - [`get_topics(task, subset=None)`][get_topics],
 - [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
-- [`get_random_ndcg(task, subset)`][get_random_ndcg],
-- [`get_ndcg(parsed_run, task, subset)`][get_ndcg], and
-- [`get_random_normalized_ndcg(parsed_run, task, subset)`][get_random_normalized_ndcg].
+- [`get_random_ndcg(task, subset, topn=1000)`][get_random_ndcg],
+- [`get_ndcg(parsed_run, task, subset, topn=1000)`][get_ndcg], and
+- [`get_random_normalized_ndcg(parsed_run, task, subset, topn=1000)`][get_random_normalized_ndcg].

 #### Using the `validation` set to compare various parameters of your system
scripts/common.py +6 −3
@@ -102,8 +102,9 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
         A task.
     subset : str
         A subset of the task.
-    topn : int
+    topn : int, optional
         The top N results, which will be considered in computing the NDCG.
+        Default is 1000.

     Returns
     -------
@@ -134,8 +135,9 @@ def get_random_ndcg(task, subset, topn=1000):
         A task.
     subset : str
         A subset of the task.
-    topn : int
+    topn : int, optional
         The top N results, which will be considered in computing the NDCG.
+        Default is 1000.

     Returns
     -------
@@ -185,8 +187,9 @@ def get_random_normalized_ndcg(parsed_run, task, subset, topn=1000):
         A task.
     subset : str
         A subset of the task.
-    topn : int
+    topn : int, optional
         The top N results, which will be considered in computing the NDCG.
+        Default is 1000.

     Returns
     -------
setup.py +1 −1 Original line number Original line Diff line number Diff line Loading @@ -5,7 +5,7 @@ from setuptools import setup setup( setup( name='arqmath_eval', name='arqmath_eval', version='0.0.2', version='0.0.3', description='Evaluation of ARQMath systems', description='Evaluation of ARQMath systems', packages=['arqmath_eval'], packages=['arqmath_eval'], package_dir={'arqmath_eval': 'scripts'}, package_dir={'arqmath_eval': 'scripts'}, Loading
# test/test_get_ndcg.py (+86 −0), hunk @@ -172,3 +172,89 @@
# Methods added to class TestGetNDCG(unittest.TestCase); both exercise the
# fourth positional argument of get_ndcg (topn) with a cut-off of 4.


def test_best_with_topn(self):
    """Check get_ndcg for topic A.78 when the run lists 18 documents with
    descending scores and only the top 4 results are considered."""
    descending_scores = {
        '493782': 1.00,
        '493764': 0.95,
        '2008712': 0.90,
        '1282166': 0.85,
        '2008631': 0.80,
        '2008628': 0.75,
        '2008609': 0.70,
        '1116378': 0.65,
        '2008650': 0.60,
        '2008616': 0.55,
        '2008449': 0.50,
        '1282180': 0.45,
        '1282116': 0.40,
        '1282112': 0.35,
        '1116370': 0.30,
        '1116368': 0.25,
        '1282155': 0.20,
        '1282114': 0.15,
    }
    ndcg = get_ndcg({'A.78': descending_scores}, 'task1', 'test', 4)

    # Expected DCG: gain 3.0 at each of the four considered ranks.
    # Kept as an explicit running sum: the final assertion is an exact
    # float comparison, so the order of the additions is preserved.
    expected_dcg = 0.0
    for rank in range(1, 5):
        expected_dcg += 3.0 / log2(rank + 1)

    # Expected ideal DCG: accumulated over all 18 ranks (not only the
    # top 4), with the gain assigned to each band of ranks.
    gain_bands = (
        (3.0, range(1, 5)),
        (2.0, range(5, 9)),
        (1.0, range(9, 17)),
        (0.0, range(17, 19)),
    )
    expected_idcg = 0.0
    for gain, ranks in gain_bands:
        for rank in ranks:
            expected_idcg += gain / log2(rank + 1)

    expected_ndcg = expected_dcg / expected_idcg
    self.assertEqual(expected_ndcg, ndcg)


def test_worst_with_topn(self):
    """Check get_ndcg for topic A.78 when the same 18 documents are listed
    with ascending scores and only the top 4 results are considered."""
    ascending_scores = {
        '493782': 0.15,
        '493764': 0.20,
        '2008712': 0.25,
        '1282166': 0.30,
        '2008631': 0.35,
        '2008628': 0.40,
        '2008609': 0.45,
        '1116378': 0.50,
        '2008650': 0.55,
        '2008616': 0.60,
        '2008449': 0.65,
        '1282180': 0.70,
        '1282116': 0.75,
        '1282112': 0.80,
        '1116370': 0.85,
        '1116368': 0.90,
        '1282155': 0.95,
        '1282114': 1.00,
    }
    ndcg = get_ndcg({'A.78': ascending_scores}, 'task1', 'test', 4)

    # Expected DCG: gain 0.0 at ranks 1-2 and gain 1.0 at ranks 3-4.
    # The zero-gain terms are still added so the arithmetic mirrors the
    # rank-by-rank definition and the float result is unchanged.
    expected_dcg = 0.0
    for gain, ranks in ((0.0, range(1, 3)), (1.0, range(3, 5))):
        for rank in ranks:
            expected_dcg += gain / log2(rank + 1)

    # Expected ideal DCG: accumulated over all 18 ranks, band by band.
    gain_bands = (
        (3.0, range(1, 5)),
        (2.0, range(5, 9)),
        (1.0, range(9, 17)),
        (0.0, range(17, 19)),
    )
    expected_idcg = 0.0
    for gain, ranks in gain_bands:
        for rank in ranks:
            expected_idcg += gain / log2(rank + 1)

    expected_ndcg = expected_dcg / expected_idcg
    self.assertEqual(expected_ndcg, ndcg)
# test/test_get_random_ndcg.py (+20 −0), hunk @@ -30,3 +30,23 @@
# Method added to class TestGetRandomNDCG(unittest.TestCase).


def test_with_topn(self):
    """Check get_random_ndcg for ('task1', 'test') with a top-N cut-off of 4."""
    ndcg = get_random_ndcg('task1', 'test', 4)

    # Mean gain over 18 judgements: four of 3.0, four of 2.0, eight of 1.0
    # and two of 0.0.
    total_gain = 4 * 3.0 + 4 * 2.0 + 8 * 1.0 + 2 * 0.0
    expected_judgement = total_gain / 18

    # Both sums run over ranks 1-4 only. Each accumulator receives its
    # terms in the same rank order as two separate loops would, so the
    # exact float comparison below is unaffected.
    expected_dcg = 0.0
    expected_idcg = 0.0
    for rank in range(1, 5):
        expected_dcg += expected_judgement / log2(rank + 1)
        expected_idcg += 3.0 / log2(rank + 1)

    expected_ndcg = expected_dcg / expected_idcg
    self.assertEqual(expected_ndcg, ndcg)