Commit 9273b3ea authored by Vít Novotný
Browse files

Add ARQMath task 1 and 2 relevance judgements

parent dcc5ddb0
......@@ -5,7 +5,7 @@ from arqmath_eval import get_judged_documents
class TestGetJudgedDocuments(unittest.TestCase):
def test_all_subsets_all_topics(self):
documents = get_judged_documents('task1')
documents = get_judged_documents('task1-example')
expected_documents = {
'48162',
'48164',
......@@ -94,7 +94,7 @@ class TestGetJudgedDocuments(unittest.TestCase):
self.assertEqual(expected_documents, documents)
def test_selected_subsets_all_topics(self):
documents = get_judged_documents('task1', 'train')
documents = get_judged_documents('task1-example', 'train')
expected_documents = {
'70741',
'70739',
......@@ -149,7 +149,7 @@ class TestGetJudgedDocuments(unittest.TestCase):
}
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'validation')
documents = get_judged_documents('task1-example', 'validation')
expected_documents = {
'263828',
'264299',
......@@ -173,7 +173,7 @@ class TestGetJudgedDocuments(unittest.TestCase):
}
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'test')
documents = get_judged_documents('task1-example', 'test')
expected_documents = {
'493764',
'493782',
......@@ -197,7 +197,7 @@ class TestGetJudgedDocuments(unittest.TestCase):
self.assertEqual(expected_documents, documents)
def test_all_subsets_selected_topics(self):
documents = get_judged_documents('task1', topic='A.31')
documents = get_judged_documents('task1-example', topic='A.31')
expected_documents = {
'48162',
'48164',
......@@ -252,7 +252,7 @@ class TestGetJudgedDocuments(unittest.TestCase):
}
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', topic='A.101')
documents = get_judged_documents('task1-example', topic='A.101')
expected_documents = {
'263828',
'264299',
......@@ -276,7 +276,7 @@ class TestGetJudgedDocuments(unittest.TestCase):
}
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', topic='A.78')
documents = get_judged_documents('task1-example', topic='A.78')
expected_documents = {
'493764',
'493782',
......@@ -300,7 +300,7 @@ class TestGetJudgedDocuments(unittest.TestCase):
self.assertEqual(expected_documents, documents)
def test_selected_subsets_selected_topics(self):
documents = get_judged_documents('task1', 'train', 'A.31')
documents = get_judged_documents('task1-example', 'train', 'A.31')
expected_documents = {
'48162',
'48164',
......@@ -355,19 +355,19 @@ class TestGetJudgedDocuments(unittest.TestCase):
}
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'validation', 'A.31')
documents = get_judged_documents('task1-example', 'validation', 'A.31')
expected_documents = set()
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'test', 'A.31')
documents = get_judged_documents('task1-example', 'test', 'A.31')
expected_documents = set()
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'train', 'A.101')
documents = get_judged_documents('task1-example', 'train', 'A.101')
expected_documents = set()
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'validation', 'A.101')
documents = get_judged_documents('task1-example', 'validation', 'A.101')
expected_documents = {
'263828',
'264299',
......@@ -391,19 +391,19 @@ class TestGetJudgedDocuments(unittest.TestCase):
}
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'test', 'A.101')
documents = get_judged_documents('task1-example', 'test', 'A.101')
expected_documents = set()
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'train', 'A.78')
documents = get_judged_documents('task1-example', 'train', 'A.78')
expected_documents = set()
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'validation', 'A.78')
documents = get_judged_documents('task1-example', 'validation', 'A.78')
expected_documents = set()
self.assertEqual(expected_documents, documents)
documents = get_judged_documents('task1', 'test', 'A.78')
documents = get_judged_documents('task1-example', 'test', 'A.78')
expected_documents = {
'493764',
'493782',
......
......@@ -28,7 +28,7 @@ class TestGetNDCG(unittest.TestCase):
'1282114': 0.15,
}
}
ndcg = get_ndcg(parsed_run, 'task1', 'test')
ndcg = get_ndcg(parsed_run, 'task1-example', 'test')
expected_ndcg = 1.0
self.assertEqual(expected_ndcg, ndcg)
......@@ -59,7 +59,7 @@ class TestGetNDCG(unittest.TestCase):
'692232': 0.50,
},
}
ndcg = get_ndcg(parsed_run, 'task1', 'test')
ndcg = get_ndcg(parsed_run, 'task1-example', 'test')
expected_ndcg = 1.0
self.assertEqual(expected_ndcg, ndcg)
......@@ -88,7 +88,7 @@ class TestGetNDCG(unittest.TestCase):
'unjudged_2': 0.05,
},
}
ndcg = get_ndcg(parsed_run, 'task1', 'test')
ndcg = get_ndcg(parsed_run, 'task1-example', 'test')
expected_ndcg = 1.0
self.assertEqual(expected_ndcg, ndcg)
......@@ -121,7 +121,7 @@ class TestGetNDCG(unittest.TestCase):
'692232': 0.50,
},
}
ndcg = get_ndcg(parsed_run, 'task1', 'test')
ndcg = get_ndcg(parsed_run, 'task1-example', 'test')
expected_ndcg = 1.0
self.assertEqual(expected_ndcg, ndcg)
......@@ -148,7 +148,7 @@ class TestGetNDCG(unittest.TestCase):
'1282114': 1.00,
}
}
ndcg = get_ndcg(parsed_run, 'task1', 'test')
ndcg = get_ndcg(parsed_run, 'task1-example', 'test')
expected_dcg = 0.0
for i in range(1, 3):
......@@ -196,7 +196,7 @@ class TestGetNDCG(unittest.TestCase):
'1282114': 0.15,
}
}
ndcg = get_ndcg(parsed_run, 'task1', 'test', 4)
ndcg = get_ndcg(parsed_run, 'task1-example', 'test', 4)
expected_dcg = 0.0
for i in range(1, 5):
......@@ -238,7 +238,7 @@ class TestGetNDCG(unittest.TestCase):
'1282114': 1.00,
}
}
ndcg = get_ndcg(parsed_run, 'task1', 'test', 4)
ndcg = get_ndcg(parsed_run, 'task1-example', 'test', 4)
expected_dcg = 0.0
for i in range(1, 3):
......
......@@ -7,7 +7,7 @@ from arqmath_eval import get_random_ndcg, get_ndcg, get_topics, get_judged_docum
class TestGetRandomNDCG(unittest.TestCase):
def test_using_equation(self):
ndcg = get_random_ndcg('task1', 'test')
ndcg = get_random_ndcg('task1-example', 'test')
expected_judgement = (
4 * 3.0 +
......@@ -48,7 +48,7 @@ class TestGetRandomNDCG(unittest.TestCase):
self.assertAlmostEqual(expected_ndcg, ndcg, places=2)
def test_with_topn(self):
ndcg = get_random_ndcg('task1', 'test', 4)
ndcg = get_random_ndcg('task1-example', 'test', 4)
expected_judgement = (
4 * 3.0 +
......
......@@ -7,13 +7,13 @@ from arqmath_eval import get_random_ndcg, get_random_normalized_ndcg
class TestGetRandomNormalizedNDCG(unittest.TestCase):
def test_hand_picked(self):
parsed_run = {}
random_ndcg = get_random_ndcg('task1', 'test')
random_ndcg = get_random_ndcg('task1-example', 'test')
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1', 'test', ndcg=1.0)
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1-example', 'test', ndcg=1.0)
expected_random_normalized_ndcg = 1.0
self.assertEqual(expected_random_normalized_ndcg, random_normalized_ndcg)
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1', 'test', ndcg=random_ndcg)
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1-example', 'test', ndcg=random_ndcg)
expected_random_normalized_ndcg = 0.0
self.assertEqual(expected_random_normalized_ndcg, random_normalized_ndcg)
......@@ -40,7 +40,7 @@ class TestGetRandomNormalizedNDCG(unittest.TestCase):
'1282114': 0.15,
}
}
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1', 'test')
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1-example', 'test')
expected_random_normalized_ndcg = 1.0
......@@ -69,7 +69,7 @@ class TestGetRandomNormalizedNDCG(unittest.TestCase):
'1282114': 1.00,
}
}
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1', 'test')
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1-example', 'test')
expected_dcg = 0.0
for i in range(1, 3):
......@@ -92,7 +92,7 @@ class TestGetRandomNormalizedNDCG(unittest.TestCase):
expected_idcg += 0.0 / log2(i + 1)
expected_ndcg = expected_dcg / expected_idcg
random_ndcg = get_random_ndcg('task1', 'test')
random_ndcg = get_random_ndcg('task1-example', 'test')
expected_random_normalized_ndcg = (expected_ndcg - random_ndcg) / (1.0 - random_ndcg)
self.assertEqual(expected_random_normalized_ndcg, random_normalized_ndcg)
......@@ -119,7 +119,7 @@ class TestGetRandomNormalizedNDCG(unittest.TestCase):
'1282114': 0.15,
}
}
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1', 'test', 4)
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1-example', 'test', 4)
expected_dcg = 0.0
for i in range(1, 5):
......@@ -136,7 +136,7 @@ class TestGetRandomNormalizedNDCG(unittest.TestCase):
expected_idcg += 0.0 / log2(i + 1)
expected_ndcg = expected_dcg / expected_idcg
random_ndcg = get_random_ndcg('task1', 'test', 4)
random_ndcg = get_random_ndcg('task1-example', 'test', 4)
expected_random_normalized_ndcg = (expected_ndcg - random_ndcg) / (1.0 - random_ndcg)
self.assertEqual(expected_random_normalized_ndcg, random_normalized_ndcg)
......@@ -163,7 +163,7 @@ class TestGetRandomNormalizedNDCG(unittest.TestCase):
'1282114': 1.00,
}
}
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1', 'test', 4)
random_normalized_ndcg = get_random_normalized_ndcg(parsed_run, 'task1-example', 'test', 4)
expected_dcg = 0.0
for i in range(1, 3):
......@@ -182,6 +182,6 @@ class TestGetRandomNormalizedNDCG(unittest.TestCase):
expected_idcg += 0.0 / log2(i + 1)
expected_ndcg = expected_dcg / expected_idcg
random_ndcg = get_random_ndcg('task1', 'test', 4)
random_ndcg = get_random_ndcg('task1-example', 'test', 4)
expected_random_normalized_ndcg = (expected_ndcg - random_ndcg) / (1.0 - random_ndcg)
self.assertEqual(expected_random_normalized_ndcg, random_normalized_ndcg)
......@@ -6,29 +6,29 @@ from arqmath_eval.configuration import TASKS
class TestGetTopics(unittest.TestCase):
def test_all_subsets(self):
topics = get_topics('task1')
topics = get_topics('task1-example')
expected_topics = {'A.31', 'A.101', 'A.78'}
self.assertEqual(expected_topics, topics)
def test_selected_subsets(self):
topics = get_topics('task1', 'train')
topics = get_topics('task1-example', 'train')
expected_topics = {'A.31'}
self.assertEqual(expected_topics, topics)
topics = get_topics('task1', 'validation')
topics = get_topics('task1-example', 'validation')
expected_topics = {'A.101'}
self.assertEqual(expected_topics, topics)
topics = get_topics('task1', 'test')
topics = get_topics('task1-example', 'test')
expected_topics = {'A.78'}
self.assertEqual(expected_topics, topics)
def test_train_validation_test_split(self):
for task in TASKS:
train_topics = get_topics('task1', 'train')
validation_topics = get_topics('task1', 'validation')
test_topics = get_topics('task1', 'test')
all_topics = get_topics('task1')
train_topics = get_topics('task1-example', 'train')
validation_topics = get_topics('task1-example', 'validation')
test_topics = get_topics('task1-example', 'test')
all_topics = get_topics('task1-example')
self.assertEqual(len(validation_topics), len(test_topics))
train_ratio = len(train_topics) / len(all_topics)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment