Commit cee59291 authored by Vít Novotný

Create a python package

parent 7e0a6c1b
Pipeline #57583 failed in 41 seconds
scripts/NTCIR11_Math-qrels-train.dat
scripts/NTCIR11_Math-qrels-test.dat
scripts/NTCIR12_Math-qrels_agg-train.dat
scripts/NTCIR12_Math-qrels_agg-test.dat
scripts/NTCIR12_MathWikiFrm-qrels_agg-train.dat
scripts/NTCIR12_MathWikiFrm-qrels_agg-test.dat
scripts/qrel.V1.0-train.tsv
scripts/qrel.V1.0-test.tsv
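These qrel files are read with pytrec_eval's parse_qrel, which expects the standard TREC relevance-judgement layout: one judgement per line with a topic identifier, an unused iteration column, a document identifier, and a relevance grade, separated by whitespace. A minimal sketch of one such line (the identifiers are made up for illustration, not taken from the actual files):

A.1 0 hypothetical-document-123 2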
# -*- coding:utf-8 -*-
import numpy as np

from .configuration import PARSED_RELEVANCE_JUDGEMENTS


def ndcg(parsed_run, task='task1', subset='test'):
    evaluator = PARSED_RELEVANCE_JUDGEMENTS[subset][task]
    evaluation = evaluator.evaluate(parsed_run)
    ndcg = np.mean([
        measures['ndcg']
        for topic, measures
        in evaluation.items()
    ])
    return ndcg
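For context, a minimal sketch of how this new ndcg helper might be called from user code once the package declared in setup.py below is installed, assuming a run file in the standard TREC run format that pytrec_eval's parse_run reads (the file name is hypothetical):

from pytrec_eval import parse_run

from arqmath_eval.common import ndcg

# Hypothetical run file; any run in the standard TREC run format will do.
with open('my-task1-run.tsv', 'rt') as f:
    parsed_run = parse_run(f)

# Mean nDCG over the evaluated topics of the Task 1 test subset.
print('%.4f' % ndcg(parsed_run, task='task1', subset='test'))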
-RELEVANCE_JUDGEMENTS = {
-    'task1': 'qrel.V1.0-test.tsv',
-    'ntcir-11-math-2-main': 'NTCIR11_Math-qrels-test.dat',
-    'ntcir-12-mathir-arxiv-main': 'NTCIR12_Math-qrels_agg-test.dat',
-    'ntcir-12-mathir-math-wiki-formula': 'NTCIR12_MathWikiFrm-qrels_agg-test.dat',
-}
+import os.path
+from pytrec_eval import parse_qrel, RelevanceEvaluator
 TASK_README_HEAD = r'''
 This table contains the best result for every user.
@@ -18,3 +17,30 @@ underscores (`_`) replaced with a comma and a space for improved readability.
 | nDCG | Result name |
 |------|:------------|
 '''.strip()
+RELEVANCE_JUDGEMENTS = {
+    'train': {
+        'task1': 'qrel.V1.0-train.tsv',
+        'ntcir-11-math-2-main': 'NTCIR11_Math-qrels-train.dat',
+        'ntcir-12-mathir-arxiv-main': 'NTCIR12_Math-qrels_agg-train.dat',
+        'ntcir-12-mathir-math-wiki-formula': 'NTCIR12_MathWikiFrm-qrels_agg-train.dat',
+    },
+    'test': {
+        'task1': 'qrel.V1.0-test.tsv',
+        'ntcir-11-math-2-main': 'NTCIR11_Math-qrels-test.dat',
+        'ntcir-12-mathir-arxiv-main': 'NTCIR12_Math-qrels_agg-test.dat',
+        'ntcir-12-mathir-math-wiki-formula': 'NTCIR12_MathWikiFrm-qrels_agg-test.dat',
+    },
+}
+TASKS = list(RELEVANCE_JUDGEMENTS['test'].keys())
+PARSED_RELEVANCE_JUDGEMENTS = {}
+for subset, filenames in RELEVANCE_JUDGEMENTS.items():
+    PARSED_RELEVANCE_JUDGEMENTS[subset] = {}
+    for task, filename in filenames.items():
+        relevance_judgements_filename = os.path.join(
+            os.path.dirname(__file__),
+            RELEVANCE_JUDGEMENTS[subset][task],
+        )
+        with open(relevance_judgements_filename, 'rt') as f:
+            parsed_relevance_judgements = parse_qrel(f)
+        evaluator = RelevanceEvaluator(parsed_relevance_judgements, {'ndcg'})
+        PARSED_RELEVANCE_JUDGEMENTS[subset][task] = evaluator
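As an illustration of what the parsing loop above produces, a small sketch that consumes TASKS and PARSED_RELEVANCE_JUDGEMENTS directly; the topic and document identifiers in the hand-made run are hypothetical:

from arqmath_eval.configuration import PARSED_RELEVANCE_JUDGEMENTS, TASKS

print(TASKS)  # the task names: 'task1', 'ntcir-11-math-2-main', ...

# A hand-made run: topic identifier -> {document identifier -> retrieval score}.
run = {'A.1': {'hypothetical-document-123': 1.0}}

evaluator = PARSED_RELEVANCE_JUDGEMENTS['test']['task1']
evaluation = evaluator.evaluate(run)
# evaluation maps each evaluated topic to its measures, e.g. {'A.1': {'ndcg': ...}}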
@@ -4,17 +4,14 @@ from glob import glob
 import os.path
 import re
-import numpy as np
-from pytrec_eval import RelevanceEvaluator, parse_qrel, parse_run
+from pytrec_eval import parse_run
-from .configuration import RELEVANCE_JUDGEMENTS, TASK_README_HEAD, USER_README_HEAD
+from .common import ndcg
+from .configuration import TASKS, TASK_README_HEAD, USER_README_HEAD
 if __name__ == '__main__':
-    for task, relevance_judgements in RELEVANCE_JUDGEMENTS.items():
-        with open(os.path.join(task, relevance_judgements), 'rt') as f:
-            parsed_relevance_judgements = parse_qrel(f)
-        evaluator = RelevanceEvaluator(parsed_relevance_judgements, {'ndcg'})
+    for task in TASKS:
         task_results = []
         for user in glob(os.path.join(task, '*', '')):
             user = os.path.normpath(user)
@@ -24,22 +21,16 @@ if __name__ == '__main__':
                 result_name = re.sub('_', ', ', os.path.basename(result)[:-4])
                 with open(result, 'rt') as f:
                     parsed_result = parse_run(f)
-                evaluation = evaluator.evaluate(parsed_result)
-                ndcg = np.mean([
-                    measures['ndcg']
-                    for topic, measures
-                    in evaluation.items()
-                ])
-                user_results.append((ndcg, result_name))
+                user_results.append((ndcg(parsed_result, task), result_name))
             best_ndcg, best_result_name = max(user_results)
             task_results.append((best_ndcg, user_name, best_result_name))
             with open(os.path.join(user, 'README.md'), 'wt') as f:
                 f.write(USER_README_HEAD % user_name)
                 f.write('\n')
-                for ndgc, result_name in sorted(user_results, reverse=True):
-                    f.write('| %.4f | %s |\n' % (ndcg, result_name))
+                for ndcg_score, result_name in sorted(user_results, reverse=True):
+                    f.write('| %.4f | %s |\n' % (ndcg_score, result_name))
         with open(os.path.join(task, 'README.md'), 'wt') as f:
             f.write(TASK_README_HEAD)
             f.write('\n')
-            for ndgc, user_name, result_name in sorted(task_results, reverse=True):
-                f.write('| %.4f | %s | %s |\n' % (ndcg, user_name, result_name))
+            for ndcg_score, user_name, result_name in sorted(task_results, reverse=True):
+                f.write('| %.4f | %s | %s |\n' % (ndcg_score, user_name, result_name))
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from setuptools import setup

setup(
    name='arqmath_eval',
    version='0.0.1',
    description='Evaluation of ARQMath systems',
    packages=['arqmath_eval'],
    package_dir={'arqmath_eval': 'scripts'},
    install_requires=[
        'numpy~=1.18.2',
        'pytrec-eval~=0.4',
    ],
    package_data={
        'arqmath_eval': [
            'NTCIR11_Math-qrels-train.dat',
            'NTCIR11_Math-qrels-test.dat',
            'NTCIR12_Math-qrels_agg-train.dat',
            'NTCIR12_Math-qrels_agg-test.dat',
            'NTCIR12_MathWikiFrm-qrels_agg-train.dat',
            'NTCIR12_MathWikiFrm-qrels_agg-test.dat',
            'qrel.V1.0-train.tsv',
            'qrel.V1.0-test.tsv',
        ],
    },
    include_package_data=True,
)