Commit 5b796f63 authored by Vít Novotný's avatar Vít Novotný
Browse files

Parallelize scripts.evaluate and support LEGEND.md

parent 506a6b40
Pipeline #60497 failed with stage
......@@ -30,7 +30,7 @@ Each task comes with three *subsets*:
#### Using the `train` set to train your supervised system
``` sh
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.7
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.8
$ python
>>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
>>>
......@@ -58,7 +58,7 @@ Here is the documentation of the available evaluation functions:
#### Using the `validation` set to compare various parameters of your system
``` sh
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.7
$ pip install --force-reinstall git+https://gitlab.fi.muni.cz/xstefan3/arqmath-eval@0.0.8
$ python
>>> from arqmath_eval import get_topics, get_judged_documents
>>>
......
# -*- coding:utf-8 -*-
from glob import glob
from multiprocessing import Pool
import os.path
import re
......@@ -11,6 +12,14 @@ from .common import get_ndcg, get_random_ndcg
from .configuration import TASKS, USER_README_HEAD
def evaluate_worker(result_filename):
    """Evaluate a single result file and return ``(result_name, ndcg)``.

    The result name is the file's base name with its last four characters
    removed and every underscore replaced by a comma and a space.  The file
    is parsed with ``parse_run`` and scored with ``get_ndcg`` on the
    ``'validation'`` subset.

    The task is not a parameter: it is read from the module-level name
    ``task``, which the ``for task in TASKS`` loop in the ``__main__``
    section binds.
    NOTE(review): presumably this relies on pool workers inheriting that
    global from the parent process -- confirm for non-fork start methods.
    """
    base_name = os.path.basename(result_filename)
    # Drop the four-character extension (the caller globs for '*.tsv').
    stem = base_name[:-4]
    result_name = re.sub('_', ', ', stem)
    with open(result_filename, 'rt') as run_file:
        run = parse_run(run_file)
    return (result_name, get_ndcg(run, task, 'validation'))
if __name__ == '__main__':
for task in TASKS:
random_ndcg = get_random_ndcg(task, 'validation')
......@@ -21,17 +30,22 @@ if __name__ == '__main__':
user_results = [(random_ndcg, 'random')]
results = glob(os.path.join(user, '*.tsv'))
if results:
for result in tqdm(results, desc='Evaluating {} systems'.format(user)):
result_name = re.sub('_', ', ', os.path.basename(result)[:-4])
with open(result, 'rt') as f:
parsed_result = parse_run(f)
ndcg = get_ndcg(parsed_result, task, 'validation')
user_results.append((ndcg, result_name))
with open(os.path.join(user, 'README.md'), 'wt') as f:
f.write(USER_README_HEAD % user_name)
f.write('\n')
for ndcg, result_name in sorted(user_results, reverse=True):
if result_name == 'random':
f.write('| *%.4f* | *%s* |\n' % (ndcg, result_name))
else:
f.write('| %.4f | %s |\n' % (ndcg, result_name))
results = tqdm(results, desc='Evaluating {} results'.format(user))
with Pool(None) as pool:
for result_name, ndcg in pool.map(evaluate_worker, results):
user_results.append((ndcg, result_name))
with open(os.path.join(user, 'README.md'), 'wt') as f_readme:
f_readme.write(USER_README_HEAD % user_name)
f_readme.write('\n')
for ndcg, result_name in sorted(user_results, reverse=True):
if result_name == 'random':
f_readme.write('| *%.4f* | *%s* |\n' % (ndcg, result_name))
else:
f_readme.write('| %.4f | %s |\n' % (ndcg, result_name))
try:
with open(os.path.join(user, 'LEGEND.md'), 'rt') as f_legend:
f_readme.write('\n## Legend\n\n')
f_readme.write(f_legend.read())
except IOError:
pass
......@@ -5,7 +5,7 @@ from setuptools import setup
setup(
name='arqmath_eval',
version='0.0.7',
version='0.0.8',
description='Evaluation of ARQMath systems',
packages=['arqmath_eval'],
package_dir={'arqmath_eval': 'scripts'},
......
......@@ -11,3 +11,40 @@ underscores (`_`) replaced with a comma and a space for improved readability.
| 0.7602 | arxmliv, latex, 08, 2019, no-problem, phrases=0, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7600 | arxmliv, nomath, 08, 2019, no-problem, phrases=0, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| *0.7578* | *random* |
## Legend
The system recognizes the following parameters:
- Dataset:
- arxmliv, 08, 2019, no-problem – The `no_problem` subset (150,701 documents) of [the arXMLiv 08.2019 dataset][arxmliv-08-2019]
- phrases – Whether phrases are modeled in the corpus
- Math representation:
- opt – paths in operator tree
- slt – paths in syntax layout tree
- infix – nodes in operator tree in infix notation
- prefix – nodes in operator tree in prefix notation
- latex – untokenized LaTeX formulae
- nomath – no math formulae
- FastText:
- alpha – initial learning rate
- bucket – hash table bucket size
- iter – the number of epochs
- min-alpha – minimum learning rate
- min-n, max-n – the range of modeled subword sizes
- min-count – minimum term frequency
- negative – the number of negative samples
- sample – sampling threshold
- sg – whether the skip-gram model is used (otherwise CBOW)
- size – vector dimensions
- window – window size
- workers – the number of threads used in HogWild
- Soft Cosine Measure:
- dominant – whether the term similarity matrix will be strongly diagonally dominant
- nonzero-limit – the maximum number of non-zero elements outside the diagonal in a single column of the term similarity matrix
- symmetric – whether the term similarity matrix will be symmetric
- exponent – parameter `o` in the [term similarity matrix formula][]
- threshold – parameter `t` in the [term similarity matrix formula][]
[arxmliv-08-2019]: https://sigmathling.kwarc.info/resources/arxmliv-dataset-082019/
[term similarity matrix formula]: https://arxiv.org/pdf/2003.05019.pdf#page=4
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment