Commit 3c14793e authored by stefanik12
Browse files
parents f492afac b844217e
Pipeline #61775 canceled with stage
This table contains the best result for every user on the *ntcir-11-math-2-main* task.
| nDCG | Result name | User |
|:-----|:------------|------|
| *0.6894* | *random* | *xrando42* |
This table contains the best result for every user on the *ntcir-12-mathir-arxiv-main* task.
| nDCG | Result name | User |
|:-----|:------------|------|
| *0.6471* | *random* | *xrando42* |
This table contains the best result for every user on the *ntcir-12-mathir-math-wiki-formula* task.
| nDCG | Result name | User |
|:-----|:------------|------|
| *0.7336* | *random* | *xrando42* |
......@@ -4,10 +4,10 @@ from pytrec_eval import parse_qrel, RelevanceEvaluator
TASK_README_HEAD = r'''
This table contains the best result for every user.
This table contains the best result for every user on the *%s* task.
| nDCG | User | Result name |
|:-----|------|:------------|
| nDCG | Result name | User |
|:-----|:------------|------|
'''.strip()
USER_README_HEAD = r'''
This table contains all results for user *%s* in descending order of task
......@@ -42,8 +42,12 @@ RELEVANCE_JUDGEMENTS = {
'ntcir-12-mathir-arxiv-main': 'NTCIR12_Math-qrels_agg-test.dat',
'ntcir-12-mathir-math-wiki-formula': 'NTCIR12_MathWikiFrm-qrels_agg-test.dat',
},
'all': {
'task1-votes.V1.2': 'votes-qrels.V1.2.tsv',
'task2-topics-formula_ids.V.1.1': 'topics-formula_ids-qrels.V1.1.tsv',
}
}
TASKS = list(RELEVANCE_JUDGEMENTS['test'].keys())
TASKS = list(RELEVANCE_JUDGEMENTS['validation'].keys())
PARSED_RELEVANCE_JUDGEMENTS = {}
EVALUATORS = {}
for subset, filenames in RELEVANCE_JUDGEMENTS.items():
......
......@@ -9,7 +9,7 @@ from pytrec_eval import parse_run
from tqdm import tqdm
from .common import get_ndcg, get_random_ndcg
from .configuration import TASKS, USER_README_HEAD
from .configuration import TASKS, USER_README_HEAD, TASK_README_HEAD
def evaluate_worker(result_filename):
......@@ -24,6 +24,7 @@ if __name__ == '__main__':
for task in TASKS:
random_ndcg = get_random_ndcg(task, 'validation')
users = glob(os.path.join(task, '*', ''))
task_results = [(random_ndcg, 'random', 'xrando42')]
for user in users:
user = os.path.normpath(user)
user_name = os.path.basename(user)
......@@ -37,6 +38,7 @@ if __name__ == '__main__':
with open(os.path.join(user, 'README.md'), 'wt') as f_readme:
f_readme.write(USER_README_HEAD % user_name)
f_readme.write('\n')
task_results.append((*max(user_results), user_name))
for ndcg, result_name in sorted(user_results, reverse=True):
if result_name == 'random':
f_readme.write('| *%.4f* | *%s* |\n' % (ndcg, result_name))
......@@ -48,4 +50,11 @@ if __name__ == '__main__':
f_readme.write(f_legend.read())
except IOError:
pass
with open(os.path.join(task, 'README.md'), 'wt') as f_readme:
f_readme.write(TASK_README_HEAD % task)
f_readme.write('\n')
for ndcg, result_name, user_name in sorted(task_results, reverse=True):
if result_name == 'random':
f_readme.write('| *%.4f* | *%s* | *%s* |\n' % (ndcg, result_name, user_name))
else:
f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name))
q_4 xxx B.1 0
q_55 xxx B.10 0
q_71 xxx B.11 0
q_73 xxx B.12 0
q_76 xxx B.13 0
q_77 xxx B.14 0
q_82 xxx B.15 0
q_92 xxx B.16 0
q_97 xxx B.17 0
q_111 xxx B.18 0
q_9 xxx B.2 0
q_130 xxx B.20 0
q_136 xxx B.21 0
q_172 xxx B.24 0
q_199 xxx B.25 0
q_205 xxx B.26 0
q_207 xxx B.27 0
q_213 xxx B.28 0
q_238 xxx B.29 0
q_13 xxx B.3 0
q_245 xxx B.30 0
q_262 xxx B.32 0
q_267 xxx B.33 0
q_278 xxx B.34 0
q_290 xxx B.35 0
q_294 xxx B.36 0
q_305 xxx B.37 0
q_319 xxx B.38 0
q_22 xxx B.4 0
q_329 xxx B.40 0
q_336 xxx B.41 0
q_340 xxx B.43 0
q_351 xxx B.44 0
q_362 xxx B.45 0
q_370 xxx B.46 0
q_402 xxx B.47 0
q_427 xxx B.48 0
q_459 xxx B.50 0
q_472 xxx B.51 0
q_479 xxx B.52 0
q_501 xxx B.53 0
q_503 xxx B.54 0
q_504 xxx B.55 0
q_510 xxx B.56 0
q_514 xxx B.57 0
q_525 xxx B.58 0
q_528 xxx B.59 0
q_25 xxx B.6 0
q_535 xxx B.60 0
q_561 xxx B.62 0
q_569 xxx B.63 0
q_574 xxx B.64 0
q_604 xxx B.65 0
q_612 xxx B.66 0
q_616 xxx B.67 0
q_620 xxx B.68 0
q_637 xxx B.69 0
q_643 xxx B.70 0
q_646 xxx B.71 0
q_663 xxx B.73 0
q_690 xxx B.74 0
q_698 xxx B.75 0
q_707 xxx B.76 0
q_727 xxx B.77 0
q_739 xxx B.79 0
q_49 xxx B.8 0
q_743 xxx B.80 0
q_754 xxx B.81 0
q_807 xxx B.82 0
q_819 xxx B.83 0
q_825 xxx B.84 0
q_840 xxx B.85 0
q_849 xxx B.86 0
q_854 xxx B.87 0
q_883 xxx B.88 0
q_884 xxx B.89 0
q_52 xxx B.9 0
q_894 xxx B.90 0
q_931 xxx B.92 0
q_940 xxx B.93 0
q_944 xxx B.94 0
q_954 xxx B.95 0
q_977 xxx B.96 0
q_997 xxx B.97 0
q_1010 xxx B.98 0
This diff is collapsed.
......@@ -5,7 +5,7 @@ from setuptools import setup
setup(
name='arqmath_eval',
version='0.0.8',
version='0.0.12',
description='Evaluation of ARQMath systems',
packages=['arqmath_eval'],
package_dir={'arqmath_eval': 'scripts'},
......@@ -32,6 +32,8 @@ setup(
'votes-qrels-small-validation.V1.0.tsv',
'votes-qrels-validation.V1.0.tsv',
'votes-qrels-test.V1.0.tsv',
'votes-qrels.V1.2.tsv',
'topics-formula_ids-qrels.V1.1.tsv',
],
},
include_package_data=True,
......
This table contains the best result for every user on the *task1-votes* task.
| nDCG | Result name | User |
|:-----|:------------|------|
| 0.7796 | sbert, validation, html-removal, exid9 | xstefan3 |
| 0.7614 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=800, symmetric=True, exponent=4.0, threshold=-1.0 | xnovot32 |
| 0.7604 | prefix, phrases=2, alpha=0.1, dm=0, dm-concat=1, epochs=5, hs=0, min-alpha=0, min-count=5, negative=12, vector-size=300, window=8 | ayetiran |
| *0.7578* | *random* | *xrando42* |
The [Formula2Vec system][scm-at-arqmath] recognizes the following parameters:
- Dataset:
- arxmliv, 08, 2019, no-problem – the no\_problem subset (150,701 documents) of [the arXMLiv 08.2019 dataset][arxmliv-08-2019]
- phrases – how many times [collocation detection][] and bigram merging are iteratively applied to the corpus:
- 0 – the text and math tokens in the corpus are unchanged,
- N – [collocation detection][] and bigram merging are iteratively applied to both text and math tokens in the corpus N times
......@@ -21,10 +20,8 @@ The [Formula2Vec system][scm-at-arqmath] recogizes the following parameters:
- min-count – the minimum term frequency
- vector-size – vector dimensions
- window – window size
- workers – the number of threads used for [hogwild][]
- epochs – the number of epochs
[arxmliv-08-2019]: https://sigmathling.kwarc.info/resources/arxmliv-dataset-082019/
[collocation detection]: https://radimrehurek.com/gensim/models/phrases.html
[hogwild]: https://papers.nips.cc/paper/4390-hogwild-a-lock-free-approach-to-parallelizing-stochastic-gradient-descent
[scm-at-arqmath]: https://gitlab.fi.muni.cz/xnovot32/scm-at-arqmath (Soft Cosine Measure at ARQMath)
......@@ -4,7 +4,8 @@ underscores (`_`) replaced with a comma and a space for improved readability.
| nDCG | Result name |
|------|:------------|
| 0.7580 | prefix, phrases=2, alpha=0.05, dm=1, dm-concat=1, epochs=5, hs=1, min-alpha=0, min-count=5, vector-size=400, window=4, workers=64 |
| 0.7604 | prefix, phrases=2, alpha=0.1, dm=0, dm-concat=1, epochs=5, hs=0, min-alpha=0, min-count=5, negative=12, vector-size=300, window=8 |
| 0.7579 | prefix, phrases=2, alpha=0.05, dm=1, dm-concat=1, epochs=5, hs=1, min-alpha=0, min-count=5, vector-size=400, window=4 |
| *0.7578* | *random* |
## Legend
......@@ -12,7 +13,6 @@ underscores (`_`) replaced with a comma and a space for improved readability.
The [Formula2Vec system][scm-at-arqmath] recognizes the following parameters:
- Dataset:
- arxmliv, 08, 2019, no-problem – the no\_problem subset (150,701 documents) of [the arXMLiv 08.2019 dataset][arxmliv-08-2019]
- phrases – how many times [collocation detection][] and bigram merging are iteratively applied to the corpus:
- 0 – the text and math tokens in the corpus are unchanged,
- N – [collocation detection][] and bigram merging are iteratively applied to both text and math tokens in the corpus N times
......@@ -32,10 +32,8 @@ The [Formula2Vec system][scm-at-arqmath] recogizes the following parameters:
- min-count – the minimum term frequency
- vector-size – vector dimensions
- window – window size
- workers – the number of threads used for [hogwild][]
- epochs – the number of epochs
[arxmliv-08-2019]: https://sigmathling.kwarc.info/resources/arxmliv-dataset-082019/
[collocation detection]: https://radimrehurek.com/gensim/models/phrases.html
[hogwild]: https://papers.nips.cc/paper/4390-hogwild-a-lock-free-approach-to-parallelizing-stochastic-gradient-descent
[scm-at-arqmath]: https://gitlab.fi.muni.cz/xnovot32/scm-at-arqmath (Soft Cosine Measure at ARQMath)
The [SCM system][scm-at-arqmath] recognizes the following parameters:
- Dataset:
- arxmliv, 08, 2019, no-problem – the no\_problem subset (150,701 documents) of [the arXMLiv 08.2019 dataset][arxmliv-08-2019]
- phrases – how many times [collocation detection][] and bigram merging are iteratively applied to the corpus:
- 0 – the text and math tokens in the corpus are unchanged,
- N – [collocation detection][] and bigram merging are iteratively applied to both text and math tokens in the corpus N times
......@@ -24,7 +23,6 @@ The [SCM system][scm-at-arqmath] recogizes the following parameters:
- sg – the skipgram model
- size – vector dimensions
- window – window size
- workers – the number of threads used for [hogwild][]
- Soft Cosine Measure:
- dominant – whether the term similarity matrix will be strongly diagonally dominant
- nonzero-limit – the maximum number of non-zero elements outside the diagonal in a single column of the term similarity matrix
......@@ -34,6 +32,5 @@ The [SCM system][scm-at-arqmath] recogizes the following parameters:
[arxmliv-08-2019]: https://sigmathling.kwarc.info/resources/arxmliv-dataset-082019/
[collocation detection]: https://radimrehurek.com/gensim/models/phrases.html
[hogwild]: https://papers.nips.cc/paper/4390-hogwild-a-lock-free-approach-to-parallelizing-stochastic-gradient-descent
[scm-at-arqmath]: https://gitlab.fi.muni.cz/xnovot32/scm-at-arqmath (Soft Cosine Measure at ARQMath)
[term similarity matrix formula]: https://arxiv.org/pdf/2003.05019.pdf#page=4
......@@ -4,36 +4,43 @@ underscores (`_`) replaced with a comma and a space for improved readability.
| nDCG | Result name |
|------|:------------|
| 0.7614 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=800, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7614 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=200, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7614 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=50, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7614 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=6, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=5, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=200, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=1600, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=1, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=800, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=0, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=400, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | infix, phrases=0, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=50, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=0, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=10, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=1600, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=2, alpha=0.05, bucket=1000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=400, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=3, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=2, alpha=0.05, bucket=8000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7611 | prefix, phrases=2, alpha=0.05, bucket=4000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7610 | prefix, phrases=4, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7607 | slt, phrases=0, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7606 | opt, phrases=0, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7602 | latex, phrases=0, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7600 | nomath, phrases=0, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7598 | nomath, phrases=1, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7596 | nomath, phrases=2, alpha=0.05, bucket=2000000, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, workers=64, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7614 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=800, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7614 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=200, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7614 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=50, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7614 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=6, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=5, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=200, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=1600, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=1, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=800, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=0, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=400, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=False, nonzero-limit=50, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7613 | infix, phrases=0, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=False, nonzero-limit=400, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7613 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=50, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=0, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=10, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=1600, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=2, alpha=0.05, bucket=1M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=400, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=3, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=False, nonzero-limit=50, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7612 | prefix, phrases=2, alpha=0.05, bucket=8M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7611 | prefix, phrases=2, alpha=0.05, bucket=4M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7611 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=False, nonzero-limit=200, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7610 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=False, nonzero-limit=200, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7610 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=False, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7610 | prefix, phrases=4, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7610 | prefix, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=False, nonzero-limit=100, symmetric=False, exponent=4.0, threshold=-1.0 |
| 0.7607 | slt, phrases=0, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7606 | opt, phrases=0, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7602 | latex, phrases=0, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7600 | nomath, phrases=0, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7598 | nomath, phrases=1, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| 0.7596 | nomath, phrases=2, alpha=0.05, bucket=2M, iter=5, max-n=6, min-alpha=0, min-count=5, min-n=3, negative=5, sample=0.0001, sg=1, size=300, window=5, dominant=True, nonzero-limit=100, symmetric=True, exponent=4.0, threshold=-1.0 |
| *0.7578* | *random* |
## Legend
......@@ -41,7 +48,6 @@ underscores (`_`) replaced with a comma and a space for improved readability.
The [SCM system][scm-at-arqmath] recognizes the following parameters:
- Dataset:
- arxmliv, 08, 2019, no-problem – the no\_problem subset (150,701 documents) of [the arXMLiv 08.2019 dataset][arxmliv-08-2019]
- phrases – how many times [collocation detection][] and bigram merging are iteratively applied to the corpus:
- 0 – the text and math tokens in the corpus are unchanged,
- N – [collocation detection][] and bigram merging are iteratively applied to both text and math tokens in the corpus N times
......@@ -64,7 +70,6 @@ The [SCM system][scm-at-arqmath] recogizes the following parameters:
- sg – the skipgram model
- size – vector dimensions
- window – window size
- workers – the number of threads used for [hogwild][]
- Soft Cosine Measure:
- dominant – whether the term similarity matrix will be strongly diagonally dominant
- nonzero-limit – the maximum number of non-zero elements outside the diagonal in a single column of the term similarity matrix
......@@ -74,6 +79,5 @@ The [SCM system][scm-at-arqmath] recogizes the following parameters:
[arxmliv-08-2019]: https://sigmathling.kwarc.info/resources/arxmliv-dataset-082019/
[collocation detection]: https://radimrehurek.com/gensim/models/phrases.html
[hogwild]: https://papers.nips.cc/paper/4390-hogwild-a-lock-free-approach-to-parallelizing-stochastic-gradient-descent
[scm-at-arqmath]: https://gitlab.fi.muni.cz/xnovot32/scm-at-arqmath (Soft Cosine Measure at ARQMath)
[term similarity matrix formula]: https://arxiv.org/pdf/2003.05019.pdf#page=4
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment