Michal Štefánik / ARQMath-eval

Commit c3aab4ef authored Aug 27, 2020 by Vít Novotný

Make get_ndcg produce confidence intervals

parent f777489d
Pipeline #62984 failed with stage
Changes 4 · Pipelines 2
README.md

@@ -36,7 +36,7 @@ trained using subsets of the `task1` and `task2` tasks.
 #### Using the `train` subset to train your supervised system
 ```sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
 $ python
 >>> from arqmath_eval import get_topics, get_judged_documents, get_ndcg
 >>>
@@ -58,14 +58,14 @@ Here is the documentation of the available evaluation functions:
 - [`get_topics(task, subset=None)`][get_topics],
 - [`get_judged_documents(task, subset=None, topic=None)`][get_judged_documents],
 - [`get_random_ndcg(task, subset, topn)`][get_random_ndcg],
-- [`get_ndcg(parsed_run, task, subset, topn)`][get_ndcg], and
+- [`get_ndcg(parsed_run, task, subset, topn, confidence)`][get_ndcg], and
 - [`get_random_normalized_ndcg(parsed_run, task, subset, topn, ndcg)`][get_random_normalized_ndcg].
 - [`get_judgement(task, subset, topic, judged_document)`][get_judgement].
 #### Using the `validation` subset to compare various parameters of your system
 ```sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
 $ python
 >>> from arqmath_eval import get_topics, get_judged_documents
 >>>
@@ -96,19 +96,19 @@ $ git push # publish your new result and the upd
 #### Using the `all` subset to compute the NDCG' score of an ARQMath submission
 ```sh
-$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.17
+$ pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
 $ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
-0.238
+0.238, 95% CI: [0.198; 0.278]
 ```
 [arqmath-task1]: https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html (Task 1: Find Answers)
 [arqmath-task2]: https://www.cs.rit.edu/~dprl/ARQMath/task2-formulas.html (Task 2: Formula Search)
-[get_topics]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L34
-[get_judged_documents]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L61
-[get_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L94
-[get_random_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L129
-[get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L174
-[get_judgement]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L213
+[get_topics]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L35
+[get_judged_documents]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L62
+[get_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L95
+[get_random_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L140
+[get_random_normalized_ndcg]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L185
+[get_judgement]: https://github.com/MIR-MU/ARQMath-eval/blob/master/scripts/common.py#L224
 [ntcir-11-math-2]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.686.444&rep=rep1&type=pdf (NTCIR-11 Math-2 Task Overview)
 [ntcir-12-mathir]: https://www.cs.rit.edu/~rlaz/files/ntcir12-mathir.pdf (NTCIR-12 MathIR Task Overview)
 [treceval-format]: https://stackoverflow.com/a/8175382/657401 (How to evaluate a search/retrieval engine using trec_eval?)
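The README diff above documents the new `confidence` parameter of `get_ndcg`. A minimal sketch of calling it from Python, assuming version 0.0.18 is installed; the constant-score run built here is a toy stand-in for a real system:

```python
# Toy sketch of the confidence-interval API from this commit, assuming
# ARQMath-eval 0.0.18 is installed; the constant-score run is illustrative only.
from arqmath_eval import get_topics, get_judged_documents, get_ndcg

parsed_run = {}
for topic in get_topics(task='task1', subset='train'):
    judged = get_judged_documents(task='task1', subset='train', topic=topic)
    # Rank every judged document with the same score (a deliberately naive run).
    parsed_run[topic] = {document: 1.0 for document in judged}

# Without confidence, get_ndcg returns a bare float, as before this commit.
ndcg = get_ndcg(parsed_run, 'task1', 'train')

# With confidence, it returns a (ndcg, (lower, upper)) tuple instead.
ndcg, (lower, upper) = get_ndcg(parsed_run, 'task1', 'train', confidence=95.0)
print('%.3f, 95%% CI: [%.3f; %.3f]' % (ndcg, lower, upper))
```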
scripts/common.py

@@ -5,6 +5,7 @@ from itertools import chain
 from math import log2
 
 import numpy as np
+import scipy.stats as st
 
 from .configuration import EVALUATORS, PARSED_RELEVANCE_JUDGEMENTS
@@ -91,7 +92,7 @@ def get_judged_documents(task, subset=None, topic=None):
     return judged_documents
 
 
-def get_ndcg(parsed_run, task, subset, topn=1000):
+def get_ndcg(parsed_run, task, subset, topn=1000, confidence=None):
     """Returns the NDCG' of a system's run on a subset of a task.
 
     NDCG' is the same as NDCG (Normalized Discounted Cumulative Gain), but all
@@ -109,11 +110,16 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
     topn : int, optional
         The top N results, which will be considered in computing the NDCG.
         Default is 1000.
+    confidence : float or None, optional
+        The confidence level used to construct a confidence interval.
+        If None, then no confidence interval is constructed. Default is None.
 
     Returns
     -------
     ndcg : float
         The NDCG' of the system's run on the subset of the task.
+    interval : (float, float), optional
+        The confidence interval for the NDCG'. Only produced when confidence is not None.
     """
     evaluator = EVALUATORS[subset][task]
@@ -122,7 +128,12 @@ def get_ndcg(parsed_run, task, subset, topn=1000):
     if not parsed_run:
         return 0.0
     evaluation = evaluator.evaluate(parsed_run)
-    ndcg = np.mean([measures['ndcg'] for topic, measures in evaluation.items()])
-    return ndcg
+    sample = [measures['ndcg'] for topic, measures in evaluation.items()]
+    ndcg = np.mean(sample)
+    if confidence is not None:
+        interval = st.t.interval(confidence / 100.0, len(sample) - 1, loc=ndcg, scale=st.sem(sample))
+        return (ndcg, interval)
+    else:
+        return ndcg
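The new branch in `get_ndcg` builds a Student-t confidence interval over the per-topic NDCG' scores: the mean is the point estimate, `st.sem` gives the standard error, and `len(sample) - 1` the degrees of freedom. A self-contained sketch of the same computation, with a made-up sample:

```python
# Self-contained sketch of the interval computation added above;
# the per-topic NDCG' scores here are made up for illustration.
import numpy as np
import scipy.stats as st

sample = [0.21, 0.25, 0.19, 0.30, 0.24]  # hypothetical per-topic NDCG' scores
confidence = 95.0  # get_ndcg takes the level as a percentage, hence / 100.0

ndcg = np.mean(sample)
interval = st.t.interval(confidence / 100.0,    # confidence level as a fraction
                         len(sample) - 1,       # degrees of freedom
                         loc=ndcg,              # centre of the interval
                         scale=st.sem(sample))  # standard error of the mean
print('%.3f, %g%% CI: [%.3f; %.3f]' % (ndcg, confidence, *interval))
```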
scripts/evaluate.py

@@ -63,7 +63,7 @@ def produce_leaderboards():
             f_readme.write('| %.4f | %s | %s |\n' % (ndcg, result_name, user_name))
 
 
-def evaluate_run(filename, subset):
+def evaluate_run(filename, subset, confidence=95.0):
     with open(filename, 'rt') as f:
         lines = [line.strip().split() for line in f]
         first_line = lines[0]
@@ -88,8 +88,8 @@ def evaluate_run(filename, subset):
         if topic_id not in parsed_result:
             parsed_result[topic_id] = dict()
         parsed_result[topic_id][result_id] = 1.0 / (int(rank) + rank_offset)
-    ndcg = get_ndcg(parsed_result, task, subset)
-    print('%.3f' % ndcg)
+    ndcg, interval = get_ndcg(parsed_result, task, subset, confidence=confidence)
+    print('%.3f, %g%% CI: [%.3f; %.3f]' % (ndcg, confidence, *interval))
 
 
 if __name__ == '__main__':
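Note that `evaluate_run` now always calls `get_ndcg` with a non-None `confidence` (95.0 by default), so the `(ndcg, interval)` tuple unpacking always succeeds; forwarding `confidence=None` would make `get_ndcg` return a bare float and break the unpacking. A sketch of driving it directly, assuming the import path implied by the package layout and a run file in the trec_eval format referenced by the README:

```python
# Sketch of calling the updated evaluate_run directly; the import path and
# run file name are assumptions based on the README and package layout.
from arqmath_eval.evaluate import evaluate_run

evaluate_run('MIRMU-task1-Ensemble-auto-both-A.tsv', 'all')
# prints, e.g.: 0.238, 95% CI: [0.198; 0.278]

# The equivalent command-line invocation from the README:
# $ python -m arqmath_eval.evaluate MIRMU-task1-Ensemble-auto-both-A.tsv all
```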
setup.py

@@ -5,13 +5,14 @@ from setuptools import setup
 setup(
     name='arqmath_eval',
-    version='0.0.17',
+    version='0.0.18',
     description='Evaluation of ARQMath systems',
     packages=['arqmath_eval'],
     package_dir={'arqmath_eval': 'scripts'},
     install_requires=[
         'numpy~=1.18.2',
         'pytrec-eval~=0.4',
+        'scipy~=1.5.2',
         'tqdm~=4.46.0',
     ],
     package_data={