Commit 56637c7b authored by Vít Novotný's avatar Vít Novotný
Browse files

Add NTCIR-11 Math-2 Main and NTCIR-12 MathIR ArXiv Main topics

parent 44be0748
Loading
Loading
Loading
Loading
+10 −6
Original line number Diff line number Diff line
@@ -4,8 +4,10 @@ This repository contains scripts for producing preprocessed [ARQMath
competition][ARQMath] datasets:

- `output_data/ARQMath_CLEF2020/Formulas/formula_*.V0.2.{tsv,failures}` – the training set of formulae for the [ARQMath competition][ARQMath],
- `output_data/ARQMath_CLEF2020/Task1/Formula_topics_*_V1.2.{tsv,failures}` – the test set of formulae for the [ARQMath competition][ARQMath], and
- `output_data/arxiv-dataset-arXMLiv-08-2019/arxmliv_*_08_2019_*.json.gz.{json.gz,failures}` – tokenized documents and paragraphs from the [arXMLiv 08.2019 dataset][arXMLiv].
- `output_data/ARQMath_CLEF2020/Task1/Formula_topics_*_V1.2.{tsv,failures}` – the test set of formulae for the [ARQMath competition][ARQMath],
- `output_data/arxiv-dataset-arXMLiv-08-2019/arxmliv_*_08_2019_*.json.gz.{json.gz,failures}` – tokenized documents and paragraphs from the [arXMLiv 08.2019 dataset][arXMLiv],
- `output_data/ntcir/NTCIR11-Math/NTCIR11-Math2-queries-*-participants.{json,failures}` – tokenized topics from the [NTCIR-11 Math-2 Task Main Subtask][ntcir-11-math-2], and
- `output_data/ntcir/NTCIR12-Math/NTCIR12-Math-queries-*-participants.{json,failures}` – tokenized topics from the [NTCIR-12 MathIR Task ArXiv Main Subtask][ntcir-12-mathir].

## Downloading the preprocessed datasets

@@ -31,5 +33,7 @@ To produce the preprocessed datasets yourself,

 [arqmath]:         https://www.cs.rit.edu/~dprl/ARQMath/ (Answer Retrieval for Questions on Math)
 [arxmliv]:         https://sigmathling.kwarc.info/resources/arxmliv-dataset-082019/ (arXMLiv 08.2019 – An HTML5 dataset for arXiv.org)
 [mathmlcan]: https://github.com/MIR-MU/MathMLCan (MathMLCan – Canonicalization of different MathML encodings of equivalent formulae)
 [latexml]:         https://dlmf.nist.gov/LaTeXML/ (LaTeXML: A LaTeX to XML/HTML/MathML Converter)
 [mathmlcan]:       https://github.com/MIR-MU/MathMLCan (MathMLCan – Canonicalization of different MathML encodings of equivalent formulae)
 [ntcir-11-math-2]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.686.444&rep=rep1&type=pdf (NTCIR-11 Math-2 Task Overview)
 [ntcir-12-mathir]: https://www.cs.rit.edu/~rlaz/files/ntcir12-mathir.pdf (NTCIR-12 MathIR Task Overview)
+15 −15
Original line number Diff line number Diff line
@@ -29,46 +29,46 @@ LATEX=NOPROBLEM_LATEX WARNING1_LATEX WARNING2_LATEX
# Aggregate target: depends on the $(OPT), $(SLT), $(PREFIX) and $(INFIX)
# output lists. $(LATEX) is not listed as a prerequisite here.
# NOTE(review): confirm that leaving $(LATEX) out of `all` is intentional.
#
# Every rule below has no prerequisites and invokes $(RUN_SCRIPT) with a
# scripts.* module name followed by one argument (no_problem, warning_1 or
# warning_2).
# NOTE(review): each rule carries two recipe lines, scripts.xhtml_to_* followed
# by scripts.html5_to_*. This looks like a diff rendering of a rename (old
# line, then new line) rather than two commands meant to run back to back --
# confirm against the actual file.
all: $(OPT) $(SLT) $(PREFIX) $(INFIX)

# Rules whose recipes pass the `no_problem` argument.
$(NOPROBLEM_OPT):
	$(RUN_SCRIPT) scripts.xhtml_to_opt_json no_problem
	$(RUN_SCRIPT) scripts.html5_to_opt_json no_problem

$(NOPROBLEM_SLT):
	$(RUN_SCRIPT) scripts.xhtml_to_slt_json no_problem
	$(RUN_SCRIPT) scripts.html5_to_slt_json no_problem

$(NOPROBLEM_PREFIX):
	$(RUN_SCRIPT) scripts.xhtml_to_prefix_json no_problem
	$(RUN_SCRIPT) scripts.html5_to_prefix_json no_problem

$(NOPROBLEM_INFIX):
	$(RUN_SCRIPT) scripts.xhtml_to_infix_json no_problem
	$(RUN_SCRIPT) scripts.html5_to_infix_json no_problem

$(NOPROBLEM_LATEX):
	$(RUN_SCRIPT) scripts.xhtml_to_latex_json no_problem
	$(RUN_SCRIPT) scripts.html5_to_latex_json no_problem

# Rules whose recipes pass the `warning_1` argument.
$(WARNING1_OPT):
	$(RUN_SCRIPT) scripts.xhtml_to_opt_json warning_1
	$(RUN_SCRIPT) scripts.html5_to_opt_json warning_1

$(WARNING1_SLT):
	$(RUN_SCRIPT) scripts.xhtml_to_slt_json warning_1
	$(RUN_SCRIPT) scripts.html5_to_slt_json warning_1

$(WARNING1_PREFIX):
	$(RUN_SCRIPT) scripts.xhtml_to_prefix_json warning_1
	$(RUN_SCRIPT) scripts.html5_to_prefix_json warning_1

$(WARNING1_INFIX):
	$(RUN_SCRIPT) scripts.xhtml_to_infix_json warning_1
	$(RUN_SCRIPT) scripts.html5_to_infix_json warning_1

$(WARNING1_LATEX):
	$(RUN_SCRIPT) scripts.xhtml_to_latex_json warning_1
	$(RUN_SCRIPT) scripts.html5_to_latex_json warning_1

# Rules whose recipes pass the `warning_2` argument.
$(WARNING2_OPT):
	$(RUN_SCRIPT) scripts.xhtml_to_opt_json warning_2
	$(RUN_SCRIPT) scripts.html5_to_opt_json warning_2

$(WARNING2_SLT):
	$(RUN_SCRIPT) scripts.xhtml_to_slt_json warning_2
	$(RUN_SCRIPT) scripts.html5_to_slt_json warning_2

$(WARNING2_PREFIX):
	$(RUN_SCRIPT) scripts.xhtml_to_prefix_json warning_2
	$(RUN_SCRIPT) scripts.html5_to_prefix_json warning_2

$(WARNING2_INFIX):
	$(RUN_SCRIPT) scripts.xhtml_to_infix_json warning_2
	$(RUN_SCRIPT) scripts.html5_to_infix_json warning_2

$(WARNING2_LATEX):
	$(RUN_SCRIPT) scripts.xhtml_to_latex_json warning_2
	$(RUN_SCRIPT) scripts.html5_to_latex_json warning_2
+2 −0
Original line number Diff line number Diff line
/NTCIR11-Math2-queries-latex-participants.failures
/NTCIR11-Math2-queries-latex-participants.json
+26 −0
Original line number Diff line number Diff line
.PHONY: all

# Command prefix for every recipe: change to the directory three levels above
# the one make runs in, then launch a Python module with `python3 -m <module>`.
RUN_SCRIPT=cd ../../.. && python3 -m

# Output files per formula representation: a .json file and a .failures file.
OPT=NTCIR11-Math2-queries-opt-participants.json NTCIR11-Math2-queries-opt-participants.failures
SLT=NTCIR11-Math2-queries-slt-participants.json NTCIR11-Math2-queries-slt-participants.failures
PREFIX=NTCIR11-Math2-queries-prefix-participants.json NTCIR11-Math2-queries-prefix-participants.failures
INFIX=NTCIR11-Math2-queries-infix-participants.json NTCIR11-Math2-queries-infix-participants.failures
LATEX=NTCIR11-Math2-queries-latex-participants.json NTCIR11-Math2-queries-latex-participants.failures

# Build the outputs of all five representations.
all: $(OPT) $(SLT) $(PREFIX) $(INFIX) $(LATEX)

# One pattern rule serves all five representations. The stem ($*) is the
# representation name (opt, slt, prefix, infix or latex) and selects the
# matching scripts.xhtml_to_<stem>_json module.
#
# Why a pattern rule: an explicit rule with two targets (`a.json a.failures:`)
# is really two independent rules sharing a recipe, so `make -j all` may start
# the same script twice concurrently. A pattern rule with several targets is
# instead treated by GNU Make as one recipe that updates all of them, so the
# script runs once per representation. The script is expected to write both
# files in a single run, as the previous two-target rules already implied.
NTCIR11-Math2-queries-%-participants.json NTCIR11-Math2-queries-%-participants.failures:
	$(RUN_SCRIPT) scripts.xhtml_to_$*_json ntcir-11-math-2-main
+13 −0
Original line number Diff line number Diff line
md5: 2c49e47dbbd87c7d63582fe53ae63dae
outs:
- metric: false
  persist: false
  cache: true
  md5: ba64e577823320380da247c27ed289af
  path: NTCIR11-Math2-queries-latex-participants.json
- metric: false
  persist: false
  cache: true
  md5: d41d8cd98f00b204e9800998ecf8427e
  path: NTCIR11-Math2-queries-latex-participants.failures
cmd: make NTCIR11-Math2-queries-latex-participants.json
Loading