Commit ee6b82eb authored by Vít Novotný
Browse files

Do not normalize NTCIR12 Math-IR MathWikiFormula topics

parent 9f0a0a70
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
md5: 7d782aa6a3497e19cd28581952c40cf6
md5: bbe5d5ae748f6a10f24b9e22b8bf8e79
outs:
- path: NTCIR12-MathWikiFormula-queries-infix-participants.json
  cache: true
  metric: false
  md5: 3680b17ccca5986a34d0d01ba2ad8109
  md5: 697744f232a517a044afc555ac922644
  persist: false
- path: NTCIR12-MathWikiFormula-queries-infix-participants.failures
  cache: true
  metric: false
  md5: b94c6e4a3eadd9e33cf0975ed8891f66
  md5: d41d8cd98f00b204e9800998ecf8427e
  persist: false
cmd: make NTCIR12-MathWikiFormula-queries-infix-participants.json
+3 −3
Original line number Diff line number Diff line
md5: 14e5a4068a2fdecf4ddb8a7bdba97bef
md5: d843ebd9b0f72d7e781184b427663cde
outs:
- path: NTCIR12-MathWikiFormula-queries-opt-participants.json
  cache: true
  metric: false
  md5: d4b70aaa165574927749693fa1364bd2
  md5: ce1bc6d54bbb351cfa33b6f6faba002e
  persist: false
- path: NTCIR12-MathWikiFormula-queries-opt-participants.failures
  cache: true
  metric: false
  md5: b94c6e4a3eadd9e33cf0975ed8891f66
  md5: d41d8cd98f00b204e9800998ecf8427e
  persist: false
cmd: make NTCIR12-MathWikiFormula-queries-opt-participants.json
+3 −3
Original line number Diff line number Diff line
md5: 2058f8dc1e651e9e2aa96693a5970d29
md5: 8b0b7d2003a62682e21bf44930ec9956
outs:
- path: NTCIR12-MathWikiFormula-queries-prefix-participants.json
  cache: true
  metric: false
  md5: 9e06bd7b89b086abb737a39e707275ec
  md5: 61286a9c37fd6e4ee416050d1446b745
  persist: false
- path: NTCIR12-MathWikiFormula-queries-prefix-participants.failures
  cache: true
  metric: false
  md5: b94c6e4a3eadd9e33cf0975ed8891f66
  md5: d41d8cd98f00b204e9800998ecf8427e
  persist: false
cmd: make NTCIR12-MathWikiFormula-queries-prefix-participants.json
+2 −2
Original line number Diff line number Diff line
md5: 0462fa27b85a53e0d4d3b8afb7cbeb0a
md5: a85b0186f92938d3b7e9002b8f486b18
outs:
- path: NTCIR12-MathWikiFormula-queries-slt-participants.json
  cache: true
  metric: false
  md5: 27b659906d4602c06497132ab16ae0f8
  md5: b52c5555989f2cb48090be6cd8177bb9
  persist: false
- path: NTCIR12-MathWikiFormula-queries-slt-participants.failures
  cache: true
+5 −2
Original line number Diff line number Diff line
@@ -43,9 +43,12 @@ def simple_preprocess(text):
    return gensim_simple_preprocess(text, max_len=float('inf'))


def ntcir_topic_read_xhtml(filename):
def ntcir_topic_read_xhtml(filename, normalize_math):
    with open(filename, 'rt') as f:
        if normalize_math:
            xml_tokens = mathmlcan(f.read())
        else:
            xml_tokens = f.read()
        xml_document = unicode_to_tree(xml_tokens)
    for topic_element in xml_document.xpath('//ntcir-math:topic | //mathml:topic', namespaces=XML_NAMESPACES):
        topic_number_elements = topic_element.xpath('.//ntcir-math:num | .//mathml:num', namespaces=XML_NAMESPACES)
Loading