Commit 87a3fdad authored by Marek Medveď's avatar Marek Medveď
Browse files

get_csv_form_manual_check.py

parent 32059f6d
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -2,5 +2,7 @@
database/sqad_*
evaluation/evaluate_db/
evaluation/results/
evaluation/*.tar.xz
tables/
__pycache__
scripts/gensim_git_2/
+91 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# coding: utf-8
# Author: Marek Medved, marek.medved@sketchengine.eu, Lexical Computing CZ
import re
import sys

# Patterns capturing the value between <q_type>...</q_type> and
# <a_type>...</a_type> tags. The capture group is non-greedy, so it stops at
# the first closing tag on the line.
q_type_re = re.compile(r'<q_type>(.*?)</q_type>')
a_type_re = re.compile(r'<a_type>(.*?)</a_type>')


def vert2plain(file, part='word'):
    """Extract one column from the tab-separated rows of a vertical file.

    Lines that contain no tab character are ignored. Each remaining line is
    stripped of surrounding whitespace and split on tabs.

    Args:
        file: Iterable of text lines (an open file or a list of strings).
        part: 'word' selects the first column, 'lemma' the second. Any other
            value produces an empty list.

    Returns:
        List with the selected column of every usable row, in input order.
    """
    column = {'word': 0, 'lemma': 1}.get(part)
    result = []
    for line in file:
        if '\t' not in line:
            continue
        if column is None:
            # Unknown part: nothing is collected, but the input is still
            # consumed like in the recognised cases.
            continue
        fields = line.strip().split('\t')
        # Stripping may remove a leading/trailing tab (e.g. "word\t\n"), so
        # the row can end up with fewer columns than the tab check above
        # suggested. Skip such rows instead of raising IndexError.
        if column < len(fields):
            result.append(fields[column])
    return result


def get_qa_type(file):
    """Read the question type and answer type from metadata lines.

    Every line is tested against the module-level ``q_type_re`` and
    ``a_type_re`` patterns (anchored at the start of the line). When a
    pattern matches on several lines, the last match wins.

    Args:
        file: Iterable of text lines.

    Returns:
        Tuple ``(q_type, a_type)``; each element is 'NONE' when no line
        matched the corresponding pattern.
    """
    question_kind = answer_kind = 'NONE'

    for row in file:
        q_hit = q_type_re.match(row)
        if q_hit is not None:
            question_kind = q_hit.group(1)

        a_hit = a_type_re.match(row)
        if a_hit is not None:
            answer_kind = a_hit.group(1)

    return question_kind, answer_kind


def _percentage(count, total):
    """Return count as a percentage of total, or 0.0 when total is zero."""
    return (count / total) * 100 if total else 0.0


def main():
    """Summarise document-selection results against the SQAD data.

    Each non-blank input line must hold two tab-separated fields: a Python
    literal with the scores and an integer "correct position". Records whose
    position is not greater than zero are counted as correct. For the others,
    the lemmatised answer of the question record is searched for in the
    lemmatised text of the first selected document:

    * answer not found        -> counted as incorrect
    * answer found            -> counted as correct, unless the answer type
                                 is 'YES_NO' (then it is counted in neither
                                 tally)

    One tab-separated summary line is written to the output:
    total, correct, correct in %, incorrect, incorrect in %.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            tab-separated fields or the position is not an integer.
        OSError: if a required file below the SQAD path cannot be opened.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Pick records for manual evaluation')
    parser.add_argument('-i', '--input', type=argparse.FileType('r'),
                        required=False, default=sys.stdin,
                        help='Input')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        required=False, default=sys.stdout,
                        help='Output')
    parser.add_argument('-s', '--sqad_path', type=str,
                        required=True,
                        help='SQAD data path')
    args = parser.parse_args()

    total_num = 0
    correct_ds = 0
    incorrect_ds = 0

    try:
        # args.output.write(f'Status\tQuestion\tQ_type\tA_type\tAnswer\tCorr. DocId\tSel. DocID\n')
        for line in args.input:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            total_num += 1
            scores, correct_position = record.split('\t')
            correct_position = int(correct_position)
            if correct_position > 0:
                # SECURITY NOTE(review): eval() executes arbitrary code from
                # the input file. Only feed this script trusted data; if the
                # scores are plain Python literals, ast.literal_eval would be
                # a safer replacement -- confirm the format before switching.
                scores_data = eval(scores)
                # The first entry is read as (question record id,
                # first selected document id).
                question_rid = scores_data[0][0]
                first_selected_doc = scores_data[0][1]

                # question, q_type and answer are only consumed by the
                # disabled per-record output lines below.
                with open(f'{args.sqad_path}/{question_rid}/01question.vert', 'r') as q_f:
                    question = ' '.join(vert2plain(q_f, part='word'))

                with open(f'{args.sqad_path}/{question_rid}/05metadata.txt', 'r') as m_f:
                    q_type, a_type = get_qa_type(m_f)

                with open(f'{args.sqad_path}/{question_rid}/09answer_extraction.vert', 'r') as a_f:
                    # Read once into a list so both columns can be extracted.
                    answer_vert = a_f.readlines()
                    answer = ' '.join(vert2plain(answer_vert, part='word'))
                    answer_lemma = ''.join(vert2plain(answer_vert, part='lemma'))

                with open(f'{args.sqad_path}/{first_selected_doc}/03text.vert', 'r') as selected_t_f:
                    selected_text_lemma = ''.join(vert2plain(selected_t_f, part='lemma'))

                # Lemmas are concatenated without separators, so this is a
                # plain substring test on the joined lemma strings.
                if answer_lemma not in selected_text_lemma:
                    # args.output.write(f'FAIL\t{question}\t{q_type}\t{a_type}\t{answer}\tvim -O {args.sqad_path}/{question_rid}/03text.vert {args.sqad_path}/{first_selected_doc}/03text.vert\n')
                    incorrect_ds += 1
                else:
                    # args.output.write(f'OK\t{question}\t{q_type}\t{a_type}\t{answer}\tvim -O {args.sqad_path}/{question_rid}/03text.vert {args.sqad_path}/{first_selected_doc}/03text.vert\n')
                    # NOTE(review): a YES_NO record whose answer is found is
                    # included in total_num but in neither counter -- confirm
                    # that this is intended.
                    if not a_type == 'YES_NO':
                        correct_ds += 1
            else:
                correct_ds += 1

        # args.output.write(f'total\tcorr\tcorr n %\tincorr\tincorr in %\n')
        # _percentage guards against division by zero on empty input.
        args.output.write(f'{total_num}\t{correct_ds}\t{_percentage(correct_ds, total_num):.2f}\t{incorrect_ds}\t{_percentage(incorrect_ds, total_num):.2f}\n')
    finally:
        # Close the files argparse opened; leave the standard streams alone.
        if args.input is not sys.stdin:
            args.input.close()
        if args.output is not sys.stdout:
            args.output.close()


# Run the command-line entry point only when the file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()