#!/usr/bin/env python3
# coding: utf-8
# Author: Marek Medved, marek.medved@sketchengine.eu, Lexical Computing CZ
# File: scripts/get_csv_form_manual_check.py
"""Summarise document-selection results for manual checking.

Reads tab-separated lines of the form ``<scores> TAB <correct_position>`` and
writes one tab-separated summary line:
total, correct count, correct %, incorrect count, incorrect %.

For every line whose ``correct_position`` is greater than zero, the answer
lemmas of the question record are looked up in the lemmatised text of the
first selected document; both are read from record directories below
``--sqad_path``.
"""
import argparse
import re
import sys

q_type_re = re.compile('<q_type>(.*?)</q_type>')
a_type_re = re.compile('<a_type>(.*?)</a_type>')

# Column index of each supported attribute in a tab-separated vertical line.
_VERT_COLUMNS = {'word': 0, 'lemma': 1}


def vert2plain(file, part='word'):
    """Extract one column from the token lines of a vertical file.

    :param file: iterable of text lines (an open file or a list of lines)
    :param part: ``'word'`` for the first column, ``'lemma'`` for the second
    :return: list with the selected column of every line that has at least
        two tab-separated columns; lines without a tab are skipped.  An
        unsupported ``part`` yields an empty list.
    """
    column = _VERT_COLUMNS.get(part)
    if column is None:
        return []

    result = []
    for line in file:
        # Drop only the line terminator before splitting, so an empty
        # trailing column ("word<TAB>") stays addressable instead of
        # disappearing and causing an IndexError for part='lemma'.
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) > 1:
            result.append(fields[column].strip())
    return result


def get_qa_type(file):
    """Read the question and answer type from metadata lines.

    :param file: iterable of text lines
    :return: tuple ``(q_type, a_type)``; each is the content of the last line
        starting with ``<q_type>...</q_type>`` / ``<a_type>...</a_type>``,
        or ``'NONE'`` when no such line was seen.
    """
    q_type = 'NONE'
    a_type = 'NONE'
    for line in file:
        # match() anchors at the start of the line, as in the original code.
        q_match = q_type_re.match(line)
        if q_match:
            q_type = q_match.group(1)
        a_match = a_type_re.match(line)
        if a_match:
            a_type = a_match.group(1)
    return q_type, a_type


def _record_path(sqad_path, record_id, file_name):
    """Build the path of one file inside a record directory."""
    return f'{sqad_path}/{record_id}/{file_name}'


def _percentage(count, total):
    """Return count/total as a percentage, or 0.0 when total is zero."""
    return (count / total) * 100 if total else 0.0


def main(argv=None):
    """Command-line entry point.

    :param argv: optional list of command-line arguments; ``None`` makes
        argparse fall back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Pick records for manual evaluation')
    parser.add_argument('-i', '--input', type=argparse.FileType('r'),
                        required=False, default=sys.stdin,
                        help='Input')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        required=False, default=sys.stdout,
                        help='Output')
    parser.add_argument('-s', '--sqad_path', type=str,
                        required=True,
                        help='SQAD data path')
    args = parser.parse_args(argv)

    total_num = 0
    correct_ds = 0
    incorrect_ds = 0
    try:
        # Per-record output is disabled; only the summary line is written.
        # args.output.write(f'Status\tQuestion\tQ_type\tA_type\tAnswer\tCorr. DocId\tSel. DocID\n')
        for line in args.input:
            if not line.strip():
                # A blank line carries no record; skip it instead of failing
                # on the two-field unpacking below.
                continue
            total_num += 1
            scores, correct_position = line.strip().split('\t')
            correct_position = int(correct_position)

            if correct_position > 0:
                # NOTE(review): eval() executes arbitrary expressions found in
                # the input.  Only feed this script trusted files, or switch
                # to ast.literal_eval() if the scores are plain literals.
                scores_data = eval(scores)
                question_rid = scores_data[0][0]
                first_selected_doc = scores_data[0][1]

                # question, q_type and answer are only consumed by the
                # disabled per-record output lines below.
                with open(_record_path(args.sqad_path, question_rid, '01question.vert'), 'r') as q_f:
                    question = ' '.join(vert2plain(q_f, part='word'))

                with open(_record_path(args.sqad_path, question_rid, '05metadata.txt'), 'r') as m_f:
                    q_type, a_type = get_qa_type(m_f)

                with open(_record_path(args.sqad_path, question_rid, '09answer_extraction.vert'), 'r') as a_f:
                    answer_vert = a_f.readlines()
                answer = ' '.join(vert2plain(answer_vert, part='word'))
                answer_lemma = ''.join(vert2plain(answer_vert, part='lemma'))

                with open(_record_path(args.sqad_path, first_selected_doc, '03text.vert'), 'r') as selected_t_f:
                    selected_text_lemma = ''.join(vert2plain(selected_t_f, part='lemma'))

                if answer_lemma not in selected_text_lemma:
                    # args.output.write(f'FAIL\t{question}\t{q_type}\t{a_type}\t{answer}\tvim -O {args.sqad_path}/{question_rid}/03text.vert {args.sqad_path}/{first_selected_doc}/03text.vert\n')
                    incorrect_ds += 1
                else:
                    # args.output.write(f'OK\t{question}\t{q_type}\t{a_type}\t{answer}\tvim -O {args.sqad_path}/{question_rid}/03text.vert {args.sqad_path}/{first_selected_doc}/03text.vert\n')
                    # YES_NO answers found in the text are counted in neither
                    # bucket, so correct + incorrect may be below the total.
                    if not a_type == 'YES_NO':
                        correct_ds += 1
            else:
                # NOTE(review): the text this script was recovered from had
                # lost its indentation; this branch is assumed to pair with
                # `if correct_position > 0` -- confirm against the repository.
                correct_ds += 1

        # args.output.write(f'total\tcorr\tcorr n %\tincorr\tincorr in %\n')
        args.output.write(
            f'{total_num}\t{correct_ds}\t{_percentage(correct_ds, total_num):.2f}'
            f'\t{incorrect_ds}\t{_percentage(incorrect_ds, total_num):.2f}\n'
        )
        args.output.flush()
    finally:
        # Close files opened by argparse, but leave the standard streams open.
        for stream in (args.input, args.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()


if __name__ == '__main__':
    main()
.gitignore +2 −0 Original line number Diff line number Diff line Loading @@ -2,5 +2,7 @@ database/sqad_* evaluation/evaluate_db/ evaluation/results/ evaluation/*.tar.xz tables/ __pycache__ scripts/gensim_git_2/
#!/usr/bin/env python3
# coding: utf-8
# Author: Marek Medved, marek.medved@sketchengine.eu, Lexical Computing CZ
# File: scripts/get_csv_form_manual_check.py
"""Summarise document-selection results for manual checking.

Reads tab-separated lines of the form ``<scores> TAB <correct_position>`` and
writes one tab-separated summary line:
total, correct count, correct %, incorrect count, incorrect %.

For every line whose ``correct_position`` is greater than zero, the answer
lemmas of the question record are looked up in the lemmatised text of the
first selected document; both are read from record directories below
``--sqad_path``.
"""
import argparse
import re
import sys

q_type_re = re.compile('<q_type>(.*?)</q_type>')
a_type_re = re.compile('<a_type>(.*?)</a_type>')

# Column index of each supported attribute in a tab-separated vertical line.
_VERT_COLUMNS = {'word': 0, 'lemma': 1}


def vert2plain(file, part='word'):
    """Extract one column from the token lines of a vertical file.

    :param file: iterable of text lines (an open file or a list of lines)
    :param part: ``'word'`` for the first column, ``'lemma'`` for the second
    :return: list with the selected column of every line that has at least
        two tab-separated columns; lines without a tab are skipped.  An
        unsupported ``part`` yields an empty list.
    """
    column = _VERT_COLUMNS.get(part)
    if column is None:
        return []

    result = []
    for line in file:
        # Drop only the line terminator before splitting, so an empty
        # trailing column ("word<TAB>") stays addressable instead of
        # disappearing and causing an IndexError for part='lemma'.
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) > 1:
            result.append(fields[column].strip())
    return result


def get_qa_type(file):
    """Read the question and answer type from metadata lines.

    :param file: iterable of text lines
    :return: tuple ``(q_type, a_type)``; each is the content of the last line
        starting with ``<q_type>...</q_type>`` / ``<a_type>...</a_type>``,
        or ``'NONE'`` when no such line was seen.
    """
    q_type = 'NONE'
    a_type = 'NONE'
    for line in file:
        # match() anchors at the start of the line, as in the original code.
        q_match = q_type_re.match(line)
        if q_match:
            q_type = q_match.group(1)
        a_match = a_type_re.match(line)
        if a_match:
            a_type = a_match.group(1)
    return q_type, a_type


def _record_path(sqad_path, record_id, file_name):
    """Build the path of one file inside a record directory."""
    return f'{sqad_path}/{record_id}/{file_name}'


def _percentage(count, total):
    """Return count/total as a percentage, or 0.0 when total is zero."""
    return (count / total) * 100 if total else 0.0


def main(argv=None):
    """Command-line entry point.

    :param argv: optional list of command-line arguments; ``None`` makes
        argparse fall back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Pick records for manual evaluation')
    parser.add_argument('-i', '--input', type=argparse.FileType('r'),
                        required=False, default=sys.stdin,
                        help='Input')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        required=False, default=sys.stdout,
                        help='Output')
    parser.add_argument('-s', '--sqad_path', type=str,
                        required=True,
                        help='SQAD data path')
    args = parser.parse_args(argv)

    total_num = 0
    correct_ds = 0
    incorrect_ds = 0
    try:
        # Per-record output is disabled; only the summary line is written.
        # args.output.write(f'Status\tQuestion\tQ_type\tA_type\tAnswer\tCorr. DocId\tSel. DocID\n')
        for line in args.input:
            if not line.strip():
                # A blank line carries no record; skip it instead of failing
                # on the two-field unpacking below.
                continue
            total_num += 1
            scores, correct_position = line.strip().split('\t')
            correct_position = int(correct_position)

            if correct_position > 0:
                # NOTE(review): eval() executes arbitrary expressions found in
                # the input.  Only feed this script trusted files, or switch
                # to ast.literal_eval() if the scores are plain literals.
                scores_data = eval(scores)
                question_rid = scores_data[0][0]
                first_selected_doc = scores_data[0][1]

                # question, q_type and answer are only consumed by the
                # disabled per-record output lines below.
                with open(_record_path(args.sqad_path, question_rid, '01question.vert'), 'r') as q_f:
                    question = ' '.join(vert2plain(q_f, part='word'))

                with open(_record_path(args.sqad_path, question_rid, '05metadata.txt'), 'r') as m_f:
                    q_type, a_type = get_qa_type(m_f)

                with open(_record_path(args.sqad_path, question_rid, '09answer_extraction.vert'), 'r') as a_f:
                    answer_vert = a_f.readlines()
                answer = ' '.join(vert2plain(answer_vert, part='word'))
                answer_lemma = ''.join(vert2plain(answer_vert, part='lemma'))

                with open(_record_path(args.sqad_path, first_selected_doc, '03text.vert'), 'r') as selected_t_f:
                    selected_text_lemma = ''.join(vert2plain(selected_t_f, part='lemma'))

                if answer_lemma not in selected_text_lemma:
                    # args.output.write(f'FAIL\t{question}\t{q_type}\t{a_type}\t{answer}\tvim -O {args.sqad_path}/{question_rid}/03text.vert {args.sqad_path}/{first_selected_doc}/03text.vert\n')
                    incorrect_ds += 1
                else:
                    # args.output.write(f'OK\t{question}\t{q_type}\t{a_type}\t{answer}\tvim -O {args.sqad_path}/{question_rid}/03text.vert {args.sqad_path}/{first_selected_doc}/03text.vert\n')
                    # YES_NO answers found in the text are counted in neither
                    # bucket, so correct + incorrect may be below the total.
                    if not a_type == 'YES_NO':
                        correct_ds += 1
            else:
                # NOTE(review): the text this script was recovered from had
                # lost its indentation; this branch is assumed to pair with
                # `if correct_position > 0` -- confirm against the repository.
                correct_ds += 1

        # args.output.write(f'total\tcorr\tcorr n %\tincorr\tincorr in %\n')
        args.output.write(
            f'{total_num}\t{correct_ds}\t{_percentage(correct_ds, total_num):.2f}'
            f'\t{incorrect_ds}\t{_percentage(incorrect_ds, total_num):.2f}\n'
        )
        args.output.flush()
    finally:
        # Close files opened by argparse, but leave the standard streams open.
        for stream in (args.input, args.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()


if __name__ == '__main__':
    main()