Commit bbcd62e0 authored by Marek Medved's avatar Marek Medved
Browse files

phrases with order per sentence

parent de76ceca
Loading
Loading
Loading
Loading
+27 −4
Original line number Diff line number Diff line
@@ -70,7 +70,7 @@ def name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v)
        context_phrases = persistent.list.PersistentList()
        context_position = curr_sent_pos - 1
        while (context_position >= 0) and (curr_sent_pos - context_position <= context_window):
            context_phrases += phrases_per_sentence[context_position][:num_phr_per_sent]
            context_phrases.append(phrases_per_sentence[context_position][:num_phr_per_sent])
            context_position -= 1

        # Title as a context for first sentence in document
@@ -81,7 +81,7 @@ def name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v)
                    word, lemma, tag = token.strip().split('\t')[:3]
                    wid = word2id(vocabulary, word, lemma, tag, w2v)
                    title_phr.append(wid)
            context_phrases.append(title_phr)
            context_phrases.append([title_phr])

        text_context.append(context_phrases)

@@ -131,14 +131,37 @@ def add_np_phrases(db, context_window, num_phr_per_sent, w2v, verbose=False):
        for sent_num, sent in enumerate(text['text']):
            if verbose:
                print(f"s:{' '.join([id2word(vocabulary, x)['word'] for x in sent['sent']])}")
                for phr in phrases[sent_num]:
                    print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}')
                for phrs in phrases[sent_num]:
                    for phr in phrs:
                        print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}')
            if not sent['ctx'].get(f'name_phrs_w{context_window}_n{num_phr_per_sent}'):
                sent['ctx'][f'name_phrs_w{context_window}_n{num_phr_per_sent}'] = phrases[sent_num]
                db._p_changed = True
                transaction.commit()


def get_ctx(phrs, vocabulary, part):
    """Resolve a nested phrase structure of word ids into word records.

    :param phrs: iterable of sentences, each a list of phrases, each phrase
                 a list of word ids (three levels of nesting).
    :param vocabulary: vocabulary object passed through to ``id2word``.
    :param part: if truthy, forwarded to ``id2word`` to select a specific
                 part of the word record; otherwise ``id2word`` is called
                 with its default behaviour.
    :return: the same three-level structure with every word id replaced by
             its ``id2word`` lookup result.
    """
    def _lookup(word_id):
        # Keep the two distinct call shapes: passing ``part`` explicitly may
        # differ from omitting it if id2word has a non-empty default.
        if part:
            return id2word(vocabulary, word_id, part)
        return id2word(vocabulary, word_id)

    return [
        [[_lookup(word_id) for word_id in phrase] for phrase in sent_phrases]
        for sent_phrases in phrs
    ]


def print_ctx(phrs):
    """Print nested context phrases, one line per phrase.

    Each line is tagged ``c(-k)`` where ``k`` is the 1-based position of the
    enclosing sentence group in ``phrs`` (i.e. distance from the current
    sentence). Each phrase is a list of dicts carrying a ``"word"`` key.
    """
    distance = 0
    for sent_phrases in phrs:
        distance += 1
        for phrase in sent_phrases:
            joined = " ".join(entry["word"] for entry in phrase)
            print(f'\t\t\tc(-{distance}): {joined}')


def main():
    import argparse
    parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences')
+18 −0
Original line number Diff line number Diff line
@@ -26,6 +26,24 @@ def add_ctx(db, number, verbose=False):
                db._p_changed = True
                transaction.commit()

def get_ctx(phrs, vocabulary, part):
    """Resolve a flat list of phrases (lists of word ids) to word records.

    :param phrs: iterable of phrases; each phrase is a list of word ids.
    :param vocabulary: vocabulary object forwarded to ``id2word``.
    :param part: optional selector forwarded to ``id2word`` when truthy;
                 when falsy, ``id2word`` is called without it.
    :return: list of phrases with each word id replaced by its word record.
    """
    resolved = []
    for phrase in phrs:
        # ``part`` is loop-invariant, so the branch is decided once per phrase.
        if part:
            words = [id2word(vocabulary, word_id, part) for word_id in phrase]
        else:
            words = [id2word(vocabulary, word_id) for word_id in phrase]
        resolved.append(words)

    return resolved


def print_ctx(phrs):
    """Print context phrases one per line, tagged with their distance.

    The k-th phrase in ``phrs`` is printed as ``c(-k)`` followed by its
    words; each phrase is a list of dicts carrying a ``"word"`` key.
    """
    distance = 1
    for phrase in phrs:
        joined = " ".join(token["word"] for token in phrase)
        print(f'\t\t\tc(-{distance}): {joined}')
        distance += 1


def main():
    import argparse
+11 −13
Original line number Diff line number Diff line
@@ -5,6 +5,8 @@ from sqad_db import id2word
from sqad_db import id2qt
from pprint import pprint
import sys
import context_np
import context_previous_senteces


def get_ctx(data, vocabulary, part='', context_type=''):
@@ -15,17 +17,11 @@ def get_ctx(data, vocabulary, part='', context_type=''):
        required_ctx = ['all']
    for ctx_type, phrs in data.items():
        if ctx_type in required_ctx or 'all' in required_ctx:
            for p in phrs:
                p_content = []
                for w_id_cx in p:
                    if part:
                        p_content.append(id2word(vocabulary, w_id_cx, part))
                    else:
                        p_content.append(id2word(vocabulary, w_id_cx))
                if sentence_phrases.get(ctx_type):
                    sentence_phrases[ctx_type].append(p_content)
                else:
                    sentence_phrases[ctx_type] = [p_content]
            if ctx_type.startswith('name_phrs'):
                sentence_phrases[ctx_type] = context_np.get_ctx(phrs, vocabulary, part)

            else:
                sentence_phrases[ctx_type] = context_previous_senteces.get_ctx(phrs, vocabulary, part)

    return sentence_phrases

@@ -130,8 +126,10 @@ def print_record(db, record_id, context_type=''):
        print(f'\ts_{idx}: {" ".join([x["word"] for x in sent_and_phrs["sent"]])}')
        for key, phrs in sent_and_phrs['ctx'].items():
            print(f'\t\tctx_type: {key}')
            for p in phrs:
                print(f'\t\t\tc: {" ".join([x["word"] for x in p])}')
            if key.startswith('name_phrs'):
                context_np.print_ctx(phrs)
            else:
                context_previous_senteces.print_ctx(phrs)

    print('No. text sentences that contain answer')
    print(f'\t{len(record.similar_answers["sents_containing_ans_ext"])}')