Commit fd9acf21 authored by Vít Starý Novotný
Browse files

Import packages more lazily to guard against SIGSEGVs

parent c1a0b6d2
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@ gensim~=3.8.3
lxml~=4.6.2
matplotlib~=3.3.2
# mysqlclient~=1.4.6
numpy~=1.19.0
numpy==1.19.2
opencv-python~=4.4.0.42
pycountry~=20.7.3
scikit-learn~=0.24.2
+15 −5
Original line number Diff line number Diff line
@@ -14,14 +14,9 @@ import warnings

from edit_distance import SequenceMatcher
import cv2 as cv
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from gensim.utils import tokenize as _tokenize, deaccent
from lxml import etree
import numpy as np
import pycountry
import scipy.stats as st

from .configuration import CSV_PARAMETERS, SQL_PARAMETERS, JSON_PARAMETERS, PREPROCESSED_IMAGE_WIDTH, PREPROCESSED_IMAGE_HEIGHT, ANNOY_N_TREES

@@ -544,7 +539,13 @@ def index_images(input_root, filenames, width=PREPROCESSED_IMAGE_WIDTH, height=P
    return (filename_map, index)


def deaccent(text):
    """Return ``text`` with accents removed, as done by ``gensim.utils.deaccent``.

    gensim is imported inside the function body instead of at module level,
    so the package is only loaded when this function is first called.
    """
    import gensim.utils

    return gensim.utils.deaccent(text)


def tokenize(text):
    """Split ``text`` into a list of tokens using ``gensim.utils.tokenize``.

    The gensim tokenizer is called with ``lowercase=True`` and ``deacc=True``,
    and its result is materialized into a ``list`` before being returned.
    gensim is imported inside the function body instead of at module level,
    so the package is only loaded when this function is first called.
    """
    from gensim.utils import tokenize as gensim_tokenize

    tokens = gensim_tokenize(text, lowercase=True, deacc=True)
    return list(tokens)


@@ -563,6 +564,9 @@ def get_character_error_rate(first_sequence, second_sequence):


def get_tfidf(texts):
    from gensim.corpora.dictionary import Dictionary
    from gensim.models.tfidfmodel import TfidfModel

    with Pool(None) as pool:
        texts = pool.imap_unordered(tokenize, texts)
        dictionary = Dictionary(texts, prune_at=None)
@@ -571,12 +575,16 @@ def get_tfidf(texts):


def get_cossims(dictionary, query, corpus):
    """Look up ``query`` in a similarity index built over ``corpus``.

    A ``gensim.similarities.SparseMatrixSimilarity`` index is constructed from
    ``corpus``, sized with ``len(dictionary)`` features and ``len(corpus)``
    documents, and the result of indexing it with ``query`` is returned.
    Both ``dictionary`` and ``corpus`` must therefore support ``len()``.

    gensim is imported inside the function body instead of at module level,
    so the package is only loaded when this function is first called.
    """
    from gensim.similarities import SparseMatrixSimilarity

    feature_count = len(dictionary)
    document_count = len(corpus)
    similarity_index = SparseMatrixSimilarity(
        corpus,
        num_features=feature_count,
        num_docs=document_count,
    )
    return similarity_index[query]


def get_confidence_interval(sample, confidence):
    import scipy.stats as st

    mean = np.mean(sample)
    interval = st.t.interval(
        confidence / 100.0,
@@ -908,6 +916,8 @@ def get_jaccard_index(first_dict, second_dict):


def get_spearman_r(first_dict, second_dict):
    import scipy.stats as st

    first_values = []
    second_values = []
    for key in first_dict.keys() | second_dict.keys():
+0 −8
Original line number Diff line number Diff line
@@ -8,14 +8,6 @@ csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))


import logging
import gensim.similarities.docsim  # noqa: F401
import gensim.matutils  # noqa: F401
import gensim.utils  # noqa: F401


# Set the level of these three gensim loggers to WARNING so that records
# below that severity (e.g. DEBUG and INFO) coming from them are dropped.
logging.getLogger('gensim.similarities.docsim').setLevel(logging.WARNING)
logging.getLogger('gensim.matutils').setLevel(logging.WARNING)
logging.getLogger('gensim.utils').setLevel(logging.WARNING)


CSV_PARAMETERS = {