Unverified Commit 34e7bfd3 authored by Vít Starý Novotný
Browse files

Use non-word tokens and max other string length as paragraph features

parent 1b4acdcc
Loading
Loading
Loading
Loading
Loading
+14 −1
Original line number Diff line number Diff line
@@ -46,7 +46,19 @@ def get_paragraph_features(paragraph: List[Word]) -> Optional[List[float]]:
        return None
    num_entity_tokens = sum(0 if isinstance(word, Other) else 1 for word in paragraph)
    num_nonentity_tokens = num_all_tokens - num_entity_tokens
    sample = [num_entity_tokens, num_nonentity_tokens, num_entity_tokens / num_all_tokens]
    num_nonword_tokens = sum(1 if not str(word).isalpha() else 0 for word in paragraph)
    num_punctuation_tokens = sum(1 if not str(word).isalnum() else 0 for word in paragraph)
    num_numeric_tokens = sum(1 if str(word).isnumeric() else 0 for word in paragraph)

    sample = [
        num_entity_tokens,
        num_nonentity_tokens,
        num_all_tokens,
        num_entity_tokens / num_all_tokens,
        num_nonword_tokens / num_all_tokens,
        num_punctuation_tokens / num_all_tokens,
        num_numeric_tokens / num_all_tokens,
    ]
    return sample


@@ -82,6 +94,7 @@ def train_classifier(root_directory: Path, ground_truths: Iterable[Path]) -> Pip
    ]

    # Train classifier
    logger.debug('Training paragraph classifier')
    classifier = make_pipeline(StandardScaler(), LinearRegression())
    classifier.fit(X, y, linearregression__sample_weight=sample_weight)
    return classifier