Loading scripts/07_paragraph_classification.py +14 −1 Original line number Diff line number Diff line Loading @@ -46,7 +46,19 @@ def get_paragraph_features(paragraph: List[Word]) -> Optional[List[float]]: return None num_entity_tokens = sum(0 if isinstance(word, Other) else 1 for word in paragraph) num_nonentity_tokens = num_all_tokens - num_entity_tokens sample = [num_entity_tokens, num_nonentity_tokens, num_entity_tokens / num_all_tokens] num_nonword_tokens = sum(1 if not str(word).isalpha() else 0 for word in paragraph) num_punctuation_tokens = sum(1 if not str(word).isalnum() else 0 for word in paragraph) num_numeric_tokens = sum(1 if str(word).isnumeric() else 0 for word in paragraph) sample = [ num_entity_tokens, num_nonentity_tokens, num_all_tokens, num_entity_tokens / num_all_tokens, num_nonword_tokens / num_all_tokens, num_punctuation_tokens / num_all_tokens, num_numeric_tokens / num_all_tokens, ] return sample Loading Loading @@ -82,6 +94,7 @@ def train_classifier(root_directory: Path, ground_truths: Iterable[Path]) -> Pip ] # Train classifier logger.debug('Training paragraph classifier') classifier = make_pipeline(StandardScaler(), LinearRegression()) classifier.fit(X, y, linearregression__sample_weight=sample_weight) return classifier Loading Loading
scripts/07_paragraph_classification.py +14 −1 Original line number Diff line number Diff line Loading @@ -46,7 +46,19 @@ def get_paragraph_features(paragraph: List[Word]) -> Optional[List[float]]: return None num_entity_tokens = sum(0 if isinstance(word, Other) else 1 for word in paragraph) num_nonentity_tokens = num_all_tokens - num_entity_tokens sample = [num_entity_tokens, num_nonentity_tokens, num_entity_tokens / num_all_tokens] num_nonword_tokens = sum(1 if not str(word).isalpha() else 0 for word in paragraph) num_punctuation_tokens = sum(1 if not str(word).isalnum() else 0 for word in paragraph) num_numeric_tokens = sum(1 if str(word).isnumeric() else 0 for word in paragraph) sample = [ num_entity_tokens, num_nonentity_tokens, num_all_tokens, num_entity_tokens / num_all_tokens, num_nonword_tokens / num_all_tokens, num_punctuation_tokens / num_all_tokens, num_numeric_tokens / num_all_tokens, ] return sample Loading Loading @@ -82,6 +94,7 @@ def train_classifier(root_directory: Path, ground_truths: Iterable[Path]) -> Pip ] # Train classifier logger.debug('Training paragraph classifier') classifier = make_pipeline(StandardScaler(), LinearRegression()) classifier.fit(X, y, linearregression__sample_weight=sample_weight) return classifier Loading