Commit 51b285b6 authored by Vít Novotný
Browse files

Use micro-averaging in `AggregateMeanFScoreEvaluator.__call__()`

parent 152f26f4
Pipeline #147314 failed with stage
in 7 minutes and 31 seconds
......@@ -237,7 +237,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Loading documents: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 268669/268669 [00:06<00:00, 44001.38it/s]\n"
"Loading documents: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 268669/268669 [00:06<00:00, 44308.95it/s]\n"
]
}
],
......@@ -462,7 +462,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 21,
"id": "694daad3-2b04-4e3f-8bfb-bb3fe0c87dd3",
"metadata": {},
"outputs": [],
......@@ -473,7 +473,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 22,
"id": "fed4d0a4-5bc4-4af2-8e1b-c5a8a6b61c52",
"metadata": {},
"outputs": [],
......@@ -491,7 +491,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 23,
"id": "38efa732-8afd-4798-809a-ca828a8b960c",
"metadata": {},
"outputs": [],
......@@ -501,7 +501,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 24,
"id": "1c9d3bab-53de-4c39-8e4d-cd2978f00925",
"metadata": {},
"outputs": [],
......@@ -513,7 +513,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 25,
"id": "f47f62ca-1164-45b5-94b9-ff082787e8a9",
"metadata": {},
"outputs": [],
......@@ -526,7 +526,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 26,
"id": "053ff5dd-775c-431d-a988-18bf3c4f4f6d",
"metadata": {},
"outputs": [
......@@ -559,116 +559,116 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>model_ner_manatee_non-crossing_only-relevant_fine-tuning</th>\n",
" <td>49.97944%</td>\n",
" <td>96.66777%</td>\n",
" <td>41.04577%</td>\n",
" <td>62.56433%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_manatee_all_only-relevant_fine-tuning</th>\n",
" <td>49.63405%</td>\n",
" <td>96.74517%</td>\n",
" <td>39.86766%</td>\n",
" <td>62.08229%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_manatee_all_only-relevant_parallel</th>\n",
" <td>51.62152%</td>\n",
" <td>96.17568%</td>\n",
" <td>38.21471%</td>\n",
" <td>62.00397%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_manatee_non-crossing_only-relevant_parallel</th>\n",
" <td>49.71262%</td>\n",
" <td>96.10254%</td>\n",
" <td>38.21755%</td>\n",
" <td>61.34424%</td>\n",
" <td>94.77053%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_manatee_all_all_parallel</th>\n",
" <td>43.69512%</td>\n",
" <td>96.65764%</td>\n",
" <td>39.60441%</td>\n",
" <td>59.98572%</td>\n",
" <td>94.52101%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_manatee_non-crossing_all_parallel</th>\n",
" <td>44.26929%</td>\n",
" <td>96.20140%</td>\n",
" <td>34.92015%</td>\n",
" <td>58.46362%</td>\n",
" <th>model_ner_manatee_non-crossing_only-relevant_fine-tuning</th>\n",
" <td>49.97944%</td>\n",
" <td>96.66777%</td>\n",
" <td>41.04577%</td>\n",
" <td>94.46477%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_fuzzy-regex_all_only-relevant_fine-tuning</th>\n",
" <td>34.07244%</td>\n",
" <td>96.38580%</td>\n",
" <td>36.68019%</td>\n",
" <td>93.57176%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_fuzzy-regex_non-crossing_only-relevant_fine-tuning</th>\n",
" <td>34.74969%</td>\n",
" <td>96.34174%</td>\n",
" <td>36.64832%</td>\n",
" <td>55.91325%</td>\n",
" <td>93.41004%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_fuzzy-regex_all_only-relevant_fine-tuning</th>\n",
" <td>34.07244%</td>\n",
" <td>96.38580%</td>\n",
" <td>36.68019%</td>\n",
" <td>55.71281%</td>\n",
" <th>model_ner_manatee_all_only-relevant_parallel</th>\n",
" <td>51.62152%</td>\n",
" <td>96.17568%</td>\n",
" <td>38.21471%</td>\n",
" <td>93.32879%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_manatee_non-crossing_all_parallel</th>\n",
" <td>44.26929%</td>\n",
" <td>96.20140%</td>\n",
" <td>34.92015%</td>\n",
" <td>93.31223%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_manatee_non-crossing_only-relevant_parallel</th>\n",
" <td>49.71262%</td>\n",
" <td>96.10254%</td>\n",
" <td>38.21755%</td>\n",
" <td>93.23893%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_fuzzy-regex_all_all_parallel</th>\n",
" <td>33.04513%</td>\n",
" <td>95.95721%</td>\n",
" <td>37.66807%</td>\n",
" <td>55.55681%</td>\n",
" <td>92.63503%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_fuzzy-regex_non-crossing_all_parallel</th>\n",
" <td>33.17825%</td>\n",
" <td>95.71407%</td>\n",
" <td>32.79457%</td>\n",
" <td>53.89563%</td>\n",
" <td>92.03093%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_fuzzy-regex_all_only-relevant_parallel</th>\n",
" <td>31.30961%</td>\n",
" <td>95.59693%</td>\n",
" <td>32.23955%</td>\n",
" <td>53.04870%</td>\n",
" <td>91.78359%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_fuzzy-regex_non-crossing_only-relevant_parallel</th>\n",
" <td>32.51893%</td>\n",
" <td>95.45893%</td>\n",
" <td>30.89947%</td>\n",
" <td>52.95911%</td>\n",
" <td>91.47703%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_manatee_all_all_fine-tuning</th>\n",
" <td>2.17360%</td>\n",
" <td>42.86388%</td>\n",
" <td>3.80143%</td>\n",
" <td>16.27964%</td>\n",
" <td>25.11350%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_manatee_non-crossing_all_fine-tuning</th>\n",
" <td>2.34774%</td>\n",
" <td>23.59883%</td>\n",
" <td>2.75918%</td>\n",
" <td>9.56858%</td>\n",
" <td>13.38802%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Babelscape/wikineural-multilingual-ner baseline</th>\n",
" <td>7.35338%</td>\n",
" <td>13.35824%</td>\n",
" <td>2.84895%</td>\n",
" <td>7.85352%</td>\n",
" <td>8.07667%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>model_ner_fuzzy-regex_all_all_fine-tuning</th>\n",
" <td>2.38798%</td>\n",
" <td>7.33972%</td>\n",
" <td>3.32850%</td>\n",
" <td>4.35207%</td>\n",
" <td>4.96872%</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
......@@ -676,14 +676,14 @@
],
"text/plain": [
" PER O \\\n",
"model_ner_manatee_non-crossing_only-relevant_fi... 49.97944% 96.66777% \n",
"model_ner_manatee_all_only-relevant_fine-tuning 49.63405% 96.74517% \n",
"model_ner_manatee_all_only-relevant_parallel 51.62152% 96.17568% \n",
"model_ner_manatee_non-crossing_only-relevant_pa... 49.71262% 96.10254% \n",
"model_ner_manatee_all_all_parallel 43.69512% 96.65764% \n",
"model_ner_manatee_non-crossing_all_parallel 44.26929% 96.20140% \n",
"model_ner_fuzzy-regex_non-crossing_only-relevan... 34.74969% 96.34174% \n",
"model_ner_manatee_non-crossing_only-relevant_fi... 49.97944% 96.66777% \n",
"model_ner_fuzzy-regex_all_only-relevant_fine-tu... 34.07244% 96.38580% \n",
"model_ner_fuzzy-regex_non-crossing_only-relevan... 34.74969% 96.34174% \n",
"model_ner_manatee_all_only-relevant_parallel 51.62152% 96.17568% \n",
"model_ner_manatee_non-crossing_all_parallel 44.26929% 96.20140% \n",
"model_ner_manatee_non-crossing_only-relevant_pa... 49.71262% 96.10254% \n",
"model_ner_fuzzy-regex_all_all_parallel 33.04513% 95.95721% \n",
"model_ner_fuzzy-regex_non-crossing_all_parallel 33.17825% 95.71407% \n",
"model_ner_fuzzy-regex_all_only-relevant_parallel 31.30961% 95.59693% \n",
......@@ -694,22 +694,22 @@
"model_ner_fuzzy-regex_all_all_fine-tuning 2.38798% 7.33972% \n",
"\n",
" LOC all \n",
"model_ner_manatee_non-crossing_only-relevant_fi... 41.04577% 62.56433% \n",
"model_ner_manatee_all_only-relevant_fine-tuning 39.86766% 62.08229% \n",
"model_ner_manatee_all_only-relevant_parallel 38.21471% 62.00397% \n",
"model_ner_manatee_non-crossing_only-relevant_pa... 38.21755% 61.34424% \n",
"model_ner_manatee_all_all_parallel 39.60441% 59.98572% \n",
"model_ner_manatee_non-crossing_all_parallel 34.92015% 58.46362% \n",
"model_ner_fuzzy-regex_non-crossing_only-relevan... 36.64832% 55.91325% \n",
"model_ner_fuzzy-regex_all_only-relevant_fine-tu... 36.68019% 55.71281% \n",
"model_ner_fuzzy-regex_all_all_parallel 37.66807% 55.55681% \n",
"model_ner_fuzzy-regex_non-crossing_all_parallel 32.79457% 53.89563% \n",
"model_ner_fuzzy-regex_all_only-relevant_parallel 32.23955% 53.04870% \n",
"model_ner_fuzzy-regex_non-crossing_only-relevan... 30.89947% 52.95911% \n",
"model_ner_manatee_all_all_fine-tuning 3.80143% 16.27964% \n",
"model_ner_manatee_non-crossing_all_fine-tuning 2.75918% 9.56858% \n",
"Babelscape/wikineural-multilingual-ner baseline 2.84895% 7.85352% \n",
"model_ner_fuzzy-regex_all_all_fine-tuning 3.32850% 4.35207% "
"model_ner_manatee_all_only-relevant_fine-tuning 39.86766% 94.77053% \n",
"model_ner_manatee_all_all_parallel 39.60441% 94.52101% \n",
"model_ner_manatee_non-crossing_only-relevant_fi... 41.04577% 94.46477% \n",
"model_ner_fuzzy-regex_all_only-relevant_fine-tu... 36.68019% 93.57176% \n",
"model_ner_fuzzy-regex_non-crossing_only-relevan... 36.64832% 93.41004% \n",
"model_ner_manatee_all_only-relevant_parallel 38.21471% 93.32879% \n",
"model_ner_manatee_non-crossing_all_parallel 34.92015% 93.31223% \n",
"model_ner_manatee_non-crossing_only-relevant_pa... 38.21755% 93.23893% \n",
"model_ner_fuzzy-regex_all_all_parallel 37.66807% 92.63503% \n",
"model_ner_fuzzy-regex_non-crossing_all_parallel 32.79457% 92.03093% \n",
"model_ner_fuzzy-regex_all_only-relevant_parallel 32.23955% 91.78359% \n",
"model_ner_fuzzy-regex_non-crossing_only-relevan... 30.89947% 91.47703% \n",
"model_ner_manatee_all_all_fine-tuning 3.80143% 25.11350% \n",
"model_ner_manatee_non-crossing_all_fine-tuning 2.75918% 13.38802% \n",
"Babelscape/wikineural-multilingual-ner baseline 2.84895% 8.07667% \n",
"model_ner_fuzzy-regex_all_all_fine-tuning 3.32850% 4.96872% "
]
},
"metadata": {},
......@@ -731,7 +731,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 27,
"id": "f7975e45-ba27-45b4-9b61-1a9c119d434d",
"metadata": {},
"outputs": [
......@@ -739,7 +739,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"/nlp/projekty/ahisto/public_html/named-entity-search/results/model_ner_manatee_non-crossing_only-relevant_fine-tuning/TokenClassification\n"
"/nlp/projekty/ahisto/public_html/named-entity-search/results/model_ner_manatee_all_only-relevant_fine-tuning/TokenClassification\n"
]
}
],
......@@ -761,7 +761,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 28,
"id": "fffd0beb-ac50-4c81-9eb3-cc225214ff63",
"metadata": {},
"outputs": [],
......@@ -777,7 +777,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 29,
"id": "b83d2a1a-4d8f-400a-a8e8-cba43fe41a83",
"metadata": {},
"outputs": [
......@@ -805,7 +805,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 30,
"id": "de04e6a9-33e5-4e85-9cfc-f0f5bc344677",
"metadata": {},
"outputs": [],
......@@ -815,7 +815,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 31,
"id": "d11a74e1-f3c8-416c-897e-e921a69dc661",
"metadata": {},
"outputs": [],
......@@ -829,7 +829,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 32,
"id": "675dd306-50b0-4245-9904-effff2432921",
"metadata": {},
"outputs": [
......@@ -862,7 +862,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 33,
"id": "2b1c528d-b4ec-4e96-8ca7-ff05fd244d80",
"metadata": {},
"outputs": [
......@@ -890,7 +890,7 @@
"- I-LOC: ##de\n",
"- I-LOC: ##ch\n"
]
}
},
],
"source": [
"tag_sentence(baseline_model, example_sentence)"
......
from typing import Dict, Optional, Set, List
from typing import Dict, Optional, Set, List, Tuple
from functools import total_ordering
from more_itertools import zip_equal
......@@ -38,13 +38,14 @@ class AggregateMeanFScoreEvaluator(TokenClassificationEvaluator):
expected_labels, actual_labels = self._collect_token_predictions(model, dataset)
if self.group_name is None:
f_scores = [
self.get_f_score(self.GROUPS[group_name], expected_labels, actual_labels)
for group_name
in self.__class__.get_all_group_names()
]
assert len(f_scores) > 0
mean_f_score = sum(f_scores) / len(f_scores)
mean_f_score, total_number_of_samples = 0, 0
for group_name in self.__class__.get_all_group_names():
number_of_samples, f_score = self.get_f_score(
self.GROUPS[group_name], expected_labels, actual_labels)
mean_f_score += number_of_samples * f_score
total_number_of_samples += number_of_samples
if total_number_of_samples > 0:
mean_f_score /= total_number_of_samples
else:
group = self.GROUPS[self.group_name]
_, mean_f_score = self.get_f_score(group, expected_labels, actual_labels)
......@@ -52,7 +53,7 @@ class AggregateMeanFScoreEvaluator(TokenClassificationEvaluator):
return mean_f_score
def get_f_score(self, group: Group, expected_labels: List[Category],
actual_labels: List[Category]) -> FScore:
actual_labels: List[Category]) -> Tuple[int, FScore]:
expected_categories: Set[Category] = {
self.category_map[category]
for category
......@@ -69,8 +70,9 @@ class AggregateMeanFScoreEvaluator(TokenClassificationEvaluator):
elif expected_label in expected_categories and actual_label not in expected_categories:
false_negatives += 1
number_of_samples = true_positives + false_positives + false_negatives
f_score = true_positives / (true_positives + (0.5 * (false_positives + false_negatives)))
return f_score
return number_of_samples, f_score
@classmethod
def get_all_group_names(cls) -> Set[GroupName]:
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment