Skip to content
Snippets Groups Projects
Commit ed8bc3db authored by Ota Mikušek
Browse files

Added more models.

parent a26bf9a8
No related branches found
No related tags found
No related merge requests found
annotations annotations/*
fasttext_bin/lid.176.bin fasttext_bin/lid.176.bin
fasttext_ftz/lid.176.ftz fasttext_ftz/lid.176.ftz
\ No newline at end of file lingua-build/target/*
\ No newline at end of file
#!/usr/bin/python3
import cld3, sys
# Two-letter codes reported by cld3 mapped to the three-letter tags emitted.
CLD3_TAGS = {"cs": "ces", "en": "eng", "de": "deu", "la": "lat"}


def tag_for_prediction(prediction):
    """Return the output tag for a value returned by ``cld3.get_language``.

    ``prediction`` may be ``None`` (cld3 bindings can return no prediction,
    e.g. for an empty line -- confirm against the installed binding); that
    case, like any language outside ``CLD3_TAGS``, is tagged ``"unk"``
    instead of crashing on ``None.language``.
    """
    if prediction is None:
        return "unk"
    return CLD3_TAGS.get(prediction.language, "unk")


def main():
    """Echo every stdin line as ``<text><TAB><tag>``."""
    for line in sys.stdin:
        # Drop only the line terminator; other whitespace belongs to the text.
        text = line[:-1] if line.endswith("\n") else line
        print("%s\t%s" % (text, tag_for_prediction(cld3.get_language(text))))


if __name__ == "__main__":
    main()
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# source: https://fasttext.cc/docs/en/language-identification.html
# modified
DETECTION_VECTOR_SIZE = 20000000
import fasttext, sys
class LanguageIdentification:
    """Wrapper around a pretrained fastText language-identification model."""

    def __init__(self, pretrained_lang_model="fasttext_bin/lid.176.bin"):
        """Load the fastText model.

        Args:
            pretrained_lang_model: Path of the model file handed to
                ``fasttext.load_model``.  The default is the path this
                script used before the parameter existed.
        """
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text, k=None):
        """Return the model's prediction for ``text``.

        Args:
            text: The text to classify.
            k: Number of labels to request from the model.  ``None`` (the
                default) requests ``DETECTION_VECTOR_SIZE`` labels, which
                is what this method did before the parameter existed.

        Returns:
            Whatever ``self.model.predict`` returns, unchanged.  Callers in
            this file read element 0 of it as the sequence of labels.
        """
        if k is None:
            k = DETECTION_VECTOR_SIZE
        return self.model.predict(text, k=k)
# fastText labels of the languages of interest mapped to the emitted tags.
LABEL_TAGS = {
    "__label__cs": "ces",
    "__label__en": "eng",
    "__label__de": "deu",
    "__label__la": "lat",
}


def tag_for_labels(labels):
    """Return the tag of the first entry of ``labels`` found in LABEL_TAGS.

    ``labels`` is scanned in the order given (presumably most probable
    first -- confirm against the fastText documentation).  ``"unk"`` is
    returned when no entry is one of the languages of interest.
    """
    return next(
        (LABEL_TAGS[label] for label in labels if label in LABEL_TAGS),
        "unk",
    )


def main():
    """Echo every stdin line as ``<text><TAB><tag>``."""
    identifier = LanguageIdentification()
    for line in sys.stdin:
        # Drop only the line terminator; other whitespace belongs to the text.
        text = line[:-1] if line.endswith("\n") else line
        labels = identifier.predict_lang(text)[0]
        print("%s\t%s" % (text, tag_for_labels(labels)))


if __name__ == "__main__":
    main()
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# source: https://fasttext.cc/docs/en/language-identification.html
# modified
import fasttext DETECTION_VECTOR_SIZE = 20000000
import fasttext, sys
class LanguageIdentification:
    """Wrapper around a pretrained fastText language-identification model."""

    def __init__(self, pretrained_lang_model="fasttext_ftz/lid.176.ftz"):
        """Load the fastText model.

        Args:
            pretrained_lang_model: Path of the model file handed to
                ``fasttext.load_model``.  The default is the compressed
                model path used by this variant of the script.
        """
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text, k=None):
        """Return the model's prediction for ``text``.

        Args:
            text: The text to classify.
            k: Number of labels to request from the model.  ``None`` (the
                default) requests ``DETECTION_VECTOR_SIZE`` labels.

        Returns:
            Whatever ``self.model.predict`` returns, unchanged.  Callers in
            this file read element 0 of it as the sequence of labels.
        """
        if k is None:
            k = DETECTION_VECTOR_SIZE
        return self.model.predict(text, k=k)
# fastText labels of the languages of interest mapped to the emitted tags.
LABEL_TAGS = {
    "__label__cs": "ces",
    "__label__en": "eng",
    "__label__de": "deu",
    "__label__la": "lat",
}


def tag_for_labels(labels):
    """Return the tag of the first entry of ``labels`` found in LABEL_TAGS.

    ``labels`` is scanned in the order given (presumably most probable
    first -- confirm against the fastText documentation).  ``"unk"`` is
    returned when no entry is one of the languages of interest.
    """
    return next(
        (LABEL_TAGS[label] for label in labels if label in LABEL_TAGS),
        "unk",
    )


def main():
    """Echo every stdin line as ``<text><TAB><tag>``."""
    identifier = LanguageIdentification()
    for line in sys.stdin:
        # Drop only the line terminator; other whitespace belongs to the text.
        text = line[:-1] if line.endswith("\n") else line
        labels = identifier.predict_lang(text)[0]
        print("%s\t%s" % (text, tag_for_labels(labels)))


if __name__ == "__main__":
    main()
#!/bin/bash
# Pipe the output of get_sentences.sh through ngram-gen.py, cut every line at
# its first tab (only the first tab-separated field is kept) and feed the
# result to the cld3 detector.
# $1 is forwarded to ngram-gen.py.  ${1:+"$1"} passes it as ONE quoted word
# (no word splitting or globbing) and still passes nothing at all when it is
# missing or empty, exactly as the unquoted form did.
./get_sentences.sh | ./ngram-gen.py ${1:+"$1"} | sed -e "s/\t.*$//g" | ./cld3/detector.py
\ No newline at end of file
#!/bin/bash
# Pipe the output of get_sentences.sh through ngram-gen.py, cut every line at
# its first tab (only the first tab-separated field is kept) and feed the
# result to the fastText detector that lives in fasttext_bin/.
# $1 is forwarded to ngram-gen.py.  ${1:+"$1"} passes it as ONE quoted word
# (no word splitting or globbing) and still passes nothing at all when it is
# missing or empty, exactly as the unquoted form did.
./get_sentences.sh | ./ngram-gen.py ${1:+"$1"} | sed -e "s/\t.*$//g" | ./fasttext_bin/detector.py
\ No newline at end of file
#!/bin/bash
# Pipe the output of get_sentences.sh through ngram-gen.py, cut every line at
# its first tab (only the first tab-separated field is kept) and feed the
# result to the fastText detector that lives in fasttext_ftz/.
# $1 is forwarded to ngram-gen.py.  ${1:+"$1"} passes it as ONE quoted word
# (no word splitting or globbing) and still passes nothing at all when it is
# missing or empty, exactly as the unquoted form did.
./get_sentences.sh | ./ngram-gen.py ${1:+"$1"} | sed -e "s/\t.*$//g" | ./fasttext_ftz/detector.py
\ No newline at end of file
#!/bin/bash
# Pipe the output of get_sentences.sh through ngram-gen.py, cut every line at
# its first tab (only the first tab-separated field is kept) and feed the
# result to the langid detector.
# $1 is forwarded to ngram-gen.py.  ${1:+"$1"} passes it as ONE quoted word
# (no word splitting or globbing) and still passes nothing at all when it is
# missing or empty, exactly as the unquoted form did.
./get_sentences.sh | ./ngram-gen.py ${1:+"$1"} | sed -e "s/\t.*$//g" | ./langid/detector.py
\ No newline at end of file
#!/bin/bash
# Extract "<text><TAB><lang>" lines from the annotation files matching
# annotations/*hotovo.md (path relative to the current directory).
# Pipeline stages, in order:
#   1. sed:  remove every backtick-quoted span that is followed by {.error}
#   2. grep: keep only lines containing a {lang=...} tag
#   3. sed:  insert a newline after every {lang=...} tag, so that each tag
#            ends its own line
#   4. grep: drop the tag-less remainders produced by the split
#   5. sed:  delete everything up to and including the first backtick
#   6. sed:  replace the closing backtick, the tag and any trailing blanks
#            with a tab followed by the tag's value
cat annotations/*hotovo.md | sed -e "s/\`[^\`]*\`{.error}//g" | grep -e "{lang=[^}]*}" | sed -e "s/{lang=\([^}]*\)}/{lang=\1}\n/g" | grep -e "{lang=[^}]*}" | sed -e "s/^[^\`]*\`//g" | sed -e "s/\`{lang=\([^}]*\)}[[:blank:]]*$/\t\1/g"
#!/bin/bash
# Extract tab-separated text/language lines from the *hotovo.md annotation
# files under the hard-coded project directory.
# Pipeline stages, in order:
#   1. sed:  insert a newline after every {lang=...} tag, so that each tag
#            ends its own line
#   2. grep: keep only lines containing a {lang=...} tag
#   3. sed:  strip a backtick at the very start of the line
#   4. sed:  turn every remaining backtick into a tab
#   5. sed:  replace a trailing "<TAB>{lang=X}" with "<TAB>X"
cat /nlp/projekty/lingua/language-detection-annotation/annotations/*hotovo.md | sed -e "s/{lang=\([^}]*\)}/{lang=\1}\n/g" | grep -e "{lang=[^}]*}" | sed -e "s/^\`//g" | sed -e "s/\`/\t/g" | sed -e "s/\t{lang=\(.*\)}$/\t\1/g"
#!/bin/bash
# Extract "<text><TAB><lang>" lines from the *hotovo.md annotation files
# under the hard-coded project directory.
# Pipeline stages, in order:
#   1. grep: keep only lines containing a {lang=...} tag
#   2. sed:  insert a newline after every {lang=...} tag, so that each tag
#            ends its own line
#   3. grep: drop the tag-less remainders produced by the split
#   4. sed:  delete everything up to and including the first backtick
#   5. sed:  replace the closing backtick, the tag and any trailing blanks
#            with a tab followed by the tag's value
cat /nlp/projekty/lingua/language-detection-annotation/annotations/*hotovo.md | grep -e "{lang=[^}]*}" | sed -e "s/{lang=\([^}]*\)}/{lang=\1}\n/g" | grep -e "{lang=[^}]*}" | sed -e "s/^[^\`]*\`//g" | sed -e "s/\`{lang=\([^}]*\)}[[:blank:]]*$/\t\1/g"
#!/bin/bash
# Extract tab-separated text/language lines from the *hotovo.md annotation
# files under the hard-coded project directory, working on whole lines only.
# Pipeline stages, in order:
#   1. sed: strip trailing spaces
#   2. sed: delete lines whose first character is not a backtick
#   3. sed: blank out lines that do not end in "}"
#   4. sed: delete the empty lines (including those blanked in stage 3)
#   5. sed: strip the leading backtick
#   6. sed: turn every remaining backtick into a tab
#   7. sed: replace a trailing "<TAB>{lang=X}" with "<TAB>X"
cat /nlp/projekty/lingua/language-detection-annotation/annotations/*hotovo.md | sed -e "s/ *$//g" | sed -e "/^[^\`].*/d" | sed -e "s/.*[^}]$//g" | sed "/^$/d" | sed -e "s/^\`//g" | sed -e "s/\`/\t/g" | sed -e "s/\t{lang=\(.*\)}$/\t\1/g"
#!/usr/bin/python3
import langid, sys
# Restrict langid to the four languages this tool distinguishes.
langid.set_languages(["cs", "en", "de", "la"])

# Output format per langid code; any other code falls back to the "unk" one.
_FORMATS = {
    "cs": "%s\tces",
    "en": "%s\teng",
    "de": "%s\tdeu",
    "la": "%s\tlat",
}

# Echo every stdin line as "<text><TAB><tag>".
for raw in sys.stdin:
    # Drop only the line terminator; other whitespace belongs to the text.
    text = raw[:-1] if raw[-1] == "\n" else raw
    code = langid.classify(text)[0]
    print(_FORMATS.get(code, "%s\tunk") % (text))
\ No newline at end of file
# Build with cargo's --release flag.
build:
	cargo build --release

# Run the project through cargo.
run:
	cargo run
#!/bin/bash
# Hand merger.py the n-gram stream from get_sentences.sh plus the output of
# the six detector scripts (each passed via process substitution), then pipe
# the merged result into statistics.py.
# $1 is forwarded unchanged to ngram-gen.py and to every get_*_n.sh script.
# The argument 7 equals the number of inputs given to merger.py (presumably
# the number of columns statistics.py should read -- confirm).
./merger.py \
    <(./get_sentences.sh | ./ngram-gen.py $1) \
    <(./get_lingua_n.sh $1) \
    <(./get_pycld2_n.sh $1) \
    <(./get_fasttext_ftz_n.sh $1) \
    <(./get_fasttext_bin_n.sh $1) \
    <(./get_cld3_n.sh $1) \
    <(./get_langid_n.sh $1) |
    ./statistics.py 7
\ No newline at end of file \ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment