Commit 5263b65a authored by Insectslayer's avatar Insectslayer
Browse files

Add wordcloud output.

Compute average title.
parent 3571cd87
Loading
Loading
Loading
Loading
+28 −21
Original line number Diff line number Diff line
%% Cell type:code id: tags:

``` python
import csv
import itertools
import nltk
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
```

%% Cell type:code id: tags:

``` python
# Tab-separated input file; the first column of every row is loaded as a title.
input_filename = 'vis2011-2021.tsv'
```

%% Cell type:code id: tags:

``` python
# NLTK's English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words("english"))
# Snowball stemmer for English, used to reduce title words to their stems.
stemmer = EnglishStemmer()
```

%% Cell type:code id: tags:

``` python
class Title():
    """A single title together with its tokenized views and punctuation counts.

    Attributes:
        text: the raw title string.
        words: casefolded tokens produced by ``word_tokenize``.
        filtered_words: ``words`` without English stop words.
        stemmed_words: stems of ``filtered_words``.
        hyphen, question, exclamation, colon, semicolon, comma: lists that
            are parallel to ``words`` and hold, for every token, how many
            times the respective symbol occurs in it.
    """

    def __init__(self, text:str):
        self.text = text
        self.words = [token.casefold() for token in word_tokenize(text)]
        self.filtered_words = self._remove_stop_words(self.words)
        self.stemmed_words = list(map(stemmer.stem, self.filtered_words))
        self.hyphen = self.count_symbol('-')
        self.question = self.count_symbol('?')
        self.exclamation = self.count_symbol('!')
        self.colon = self.count_symbol(':')
        self.semicolon = self.count_symbol(';')
        self.comma = self.count_symbol(',')

    def _remove_stop_words(self, words:list):
        """Return the entries of ``words`` that are not English stop words."""
        return [word for word in words if word.casefold() not in stop_words]

    def count_symbol(self, symbol:str):
        """Return, for every token in ``self.words``, how often ``symbol`` occurs in it."""
        return [word.count(symbol) for word in self.words]
```

%% Cell type:code id: tags:

``` python
# Load one Title per row of the tab-separated input file.
# newline='' lets the csv module do its own newline handling, as the csv
# documentation requires for files passed to csv.reader.
with open(input_filename, 'rt', encoding='utf-8', newline='') as f:
    csv_reader = csv.reader(f, delimiter='\t')
    # Blank lines are yielded as empty rows; skip them instead of failing
    # on row[0].
    # NOTE(review): every non-empty row is treated as a title, including a
    # header row if the file has one -- confirm against the data file.
    titles = [Title(row[0]) for row in csv_reader if row]
```

%% Cell type:markdown id: tags:

Names of first five titles.

%% Cell type:code id: tags:

``` python
# Slice instead of indexing so that fewer than five loaded titles do not
# raise an IndexError.
for title in titles[:5]:
    print(title.text)
```

%% Cell type:markdown id: tags:

Number of hyphens occurring in each title.

%% Cell type:code id: tags:

``` python
# Total number of hyphens in each title (summed over its tokens).
hyphens = [sum(title.hyphen) for title in titles]
# Use one unit-wide bin per integer count, [0, 1), [1, 2), ..., so that no two
# counts share a bin. This matches the colon-position histogram below and
# also works when no title contains a hyphen.
max_hyphens = max(hyphens, default=0)
plt.hist(hyphens, bins=max_hyphens + 1, range=(0, max_hyphens + 1))
```

%% Cell type:markdown id: tags:

How many hyphenated words consist of three or more parts, i.e. contain two or more hyphens?

%% Cell type:code id: tags:

``` python
# Count the tokens that contain exactly two hyphens; for every token with
# more than two hyphens, print the title it belongs to.
count_2 = 0
for title in titles:
    for hyphen_count in title.hyphen:
        if hyphen_count == 2:
            count_2 += 1
        elif hyphen_count > 2:
            print(title.text)
print(f"Words with two hyphens: {count_2}")
```

%% Cell type:markdown id: tags:

Many titles have at least one hyphenated word, and some have several. Let's see which words are most commonly used in those compounds.

%% Cell type:code id: tags:

``` python
def split_hyphens(word_tokens:list):
    """Split every hyphenated token into its hyphen-separated parts.

    Tokens without a hyphen are dropped. The result holds one list of parts
    per hyphenated token, in the order the tokens appear.
    """
    return [token.split('-') for token in word_tokens if '-' in token]

# For every title, the hyphen-separated parts of each hyphenated token.
hyphen_segments = [split_hyphens(title.words) for title in titles]

# Collect the first and second part of every hyphenated token.
# A token containing '-' always splits into at least two parts, but a part is
# empty when the hyphen is at the start or end of the token (or the token is
# a bare '-'); empty parts are not words and are left out of the statistics.
first_words = []
second_words = []
for title_segments in hyphen_segments:
    for parts in title_segments:
        if parts[0]:
            first_words.append(parts[0])
        if parts[1]:
            second_words.append(parts[1])

freq_dist_first = FreqDist(first_words)
freq_dist_second = FreqDist(second_words)

print(freq_dist_first.most_common(20))
print(freq_dist_second.most_common(20))
```

%% Cell type:markdown id: tags:

How many titles have a colon in them?

%% Cell type:code id: tags:

``` python
# Number of colons in each title (summed over its tokens).
colons = [sum(title.colon) for title in titles]
# Count the titles with at least one colon. Summing the per-title counts
# would count a title with two colons twice.
print(sum(1 for colon_count in colons if colon_count > 0))
```

%% Cell type:markdown id: tags:

Where is the first colon in the title? Each position is one word. Excluding titles without a colon.

%% Cell type:code id: tags:

``` python
def find_first(lst:list):
    """Return the index of the first positive entry in ``lst``, or -1 if none.

    ``lst`` holds per-token occurrence counts of a symbol. Any count above
    zero marks a hit, so a token that contains the symbol more than once is
    found as well (searching for the value 1 would skip it).
    """
    return next((index for index, count in enumerate(lst) if count > 0), -1)
# Token index of the first colon in every title (find_first yields -1 when
# a title has none).
colons = [find_first(title.colon) for title in titles]
# The histogram range starts at 0, so the -1 entries fall outside it and are
# not drawn.
upper_edge = max(colons) + 1
plt.hist(colons, upper_edge, range=(0, upper_edge))
```

%% Cell type:markdown id: tags:

Many titles have a colon in the second position, which suggests that the first word is the name of the tool.

%% Cell type:code id: tags:

``` python
# A title whose second token is a colon is taken to start with a tool name;
# collect that first token.
tool_names = [
    title.words[0]
    for title in titles
    if len(title.words) > 1 and title.words[1] == ':'
]
# Slice instead of indexing so that fewer than 20 tool names do not raise an
# IndexError.
for tool_name in tool_names[:20]:
    print(tool_name)
```

%% Cell type:markdown id: tags:

Now that we have extracted the tool names, what are the most common substrings appearing in them?
For every substring length from 3 to 7, show the 8 most frequent substrings and how many times each of them appears in the tool names.

%% Cell type:code id: tags:

``` python
# Join the tool names with newlines so a substring spanning two names can be
# recognised (it contains '\n') and discarded.
tool_names_str = '\n'.join(tool_names)
top_count = 8

# For every substring length from 3 to 7, print the most frequent substrings
# together with their counts, and write each of them to the word cloud input
# file once per occurrence (one substring per line).
# The counting is done once and shared by the printed report and the file.
# utf-8 matches the encoding the titles were read with.
with open('world_cloud.txt', 'w', encoding='utf-8') as f:
    for length in range(3, 8):
        # Stop at len - length so every window has exactly `length`
        # characters; shorter windows at the end of the string are not
        # substrings of the requested length.
        windows = (
            tool_names_str[start:start + length]
            for start in range(len(tool_names_str) - length + 1)
        )
        substring_counts = FreqDist(
            window for window in windows if '\n' not in window
        )
        # most_common() returns fewer entries when fewer than `top_count`
        # distinct substrings exist, instead of failing.
        for substring, occurrences in substring_counts.most_common(top_count):
            f.write((substring + '\n') * occurrences)
            print((substring, occurrences))

```

%% Cell type:markdown id: tags:

What is the word frequency in the titles?

%% Cell type:code id: tags:

``` python
# Flatten the stop-word-filtered tokens of all titles into one list and show
# the 50 most frequent ones.
words_list = [word for title in titles for word in title.filtered_words]
word_freq_dist = FreqDist(words_list)
print(word_freq_dist.most_common(50))
```

%% Cell type:markdown id: tags:

Some similar words (visualization/visual) are counted in different bins. Use the stemmed words instead to compute the frequency.

%% Cell type:code id: tags:

``` python
# Flatten the stemmed tokens of all titles into one list and show the 50 most
# frequent stems.
stem_words_list = [stem for title in titles for stem in title.stemmed_words]
stem_freq_dist = FreqDist(stem_words_list)
print(stem_freq_dist.most_common(50))
```

%% Cell type:markdown id: tags:

What (stemmed) words are showing up together often?

%% Cell type:code id: tags:

``` python
# Print up to 30 collocations of the stemmed words, using a window of two
# tokens.
nltk_text = nltk.Text(stem_words_list)
nltk_text.collocations(num=30, window_size=2)
```

%% Cell type:markdown id: tags:

Find the longest title (string length).

%% Cell type:code id: tags:

``` python
# Longest title by character count; the first one wins on ties and the empty
# string is used when there are no titles.
longest = max((title.text for title in titles), key=len, default="")
print(len(longest))
print(longest)
```

%% Cell type:markdown id: tags:

Find the longest title (number of words).

%% Cell type:code id: tags:

``` python
# Longest title by token count; the first one wins on ties.
longest, text = 0, ""
for word_count, title_text in ((len(t.words), t.text) for t in titles):
    if word_count > longest:
        longest, text = word_count, title_text
print(longest)
print(text)
```

%% Cell type:markdown id: tags:

Histogram of word count of the titles.

%% Cell type:code id: tags:

``` python
# Map each token count to the number of titles that have that many tokens.
counts = {}
for title in titles:
    n_words = len(title.words)
    counts[n_words] = counts.get(n_words, 0) + 1
# Order the entries by token count so the bars and printed values are sorted.
counts = dict(sorted(counts.items()))
plt.bar(list(counts), counts.values())
print(counts.values())
print(counts.keys())
```

%% Cell type:markdown id: tags:

What is the average number of words in the title?

%% Cell type:code id: tags:

``` python
# Average number of tokens per title.
sum_words = sum(len(title.words) for title in titles)
print(sum_words / len(titles))
```

%% Cell type:markdown id: tags:

What are the most common words in each position?

%% Cell type:code id: tags:

``` python
# Build an "average title": for each of the first `max_positions` token
# positions, find the most frequent non-stop-word across all titles.
max_positions = 10

# Maps a token position to the non-stop-words found at that position.
words_at_pos = {}
for title in titles:
    # Slicing bounds the positions to those the title actually has, so no
    # exception handling is needed for short titles.
    for position, word in enumerate(title.words[:max_positions]):
        if word not in stop_words:
            words_at_pos.setdefault(position, []).append(word)

# (word, count) of the most frequent word per position, in position order.
avg_title = []
for position in range(max_positions):
    candidates = words_at_pos.get(position)
    if not candidates:
        # No title contributed a word at this position.
        continue
    freq_dist = FreqDist(candidates)
    top_word = freq_dist.max()
    # Read the count directly instead of rebuilding it from the relative
    # frequency, which can lose one to float rounding and truncation.
    avg_title.append((top_word, freq_dist[top_word]))
print(avg_title)
print(" ".join(word for word, _ in avg_title))

```