Add wordcloud output. (5263b65a) · Commits · Matěj Lang / VIS Name Generator

analysis.ipynb

+28 −21

Original line number	Diff line number	Diff line
		%% Cell type:code id: tags:

		``` python
		import csv
		import itertools
		import nltk
		import matplotlib.pyplot as plt

		nltk.download('punkt')
		nltk.download('stopwords')
		from nltk import FreqDist
		from nltk.tokenize import word_tokenize
		from nltk.corpus import stopwords
		from nltk.stem.snowball import EnglishStemmer
		```

		%% Cell type:code id: tags:

		``` python
		# Tab-separated input file; the loading cell reads the first column of each row as a title.
		input_filename = 'vis2011-2021.tsv'
		```

		%% Cell type:code id: tags:

		``` python
		# English stop words from the NLTK corpus, stored as a set for fast membership tests.
		stop_words = set(stopwords.words("english"))
		# Snowball stemmer for English; Title uses it to build its stemmed word list.
		stemmer = EnglishStemmer()
		```

		%% Cell type:code id: tags:

		``` python
		class Title:
		    """A paper title together with token lists and punctuation counts derived from it.

		    Relies on the module-level ``stop_words`` set and ``stemmer`` instance.
		    """

		    def __init__(self, text: str):
		        """Tokenise *text* and precompute the derived word lists and symbol counts."""
		        # Raw title exactly as read from the input.
		        self.text = text
		        # Case-folded tokens produced by NLTK's word_tokenize (punctuation tokens included).
		        self.words = [token.casefold() for token in word_tokenize(text)]
		        # Tokens with English stop words removed.
		        self.filtered_words = self._remove_stop_words(self.words)
		        # Stemmed form of every filtered token.
		        self.stemmed_words = [stemmer.stem(word) for word in self.filtered_words]
		        # Per-token occurrence counts of selected punctuation symbols;
		        # each list is aligned index-by-index with self.words.
		        self.hyphen = self.count_symbol('-')
		        self.question = self.count_symbol('?')
		        self.exclamation = self.count_symbol('!')
		        self.colon = self.count_symbol(':')
		        self.semicolon = self.count_symbol(';')
		        self.comma = self.count_symbol(',')

		    def _remove_stop_words(self, words: list) -> list:
		        """Return the entries of *words* whose case-folded form is not a stop word."""
		        return [word for word in words if word.casefold() not in stop_words]

		    def count_symbol(self, symbol: str) -> list:
		        """Return, for every token in ``self.words``, how often *symbol* occurs in it."""
		        return [word.count(symbol) for word in self.words]
		```

		%% Cell type:code id: tags:

		``` python
		# Read the TSV file and wrap the first column of every row in a Title.
		# NOTE(review): every row is treated as data; if the file has a header row it
		# ends up in `titles` as well - confirm against the input file.
		titles = []
		# newline='' lets the csv module handle line endings itself, as its docs require.
		with open(input_filename, 'rt', encoding='utf-8', newline='') as f:
		    csv_reader = csv.reader(f, delimiter='\t')
		    for line in csv_reader:
		        # csv.reader yields an empty list for blank lines; line[0] would raise there.
		        if not line:
		            continue
		        titles.append(Title(line[0]))
		```

		%% Cell type:markdown id: tags:

		The first five titles.

		%% Cell type:code id: tags:

		``` python
		# Slicing keeps this safe when fewer than five titles were loaded.
		for title in titles[:5]:
		    print(title.text)
		```

		%% Cell type:markdown id: tags:

		Number of hyphens occurring in each title.

		%% Cell type:code id: tags:

		``` python
		# Total number of hyphens per title (sum of the per-token counts).
		hyphens = [sum(title.hyphen) for title in titles]
		# One unit-wide bin per possible count 0..max. Using max(hyphens) as the bin
		# count merges the two largest values into one bin and fails when max is 0.
		max_hyphens = max(hyphens, default=0)
		plt.hist(hyphens, bins=max_hyphens + 1, range=(0, max_hyphens + 1))
		```

		%% Cell type:markdown id: tags:

		How many hyphenated compounds are made of three or more parts, i.e. how many words contain two or more hyphens?

		%% Cell type:code id: tags:

		``` python
		# Count tokens containing exactly two hyphens; for tokens with more than two,
		# print the title they belong to.
		count_2 = 0
		for title in titles:
		    for hyphen_count in title.hyphen:
		        if hyphen_count == 2:
		            count_2 += 1
		        elif hyphen_count > 2:
		            print(title.text)
		print(f"Words with two hyphens: {count_2}")
		```

		%% Cell type:markdown id: tags:

		There are many titles that have at least one hyphenated word, and some have even more. Let's see which words are most commonly used in those compounds.

		%% Cell type:code id: tags:

		``` python
		def split_hyphens(word_tokens: list):
		    """Split every hyphenated token into its parts.

		    Returns one list of parts per token that contains a hyphen and has at
		    least two non-empty parts. A lone "-" token, or a token with only a
		    leading/trailing hyphen, is skipped so that empty strings never show up
		    as compound parts.
		    """
		    segments = []
		    for word in word_tokens:
		        if '-' not in word:
		            continue
		        # Drop the empty pieces that split() yields for leading, trailing or doubled hyphens.
		        parts = [part for part in word.split('-') if part]
		        if len(parts) >= 2:
		            segments.append(parts)
		    return segments

		# One list of split compounds per title.
		hyphen_segments = [split_hyphens(title.words) for title in titles]

		# Flatten to individual compounds, then collect their first and second parts.
		compounds = [parts for segments in hyphen_segments for parts in segments]
		first_words = [parts[0] for parts in compounds]
		second_words = [parts[1] for parts in compounds]

		freq_dist_first = FreqDist(first_words)
		freq_dist_second = FreqDist(second_words)

		print(freq_dist_first.most_common(20))
		print(freq_dist_second.most_common(20))
		```

		%% Cell type:markdown id: tags:

		How many titles have a colon in them?

		%% Cell type:code id: tags:

		``` python
		# Number of colons per title (sum of the per-token counts).
		colons = [sum(title.colon) for title in titles]
		# Count titles that contain at least one colon. Summing the counts would
		# give the total number of colons and count multi-colon titles more than once.
		print(sum(1 for colon_count in colons if colon_count > 0))
		```

		%% Cell type:markdown id: tags:

		Where is the first colon in the title? Positions are counted in word tokens; titles without a colon are excluded.

		%% Cell type:code id: tags:

		``` python
		def find_first(lst: list):
		    """Return the index of the first non-zero count in *lst*, or -1 if there is none.

		    Any positive count matches, so an entry holding a count above 1 is found too.
		    """
		    return next((index for index, count in enumerate(lst) if count > 0), -1)
		# Token index of the first colon per title; -1 marks titles without one.
		colons = [find_first(title.colon) for title in titles]
		# The histogram range starts at 0, so the -1 entries fall outside it.
		upper = max(colons) + 1
		plt.hist(colons, upper, range=(0, upper))
		```

		%% Cell type:markdown id: tags:

		There are many titles with a colon in the second position, implying that the first word is the name of a tool.

		%% Cell type:code id: tags:

		``` python
		# A colon as the second token suggests the first token is a tool name.
		tool_names = [
		    title.words[0]
		    for title in titles
		    if len(title.words) > 1 and title.words[1] == ':'
		]
		# Slicing keeps this safe when fewer than 20 names were found.
		for name in tool_names[:20]:
		    print(name)
		```

		%% Cell type:markdown id: tags:

		Now that we have extracted the tool names, what are the most common substrings appearing in them?
		For every substring length from 3 to 7 (inclusive), show the top 8 results and how many times each of them appears in the tool names.

		%% Cell type:code id: tags:

		``` python
		# For every substring length 3..7, find the 8 most frequent substrings of the
		# tool names, print them with their counts, and write each one to the
		# word-cloud input file once per occurrence.
		from collections import Counter

		# Newline-joined so that windows spanning two names can be recognised and dropped.
		tool_names_str = '\n'.join(tool_names)
		top_n = 8
		# NOTE(review): the file name reads 'world_cloud', probably meant 'word_cloud';
		# left unchanged because other tooling may read this path.
		with open('world_cloud.txt', 'w', encoding='utf-8') as f:
		    for length in range(3, 8):
		        # Stop at len - length so truncated windows at the end of the string
		        # are not counted as substrings of this length.
		        windows = (
		            tool_names_str[start:start + length]
		            for start in range(len(tool_names_str) - length + 1)
		        )
		        substring_counts = Counter(
		            window for window in windows if '\n' not in window
		        )
		        # most_common() returns fewer entries when fewer than top_n distinct
		        # substrings exist, and breaks ties by first occurrence.
		        for substring, count in substring_counts.most_common(top_n):
		            f.write((substring + '\n') * count)
		            print((substring, count))

		```

		%% Cell type:markdown id: tags:

		What is the word frequency in the titles?

		%% Cell type:code id: tags:

		``` python
		# Flatten the stop-word-filtered tokens of all titles into one list.
		words_list = [word for title in titles for word in title.filtered_words]
		word_freq_dist = FreqDist(words_list)
		print(word_freq_dist.most_common(50))
		```

		%% Cell type:markdown id: tags:

		Some similar words (visualization/visual) are counted in different bins. Use the stemmed words instead to compute the frequency.

		%% Cell type:code id: tags:

		``` python
		# Flatten the stemmed tokens of all titles into one list.
		stem_words_list = [stem for title in titles for stem in title.stemmed_words]
		stem_freq_dist = FreqDist(stem_words_list)
		print(stem_freq_dist.most_common(50))
		```

		%% Cell type:markdown id: tags:

		What (stemmed) words are showing up together often?

		%% Cell type:code id: tags:

		``` python
		# Print up to 30 collocations using a window of 2 tokens.
		# NOTE(review): stem_words_list concatenates all titles, so pairs may span
		# the boundary between two titles - confirm this is acceptable.
		nltk_text = nltk.Text(stem_words_list)
		nltk_text.collocations(num=30, window_size=2)
		```

		%% Cell type:markdown id: tags:

		Find the longest title (string length).

		%% Cell type:code id: tags:

		``` python
		# Longest title by character count; max() keeps the first one on ties and
		# falls back to the empty string when there are no titles.
		longest = max((title.text for title in titles), key=len, default="")
		print(len(longest))
		print(longest)
		```

		%% Cell type:markdown id: tags:

		Find the longest title (number of words).

		%% Cell type:code id: tags:

		``` python
		# Longest title by number of tokens; the first one wins on ties.
		longest, text = 0, ""
		for title in titles:
		    word_count = len(title.words)
		    if word_count > longest:
		        longest, text = word_count, title.text
		print(longest)
		print(text)
		```

		%% Cell type:markdown id: tags:

		Histogram of word count of the titles.

		%% Cell type:code id: tags:

		``` python
		# Map each token count to the number of titles that have that many tokens.
		counts = {}
		for title in titles:
		    n_words = len(title.words)
		    counts[n_words] = counts.get(n_words, 0) + 1
		# Re-key in ascending order of token count for plotting and printing.
		counts = dict(sorted(counts.items()))
		plt.bar(list(counts.keys()), counts.values())
		print(counts.values())
		print(counts.keys())
		```

		%% Cell type:markdown id: tags:

		What is the average number of words in the title?

		%% Cell type:code id: tags:

		``` python
		# Mean number of tokens per title.
		sum_words = sum(len(title.words) for title in titles)
		print(sum_words/len(titles))
		```

		%% Cell type:markdown id: tags:

		What are the most common words in each position?

		%% Cell type:code id: tags:

		``` python
		# For each of the first 10 token positions, collect the non-stop-word tokens
		# found there across all titles, then report the most frequent one.
		max_positions = 10
		words_at_pos = {}
		for title in titles:
		    # Slicing handles titles shorter than max_positions without try/except.
		    for position, word in enumerate(title.words[:max_positions]):
		        if word not in stop_words:
		            words_at_pos.setdefault(position, []).append(word)

		# (word, count) of the most frequent word at each position.
		avg_title = []
		for position in range(max_positions):
		    freq_dist = FreqDist(words_at_pos.get(position, []))
		    # A position with no non-stop-word token has nothing to report.
		    if not freq_dist:
		        continue
		    # Take the count directly; len * relative frequency can truncate through float rounding.
		    top_word, top_count = freq_dist.most_common(1)[0]
		    avg_title.append((top_word, top_count))
		print(" ".join(word for word, _ in avg_title))

		```