feat: add genres information - missing and counts (9268ed9e) · Commits · Diana Valková / Visualization Project

exploratory_data_analysis.py

+73 −0

Original line number	Diff line number	Diff line
		@@ -27,8 +27,81 @@ def parse_tags(x):
		except (ValueError, SyntaxError):
		return {}

		def parse_genres(x):
		    """Normalize a raw ``genres`` cell into a list of genre names.

		    Args:
		        x: Raw cell value. Handled shapes:

		            * ``list`` - returned unchanged (the same object).
		            * ``str`` wrapped in ``[...]`` - parsed as a Python literal;
		              if that fails, the brackets are dropped and the text is
		              split on commas with surrounding quotes removed.
		            * ``str`` containing a comma - split on commas.
		            * any other ``str`` - a one-element list, or ``[]`` if blank.

		    Returns:
		        list: The parsed genres; ``[]`` for blank strings and for every
		        value that is neither a ``str`` nor a ``list`` (``None``, ``NaN``,
		        numbers, ...).
		    """
		    # Lists are handled first and never handed to ``pd.isna``: on a list
		    # it returns an element-wise array, and testing that array's truth
		    # value raises ValueError as soon as the list has more than one item.
		    if isinstance(x, list):
		        return x

		    # Missing values (None / NaN / pd.NA) and unsupported types all end
		    # up here, so no separate null check is needed.
		    if not isinstance(x, str):
		        return []

		    if x.startswith('[') and x.endswith(']'):
		        try:
		            return ast.literal_eval(x)
		        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
		            # Not a valid literal (e.g. unquoted names such as
		            # "[Action, RPG]"): treat it as a comma-separated string.
		            return [
		                genre.strip().strip('"').strip("'")
		                for genre in x.strip('[]').split(',')
		                if genre.strip()
		            ]

		    if ',' in x:
		        return [genre.strip() for genre in x.split(',') if genre.strip()]

		    stripped = x.strip()
		    return [stripped] if stripped else []

		df['tags'] = df['tags'].apply(parse_tags)

		# Parse genres early in the pipeline and print a quick sanity report:
		# missing values, most common genres, and genres-per-game statistics.
		# NOTE(review): the nesting of the two trailing `else` branches was
		# reconstructed from a listing without indentation - confirm that
		# "No genres found after parsing!" belongs to `if all_genres_debug:`.
		if 'genres' in df.columns:
		    df['genres_parsed'] = df['genres'].apply(parse_genres)

		    # Number of genres per row, computed once and reused by every
		    # statistic below; a cell that is not a list counts as zero genres.
		    genres_per_game = df['genres_parsed'].apply(
		        lambda x: len(x) if isinstance(x, list) else 0
		    )

		    # Check for missing/empty genres
		    print("\n--- GENRE MISSING VALUES ANALYSIS ---")
		    original_nulls = df['genres'].isnull().sum()
		    print(f"Original NULL values in genres: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)")

		    empty_mask = genres_per_game == 0
		    empty_arrays = empty_mask.sum()
		    print(f"Empty genre arrays after parsing: {empty_arrays:,} ({empty_arrays/len(df)*100:.1f}%)")

		    games_with_genres = (genres_per_game > 0).sum()
		    print(f"Games with at least one genre: {games_with_genres:,} ({games_with_genres/len(df)*100:.1f}%)")

		    # Show some examples of empty/problematic entries
		    if empty_arrays > 0:
		        print("\nSample of games with empty genres:")
		        empty_sample = df[empty_mask][['name', 'genres', 'genres_parsed']].head(3)
		        for _, row in empty_sample.iterrows():
		            print(f" '{row['name']}': original='{row['genres']}', parsed={row['genres_parsed']}")

		    # Count all genres immediately (flattened across every row)
		    all_genres_debug = [
		        genre
		        for genres_list in df['genres_parsed']
		        if isinstance(genres_list, list)
		        for genre in genres_list
		    ]

		    if all_genres_debug:
		        genre_counts_debug = Counter(all_genres_debug)
		        print("\nGenre counts (full dataset, before filtering):")
		        for genre, count in genre_counts_debug.most_common(15):
		            print(f" {genre}: {count:,}")

		        # Show genre distribution stats
		        avg_genres_per_game = genres_per_game.mean()
		        print(f"\nAverage genres per game: {avg_genres_per_game:.2f}")

		        # Show games with most genres
		        max_genres = genres_per_game.max()
		        print(f"Maximum genres on a single game: {max_genres}")

		        if max_genres > 0:
		            multi_genre_games = df[genres_per_game == max_genres]
		            print(f"Game(s) with {max_genres} genres:")
		            for _, row in multi_genre_games[['name', 'genres_parsed']].head(2).iterrows():
		                print(f" '{row['name']}': {row['genres_parsed']}")
		    else:
		        print("No genres found after parsing!")
		else:
		    print("No 'genres' column found in dataset!")

		# --- 2. Handle price=0 ---
		def clean_price(row):
		price = row['price']