feat: initial EDA (86b13362) · Commits · Diana Valková / Visualization Project

exploratory_data_analysis.py

0 → 100644

+99 −0

Original line number	Diff line number	Diff line
		import pandas as pd
		import numpy as np
		import ast
		import matplotlib.pyplot as plt

		# Read the example games CSV into the working DataFrame
		df = pd.read_csv(filepath_or_buffer="./data/games_03_2025_reduced.csv")

		# --- 1. Parse JSON/array columns ---
		def _parse_languages(x):
		    """
		    Turn one 'supported_languages' cell into a list.

		    The cell is expected to be a string like "['English', 'French']".
		    A cell that is already a list is returned unchanged. Anything else
		    (NaN/None, non-string values, strings that do not start with '[',
		    strings that fail to parse) yields an empty list, so one malformed
		    row cannot abort the whole script.
		    """
		    if isinstance(x, list):
		        return x
		    if not isinstance(x, str) or not x.startswith('['):
		        return []
		    try:
		        parsed = ast.literal_eval(x)
		    except (ValueError, SyntaxError):
		        return []
		    return parsed if isinstance(parsed, list) else []

		# Applied regardless of the column dtype: a check for `object` would skip
		# parsing when pandas infers a dedicated string dtype, and the later
		# language count would then iterate over raw strings character by character.
		df['supported_languages'] = df['supported_languages'].apply(_parse_languages)

		def parse_tags(x):
		    """
		    Parse the string representation of a tag dictionary into a dict.

		    Handles dictionary-like strings ("{'tag': count}"), empty list
		    strings ("[]"), NaN/None and any other non-string value.

		    Returns the parsed dict. A value that is already a dict is returned
		    unchanged. Everything else -- missing values, strings that are not a
		    valid Python literal, or literals that are not a dict (e.g. "[]" or
		    "['a', 'b']") -- yields an empty dict, so callers can always rely on
		    dict methods.
		    """
		    if isinstance(x, dict):
		        return x
		    if not isinstance(x, str):
		        # NaN, None, numbers, ...: nothing to parse
		        return {}
		    try:
		        parsed = ast.literal_eval(x)
		    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
		        # literal_eval documents all of these for malformed or hostile input
		        return {}
		    return parsed if isinstance(parsed, dict) else {}

		# Replace every raw 'tags' cell with the result of parse_tags
		df['tags'] = df['tags'].map(parse_tags)

		# --- 2. Handle price=0 ---
		def clean_price(row):
		    """
		    Return the cleaned price for one row.

		    ``row`` is indexed with the keys 'price' and 'tags'. A price of 0 is
		    kept only when ``tags`` is a dict that has a key equal to
		    "free to play" (compared case-insensitively); any other 0 price is
		    treated as missing and ``np.nan`` is returned. Non-zero prices
		    (including NaN) are returned unchanged.
		    """
		    price = row['price']
		    tags = row['tags']
		    if price != 0:
		        return price
		    # Non-string keys are skipped instead of raising on .lower()
		    is_free_to_play = isinstance(tags, dict) and any(
		        isinstance(t, str) and t.lower() == "free to play" for t in tags
		    )
		    return 0 if is_free_to_play else np.nan  # 0 = true free to play, NaN = missing

		# Row-wise pass: clean_price receives each row and its result becomes 'price_clean'
		df['price_clean'] = df.apply(clean_price, axis="columns")


		# --- 4. Clean playtime columns ---
		# 0 playtime might mean missing if no one played / no data
		playtime_cols = [
		    'average_playtime_forever',
		    'median_playtime_forever',
		    'average_playtime_2weeks',
		    'median_playtime_2weeks',
		]
		for col in playtime_cols:
		    # Keep the raw column untouched; the copy with zeros replaced by NaN
		    # is stored next to it as "<col>_clean".
		    df[f"{col}_clean"] = df[col].replace(0, np.nan)

		# --- 5. Missingness exploration ---
		# Shape of the frame as it stands at this point of the script
		print("\n--- Dataset size ---")
		print(df.shape)

		# NaN count per column, largest first
		print("\n--- Missing values (NaN only) ---")
		print(df.isna().sum().sort_values(ascending=False))

		# Zero count per numeric column, largest first (zeros may stand for missing data)
		print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---")
		num_cols = df.select_dtypes(include=[np.number]).columns
		zero_counts = df[num_cols].eq(0).sum().sort_values(ascending=False)
		print(zero_counts)

		# --- 6. Basic EDA ---
		# Summary statistics of the cleaned price column
		print("\n--- Price overview ---")
		print(df['price_clean'].describe())

		# Cleaned average playtime divided by 60 (minutes → hours, assuming the
		# source column is in minutes)
		print("\n--- Playtime overview (hours) ---")
		print(df['average_playtime_forever_clean'].div(60).describe())

		# Summary statistics of the two review columns
		print("\n--- Reviews overview ---")
		print(df.loc[:, ['num_reviews_total', 'pct_pos_total']].describe())

		# --- 7. Visualization Examples ---
		# Each plot is drawn on its own explicitly created Axes. Relying on the
		# implicit "current figure" would draw the scatter on top of the histogram
		# whenever plt.show() does not close the first figure (e.g. with a
		# non-interactive backend).

		# Distribution of prices (log-scaled counts)
		_, price_ax = plt.subplots()
		df['price_clean'].dropna().plot(
		    kind='hist', bins=50, logy=True, title="Price distribution", ax=price_ax
		)
		price_ax.set_xlabel("Price ($)")
		plt.show()

		# Playtime vs Reviews (both axes log-scaled)
		_, scatter_ax = plt.subplots()
		scatter_ax.scatter(
		    df['average_playtime_forever_clean'] / 60,
		    df['num_reviews_total'],
		    alpha=0.3,
		)
		scatter_ax.set_xscale("log")
		scatter_ax.set_yscale("log")
		scatter_ax.set_xlabel("Average Playtime (hours, log)")
		scatter_ax.set_ylabel("Total Reviews (log)")
		scatter_ax.set_title("Playtime vs Reviews")
		plt.show()

		# --- 8. Language & Tag exploration ---
		# Top supported languages
		from collections import Counter
		# Cells that are not a list/tuple (e.g. NaN) are skipped instead of
		# raising TypeError when iterated.
		lang_counts = Counter(
		    lang
		    for langs in df['supported_languages']
		    if isinstance(langs, (list, tuple))
		    for lang in langs
		)
		print("\n--- Top supported languages ---")
		print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

		# Top tags
		# Same guard as clean_price: only dict cells contribute their keys.
		tag_counts = Counter(
		    tag
		    for tags in df['tags']
		    if isinstance(tags, dict)
		    for tag in tags
		)
		print("\n--- Top tags ---")
		print(pd.Series(tag_counts).sort_values(ascending=False).head(10))