feat: remove all nans (eaed9908) · Commits · Diana Valková / Visualization Project

exploratory_data_analysis.py

+137 −25

Original line number	Diff line number	Diff line
		@@ -2,6 +2,7 @@ import pandas as pd
		import numpy as np
		import ast
		import matplotlib.pyplot as plt
		from collections import Counter

		# Load dataset (example CSV)
		# The path is relative, so it is resolved against the current working
		# directory rather than against this script's own location.
		df = pd.read_csv("./data/games_03_2025_reduced.csv")
		@@ -39,17 +40,17 @@ def clean_price(row):
		return np.nan # treat as missing
		return price

		# NOTE(review): this region looks like a rendered diff in which removed and
		# added lines are interleaved (see the two consecutive numbered section
		# headers below). Presumably the `*_clean` assignments are the removed
		# variants and the in-place overwrites are the current ones — confirm
		# against the repository before relying on either.
		# Run clean_price on every row (axis=1) and keep the result in a separate column.
		df['price_clean'] = df.apply(clean_price, axis=1)

		# Overwrite original price column
		df['price'] = df.apply(clean_price, axis=1)

		# --- 4. Clean playtime columns ---
		# --- 3. Clean playtime columns (overwrite originals) ---
		# 0 playtime might mean missing if no one played / no data
		playtime_cols = ['average_playtime_forever', 'median_playtime_forever',
		'average_playtime_2weeks', 'median_playtime_2weeks']
		for col in playtime_cols:
		# Variant that keeps the raw column and writes the NaN-substituted copy next to it.
		df[col + "_clean"] = df[col].replace(0, np.nan)
		# Variant that replaces zeros with NaN directly in the original column.
		df[col] = df[col].replace(0, np.nan)

		# --- 5. Missingness exploration ---
		# --- 4. Missingness exploration ---
		print("\n--- Dataset size ---")
		# (rows, columns) of the frame at this point in the script.
		print(df.shape)

		@@ -61,39 +62,150 @@ num_cols = df.select_dtypes(include=[np.number]).columns
		# For each column in num_cols, count the entries that are exactly 0 and list
		# the columns with the most zeros first.
		# NOTE(review): if the playtime columns had their zeros replaced by NaN
		# in place earlier in the script, they will report 0 here — confirm that is intended.
		zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False)
		print(zero_counts)

		# --- 6. Basic EDA ---
		# --- 5. Basic EDA ---
		# NOTE(review): the paired statements below (`price_clean` vs `price`,
		# `average_playtime_forever_clean` vs `average_playtime_forever`) look like the
		# old and new side of a diff shown together — presumably only one of each pair
		# is live; confirm.
		print("\n--- Price overview ---")
		print(df['price_clean'].describe())
		print(df['price'].describe())

		print("\n--- Playtime overview (hours) ---")
		print((df['average_playtime_forever_clean'] / 60).describe()) # convert mins → hours
		print((df['average_playtime_forever'] / 60).describe()) # convert mins → hours

		print("\n--- Reviews overview ---")
		# Summary statistics for the two review columns side by side.
		print(df[['num_reviews_total', 'pct_pos_total']].describe())

		# --- 7. Visualization Examples ---
		# --- 6. REMOVE ROWS WITH ANY NaN VALUES ---
		banner = "=" * 60
		print("\n" + banner)
		print("REMOVING ROWS WITH ANY NaN VALUES")
		print(banner)

		# Report the starting size and how many rows hold at least one missing value.
		total_rows, total_cols = df.shape
		rows_with_nan = df.isnull().any(axis=1).sum()
		print(f"Original dataset size: {total_rows:,} rows, {total_cols} columns")
		print(f"Rows with any NaN: {rows_with_nan:,}")

		# Keep only the rows in which every column is populated.
		df_complete = df.dropna()
		kept_rows, kept_cols = df_complete.shape
		print(f"Complete dataset size: {kept_rows:,} rows, {kept_cols} columns")
		print(f"Data retention: {kept_rows / total_rows * 100:.1f}%")

		# --- 7. ANALYSIS ON COMPLETE DATASET ---
		section_rule = "=" * 60
		print("\n" + section_rule)
		print("ANALYSIS ON COMPLETE DATASET (NO NaN VALUES)")
		print(section_rule)

		complete_count = len(df_complete)

		# Average playtime per game, divided by 60 to express it in hours.
		playtime_hours = df_complete['average_playtime_forever'] / 60

		print(f"\n--- Average Playtime Forever Analysis (n={complete_count:,}) ---")
		# Each entry is (label, value); the loop below prints them in this order.
		average_summary = (
		    ("Mean", playtime_hours.mean()),
		    ("Median", playtime_hours.median()),
		    ("Standard Deviation", playtime_hours.std()),
		    ("Min", playtime_hours.min()),
		    ("Max", playtime_hours.max()),
		    ("25th percentile", playtime_hours.quantile(0.25)),
		    ("75th percentile", playtime_hours.quantile(0.75)),
		    ("95th percentile", playtime_hours.quantile(0.95)),
		    ("99th percentile", playtime_hours.quantile(0.99)),
		)
		for stat_name, stat_hours in average_summary:
		    print(f"{stat_name}: {stat_hours:.2f} hours")

		# Median playtime per game, converted to hours the same way.
		median_playtime_hours = df_complete['median_playtime_forever'] / 60

		print(f"\n--- Median Playtime Forever Analysis (n={complete_count:,}) ---")
		median_summary = (
		    ("Mean of medians", median_playtime_hours.mean()),
		    ("Median of medians", median_playtime_hours.median()),
		    ("Standard Deviation", median_playtime_hours.std()),
		    ("Min", median_playtime_hours.min()),
		    ("Max", median_playtime_hours.max()),
		    ("25th percentile", median_playtime_hours.quantile(0.25)),
		    ("75th percentile", median_playtime_hours.quantile(0.75)),
		)
		for stat_name, stat_hours in median_summary:
		    print(f"{stat_name}: {stat_hours:.2f} hours")

		# Price analysis
		price_data = df_complete['price']
		n_complete = len(df_complete)

		print(f"\n--- Price Analysis (n={n_complete:,}) ---")
		# Each entry is (label, value); the loop below prints them in this order.
		price_summary = (
		    ("Mean", price_data.mean()),
		    ("Median", price_data.median()),
		    ("Standard Deviation", price_data.std()),
		    ("Min", price_data.min()),
		    ("Max", price_data.max()),
		    ("25th percentile", price_data.quantile(0.25)),
		    ("75th percentile", price_data.quantile(0.75)),
		    ("95th percentile", price_data.quantile(0.95)),
		    ("99th percentile", price_data.quantile(0.99)),
		)
		for stat_name, stat_value in price_summary:
		    print(f"{stat_name}: ${stat_value:.2f}")

		# Free vs Paid games breakdown (shares are relative to the complete dataset).
		is_paid = price_data > 0
		free_games = (price_data == 0).sum()
		paid_games = is_paid.sum()
		print(f"\nFree games: {free_games:,} ({free_games / n_complete * 100:.1f}%)")
		print(f"Paid games: {paid_games:,} ({paid_games / n_complete * 100:.1f}%)")

		# Price ranges for paid games only
		paid_prices = price_data[is_paid]
		if not paid_prices.empty:
		    print(f"\n--- Paid Games Price Analysis (n={len(paid_prices):,}) ---")
		    print(f"Mean: ${paid_prices.mean():.2f}")
		    print(f"Median: ${paid_prices.median():.2f}")
		    # Boolean masks over paid_prices: .sum() gives the count, .mean() the share.
		    price_bands = (
		        ("Under $10", paid_prices < 10),
		        ("$10-$30", (paid_prices >= 10) & (paid_prices < 30)),
		        ("$30-$60", (paid_prices >= 30) & (paid_prices < 60)),
		        ("Over $60", paid_prices >= 60),
		    )
		    for band_name, in_band in price_bands:
		        print(f"{band_name}: {in_band.sum():,} ({in_band.mean() * 100:.1f}%)")

		# Correlations on complete dataset
		avg_playtime_col = df_complete['average_playtime_forever']
		price_col = df_complete['price']
		review_count_col = df_complete['num_reviews_total']

		correlation_playtime_price = avg_playtime_col.corr(price_col)
		correlation_playtime_reviews = avg_playtime_col.corr(review_count_col)
		correlation_price_reviews = price_col.corr(review_count_col)

		print("\n--- Correlations (Complete Dataset) ---")
		print(f"Average playtime vs Price: {correlation_playtime_price:.3f}")
		print(f"Average playtime vs Total reviews: {correlation_playtime_reviews:.3f}")
		print(f"Price vs Total reviews: {correlation_price_reviews:.3f}")

		# Additional insights
		print("\n--- Additional Insights (Complete Dataset) ---")
		# Boolean masks: .sum() gives the number of matching games, .mean() their share.
		insight_masks = (
		    ("Games with >100 hours avg playtime", playtime_hours > 100),
		    ("Games with >1000 reviews", review_count_col > 1000),
		    ("Games with >90% positive reviews", df_complete['pct_pos_total'] > 90),
		)
		for insight_name, matches in insight_masks:
		    print(f"{insight_name}: {matches.sum():,} ({matches.mean() * 100:.1f}%)")

		# --- 8. Visualization Examples (Complete Dataset) ---
		# One figure with a 2x2 grid of axes: ax1/ax2 on the top row, ax3/ax4 on the bottom row.
		fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

		# Distribution of prices
		# NOTE(review): the next three statements (`price_clean`, `plt.xlabel`, `plt.show`)
		# look like the pre-change version shown alongside its replacement. If they were
		# live, the plot would go to whichever axes is current rather than ax1, and
		# plt.show() would display the figure before the remaining panels are drawn — confirm.
		df['price_clean'].dropna().plot(kind='hist', bins=50, logy=True, title="Price distribution")
		plt.xlabel("Price ($)")
		plt.show()
		# Histogram of price on ax1: 50 bins, logarithmic count axis.
		df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Complete Dataset)", ax=ax1)
		ax1.set_xlabel("Price ($)")
		ax1.set_ylabel("Count (log scale)")

		# Distribution of average playtime (in hours, log scale)
		# Only strictly positive values can be placed on a logarithmic axis.
		playtime_hours_plot = playtime_hours[playtime_hours > 0]

		# Equal-width bins drawn on a log x-axis put almost every game into the first
		# bar and leave the rest as slivers. Log-spaced edges (51 edges = 50 bins) give
		# bars of equal visual width. Fall back to a plain bin count when the range is
		# empty or degenerate (min() is NaN for an empty series, so `<` is False).
		shortest_hours = playtime_hours_plot.min()
		longest_hours = playtime_hours_plot.max()
		if shortest_hours < longest_hours:
		    playtime_bins = np.logspace(np.log10(shortest_hours), np.log10(longest_hours), 51)
		else:
		    playtime_bins = 50

		# Draw directly on ax2; Axes.hist accepts either a bin count or explicit edges.
		ax2.hist(playtime_hours_plot, bins=playtime_bins)
		ax2.set_xscale("log")
		ax2.set_yscale("log")
		ax2.set_title("Average Playtime Distribution (Complete Dataset)")
		ax2.set_xlabel("Average Playtime (hours, log scale)")
		ax2.set_ylabel("Count (log scale)")

		# Playtime vs Reviews
		# NOTE(review): the `plt.*` statements below use `average_playtime_forever_clean`
		# and target the current axes; they look like the pre-change version shown next
		# to the `ax3.*` replacement — presumably only the ax3 variant is live; confirm.
		plt.scatter(df['average_playtime_forever_clean']/60, df['num_reviews_total'], alpha=0.3)
		plt.xscale("log")
		plt.yscale("log")
		plt.xlabel("Average Playtime (hours, log)")
		plt.ylabel("Total Reviews (log)")
		plt.title("Playtime vs Reviews")
		# Scatter of average playtime (divided by 60) against total reviews on ax3,
		# with both axes logarithmic. Rows with a review count of 0, if any, cannot be
		# positioned on the log y-axis.
		ax3.scatter(df_complete['average_playtime_forever']/60, df_complete['num_reviews_total'], alpha=0.3)
		ax3.set_xscale("log")
		ax3.set_yscale("log")
		ax3.set_xlabel("Average Playtime (hours, log)")
		ax3.set_ylabel("Total Reviews (log)")
		ax3.set_title("Playtime vs Reviews (Complete Dataset)")

		# Price vs Reviews (for paid games)
		paid_mask = df_complete['price'] > 0
		if paid_mask.any():
		    # Select the paid rows once and reuse them for both coordinates.
		    paid_rows = df_complete[paid_mask]
		    ax4.scatter(paid_rows['price'], paid_rows['num_reviews_total'], alpha=0.3)
		    # Both axes are logarithmic; labels and title are applied in one call.
		    ax4.set(
		        xscale="log",
		        yscale="log",
		        xlabel="Price ($, log)",
		        ylabel="Total Reviews (log)",
		        title="Price vs Reviews - Paid Games (Complete Dataset)",
		    )

		# Lay out the four panels without overlap, then display the figure.
		plt.tight_layout()
		plt.show()

		# --- 8. Language & Tag exploration ---
		# --- 9. Language & Tag exploration (Complete Dataset) ---
		# NOTE(review): each counter below appears twice (once over `df`, once over
		# `df_complete`), which looks like old and new diff lines shown together —
		# presumably only the `df_complete` variants are live; confirm.
		# Top supported languages
		from collections import Counter
		# assumes each `supported_languages` cell is already a list of language names;
		# if the cells are still raw strings this would count single characters — TODO confirm
		lang_counts = Counter([lang for langs in df['supported_languages'] for lang in langs])
		print("\n--- Top supported languages ---")
		lang_counts = Counter([lang for langs in df_complete['supported_languages'] for lang in langs])
		print("\n--- Top supported languages (Complete Dataset) ---")
		# Ten most frequent languages, highest count first.
		print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

		# Top tags
		# Each `tags` cell must be a mapping, since `.keys()` is called on it; only the
		# keys are counted, the mapped values are ignored.
		tag_counts = Counter([tag for tags in df['tags'] for tag in tags.keys()])
		print("\n--- Top tags ---")
		tag_counts = Counter([tag for tags in df_complete['tags'] for tag in tags.keys()])
		print("\n--- Top tags (Complete Dataset) ---")
		# Ten most frequent tags, highest count first.
		print(pd.Series(tag_counts).sort_values(ascending=False).head(10))
		No newline at end of file