Loading exploratory_data_analysis.py +137 −25 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ import pandas as pd import numpy as np import ast import matplotlib.pyplot as plt from collections import Counter # Load dataset (example CSV) df = pd.read_csv("./data/games_03_2025_reduced.csv") Loading Loading @@ -39,17 +40,17 @@ def clean_price(row): return np.nan # treat as missing return price df['price_clean'] = df.apply(clean_price, axis=1) # Overwrite original price column df['price'] = df.apply(clean_price, axis=1) # --- 4. Clean playtime columns --- # --- 3. Clean playtime columns (overwrite originals) --- # 0 playtime might mean missing if no one played / no data playtime_cols = ['average_playtime_forever', 'median_playtime_forever', 'average_playtime_2weeks', 'median_playtime_2weeks'] for col in playtime_cols: df[col + "_clean"] = df[col].replace(0, np.nan) df[col] = df[col].replace(0, np.nan) # --- 5. Missingness exploration --- # --- 4. Missingness exploration --- print("\n--- Dataset size ---") print(df.shape) Loading @@ -61,39 +62,150 @@ num_cols = df.select_dtypes(include=[np.number]).columns zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False) print(zero_counts) # --- 6. Basic EDA --- # --- 5. Basic EDA --- print("\n--- Price overview ---") print(df['price_clean'].describe()) print(df['price'].describe()) print("\n--- Playtime overview (hours) ---") print((df['average_playtime_forever_clean'] / 60).describe()) # convert mins → hours print((df['average_playtime_forever'] / 60).describe()) # convert mins → hours print("\n--- Reviews overview ---") print(df[['num_reviews_total', 'pct_pos_total']].describe()) # --- 7. Visualization Examples --- # --- 6. 
# --- 6. REMOVE ROWS WITH ANY NaN VALUES ---
# Complete-case filtering: report how many rows contain at least one missing
# value, then keep only the fully populated rows in `df_complete`.
print("\n" + "=" * 60)
print("REMOVING ROWS WITH ANY NaN VALUES")
print("=" * 60)

print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns")
print(f"Rows with any NaN: {df.isnull().any(axis=1).sum():,}")

# dropna() with no arguments drops a row as soon as ANY column is NaN.
# NOTE(review): earlier in this script the playtime columns (including the
# *_2weeks ones) have 0 replaced by NaN, so this also discards every game with
# a zero in any of those columns -- confirm that is the intended population.
df_complete = df.dropna()

print(f"Complete dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns")
# Guard the ratio so an empty input frame reports 0.0% instead of raising
# ZeroDivisionError.
retention_pct = len(df_complete) / len(df) * 100 if len(df) else 0.0
print(f"Data retention: {retention_pct:.1f}%")

# --- 7. ANALYSIS ON COMPLETE DATASET ---
print("\n" + "=" * 60)
print("ANALYSIS ON COMPLETE DATASET (NO NaN VALUES)")
print("=" * 60)

# Analysis for average_playtime_forever
# The value is divided by 60 and reported as hours (i.e. treated as minutes).
playtime_hours = df_complete['average_playtime_forever'] / 60  # convert to hours
print(f"\n--- Average Playtime Forever Analysis (n={len(df_complete):,}) ---")
print(f"Mean: {playtime_hours.mean():.2f} hours")
print(f"Median: {playtime_hours.median():.2f} hours")
print(f"Standard Deviation: {playtime_hours.std():.2f} hours")
print(f"Min: {playtime_hours.min():.2f} hours")
print(f"Max: {playtime_hours.max():.2f} hours")
print(f"25th percentile: {playtime_hours.quantile(0.25):.2f} hours")
print(f"75th percentile: {playtime_hours.quantile(0.75):.2f} hours")
print(f"95th percentile: {playtime_hours.quantile(0.95):.2f} hours")
print(f"99th percentile: {playtime_hours.quantile(0.99):.2f} hours")

# Analysis for median_playtime_forever
median_playtime_hours = df_complete['median_playtime_forever'] / 60
print(f"\n--- Median Playtime Forever Analysis (n={len(df_complete):,}) ---")
print(f"Mean of medians: {median_playtime_hours.mean():.2f} hours")
print(f"Median of medians: {median_playtime_hours.median():.2f} hours")
print(f"Standard Deviation: {median_playtime_hours.std():.2f} hours")
print(f"Min: {median_playtime_hours.min():.2f} hours")
print(f"Max: {median_playtime_hours.max():.2f} hours")
print(f"25th percentile: {median_playtime_hours.quantile(0.25):.2f} hours")
print(f"75th percentile: {median_playtime_hours.quantile(0.75):.2f} hours")

# Price analysis
price_data = df_complete['price']
print(f"\n--- Price Analysis (n={len(df_complete):,}) ---")
print(f"Mean: ${price_data.mean():.2f}")
print(f"Median: ${price_data.median():.2f}")
print(f"Standard Deviation: ${price_data.std():.2f}")
print(f"Min: ${price_data.min():.2f}")
print(f"Max: ${price_data.max():.2f}")
print(f"25th percentile: ${price_data.quantile(0.25):.2f}")
print(f"75th percentile: ${price_data.quantile(0.75):.2f}")
print(f"95th percentile: ${price_data.quantile(0.95):.2f}")
print(f"99th percentile: ${price_data.quantile(0.99):.2f}")

# Free vs Paid games breakdown
free_games = (price_data == 0).sum()
paid_games = (price_data > 0).sum()
print(f"\nFree games: {free_games:,} ({free_games/len(df_complete)*100:.1f}%)")
print(f"Paid games: {paid_games:,} ({paid_games/len(df_complete)*100:.1f}%)")

# Price ranges for paid games only
paid_prices = price_data[price_data > 0]
if not paid_prices.empty:
    print(f"\n--- Paid Games Price Analysis (n={len(paid_prices):,}) ---")
    print(f"Mean: ${paid_prices.mean():.2f}")
    print(f"Median: ${paid_prices.median():.2f}")
    # Half-open bands [low, high): each boolean mask is built once and used
    # for both the count (.sum()) and the share of paid games (.mean()).
    price_bands = [
        ("Under $10", paid_prices < 10),
        ("$10-$30", (paid_prices >= 10) & (paid_prices < 30)),
        ("$30-$60", (paid_prices >= 30) & (paid_prices < 60)),
        ("Over $60", paid_prices >= 60),
    ]
    for band_label, in_band in price_bands:
        print(f"{band_label}: {in_band.sum():,} ({in_band.mean()*100:.1f}%)")

# Correlations on complete dataset
# Series.corr() is called with its default method; the raw (untransformed)
# columns are compared.
correlation_playtime_price = df_complete['average_playtime_forever'].corr(df_complete['price'])
correlation_playtime_reviews = df_complete['average_playtime_forever'].corr(df_complete['num_reviews_total'])
correlation_price_reviews = df_complete['price'].corr(df_complete['num_reviews_total'])

print("\n--- Correlations (Complete Dataset) ---")
print(f"Average playtime vs Price: {correlation_playtime_price:.3f}")
print(f"Average playtime vs Total reviews: {correlation_playtime_reviews:.3f}")
print(f"Price vs Total reviews: {correlation_price_reviews:.3f}")

# Additional insights
# For each threshold the boolean mask gives the count via .sum() and the
# fraction of complete rows via .mean().
print("\n--- Additional Insights (Complete Dataset) ---")
print(f"Games with >100 hours avg playtime: {(playtime_hours > 100).sum():,} ({(playtime_hours > 100).mean()*100:.1f}%)")
print(f"Games with >1000 reviews: {(df_complete['num_reviews_total'] > 1000).sum():,} ({(df_complete['num_reviews_total'] > 1000).mean()*100:.1f}%)")
print(f"Games with >90% positive reviews: {(df_complete['pct_pos_total'] > 90).sum():,} ({(df_complete['pct_pos_total'] > 90).mean()*100:.1f}%)")

# --- 8. Visualization Examples (Complete Dataset) ---
# One 2x2 figure. Every plotting call targets an explicit Axes (ax1..ax4)
# instead of pyplot's implicit current axes, and the figure is rendered once,
# after all panels have been filled.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Distribution of prices
df_complete['price'].plot(kind='hist', bins=50, logy=True,
                          title="Price Distribution (Complete Dataset)", ax=ax1)
ax1.set_xlabel("Price ($)")
ax1.set_ylabel("Count (log scale)")

# Distribution of average playtime (in hours, log scale)
# Non-positive values cannot be placed on a log-scaled x axis.
playtime_hours_plot = playtime_hours[playtime_hours > 0]  # Remove any remaining zeros
playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True,
                         title="Average Playtime Distribution (Complete Dataset)", ax=ax2)
ax2.set_xlabel("Average Playtime (hours, log scale)")
ax2.set_ylabel("Count (log scale)")

# Playtime vs Reviews
ax3.scatter(df_complete['average_playtime_forever'] / 60,
            df_complete['num_reviews_total'], alpha=0.3)
ax3.set_xscale("log")
ax3.set_yscale("log")
ax3.set_xlabel("Average Playtime (hours, log)")
ax3.set_ylabel("Total Reviews (log)")
# Title for the third panel (playtime vs reviews) of the 2x2 figure.
ax3.set_title("Playtime vs Reviews (Complete Dataset)")

# Price vs Reviews (for paid games)
# Free games (price == 0) are excluded because the x axis is log-scaled.
paid_mask = df_complete['price'] > 0
if paid_mask.any():
    # Select rows and column in a single .loc call instead of chained indexing.
    ax4.scatter(df_complete.loc[paid_mask, 'price'],
                df_complete.loc[paid_mask, 'num_reviews_total'], alpha=0.3)
    ax4.set_xscale("log")
    ax4.set_yscale("log")
    ax4.set_xlabel("Price ($, log)")
    ax4.set_ylabel("Total Reviews (log)")
    ax4.set_title("Price vs Reviews - Paid Games (Complete Dataset)")

# Render the whole figure once, after every panel has been drawn.
plt.tight_layout()
plt.show()

# --- 9. Language & Tag exploration (Complete Dataset) ---
# Top supported languages
# Each cell is iterated element by element, so it is assumed to already hold a
# parsed sequence of language names -- TODO confirm against the parsing step,
# which is not visible here (a raw string would be counted per character).
lang_counts = Counter(
    lang for langs in df_complete['supported_languages'] for lang in langs
)
print("\n--- Top supported languages (Complete Dataset) ---")
print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

# Top tags
# Counts how many games carry each tag: iterating a mapping yields its keys,
# and any values stored under them are ignored. Presumably each cell is a
# dict keyed by tag name -- verify against the parsing step.
tag_counts = Counter(tag for tags in df_complete['tags'] for tag in tags)
print("\n--- Top tags (Complete Dataset) ---")
print(pd.Series(tag_counts).sort_values(ascending=False).head(10))
exploratory_data_analysis.py +137 −25 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ import pandas as pd import numpy as np import ast import matplotlib.pyplot as plt from collections import Counter # Load dataset (example CSV) df = pd.read_csv("./data/games_03_2025_reduced.csv") Loading Loading @@ -39,17 +40,17 @@ def clean_price(row): return np.nan # treat as missing return price df['price_clean'] = df.apply(clean_price, axis=1) # Overwrite original price column df['price'] = df.apply(clean_price, axis=1) # --- 4. Clean playtime columns --- # --- 3. Clean playtime columns (overwrite originals) --- # 0 playtime might mean missing if no one played / no data playtime_cols = ['average_playtime_forever', 'median_playtime_forever', 'average_playtime_2weeks', 'median_playtime_2weeks'] for col in playtime_cols: df[col + "_clean"] = df[col].replace(0, np.nan) df[col] = df[col].replace(0, np.nan) # --- 5. Missingness exploration --- # --- 4. Missingness exploration --- print("\n--- Dataset size ---") print(df.shape) Loading @@ -61,39 +62,150 @@ num_cols = df.select_dtypes(include=[np.number]).columns zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False) print(zero_counts) # --- 6. Basic EDA --- # --- 5. Basic EDA --- print("\n--- Price overview ---") print(df['price_clean'].describe()) print(df['price'].describe()) print("\n--- Playtime overview (hours) ---") print((df['average_playtime_forever_clean'] / 60).describe()) # convert mins → hours print((df['average_playtime_forever'] / 60).describe()) # convert mins → hours print("\n--- Reviews overview ---") print(df[['num_reviews_total', 'pct_pos_total']].describe()) # --- 7. Visualization Examples --- # --- 6. 
# --- 6. REMOVE ROWS WITH ANY NaN VALUES ---
# Complete-case filtering: report how many rows contain at least one missing
# value, then keep only the fully populated rows in `df_complete`.
print("\n" + "=" * 60)
print("REMOVING ROWS WITH ANY NaN VALUES")
print("=" * 60)

print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns")
print(f"Rows with any NaN: {df.isnull().any(axis=1).sum():,}")

# dropna() with no arguments drops a row as soon as ANY column is NaN.
# NOTE(review): earlier in this script the playtime columns (including the
# *_2weeks ones) have 0 replaced by NaN, so this also discards every game with
# a zero in any of those columns -- confirm that is the intended population.
df_complete = df.dropna()

print(f"Complete dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns")
# Guard the ratio so an empty input frame reports 0.0% instead of raising
# ZeroDivisionError.
retention_pct = len(df_complete) / len(df) * 100 if len(df) else 0.0
print(f"Data retention: {retention_pct:.1f}%")

# --- 7. ANALYSIS ON COMPLETE DATASET ---
print("\n" + "=" * 60)
print("ANALYSIS ON COMPLETE DATASET (NO NaN VALUES)")
print("=" * 60)

# Analysis for average_playtime_forever
# The value is divided by 60 and reported as hours (i.e. treated as minutes).
playtime_hours = df_complete['average_playtime_forever'] / 60  # convert to hours
print(f"\n--- Average Playtime Forever Analysis (n={len(df_complete):,}) ---")
print(f"Mean: {playtime_hours.mean():.2f} hours")
print(f"Median: {playtime_hours.median():.2f} hours")
print(f"Standard Deviation: {playtime_hours.std():.2f} hours")
print(f"Min: {playtime_hours.min():.2f} hours")
print(f"Max: {playtime_hours.max():.2f} hours")
print(f"25th percentile: {playtime_hours.quantile(0.25):.2f} hours")
print(f"75th percentile: {playtime_hours.quantile(0.75):.2f} hours")
print(f"95th percentile: {playtime_hours.quantile(0.95):.2f} hours")
print(f"99th percentile: {playtime_hours.quantile(0.99):.2f} hours")

# Analysis for median_playtime_forever
median_playtime_hours = df_complete['median_playtime_forever'] / 60
print(f"\n--- Median Playtime Forever Analysis (n={len(df_complete):,}) ---")
print(f"Mean of medians: {median_playtime_hours.mean():.2f} hours")
print(f"Median of medians: {median_playtime_hours.median():.2f} hours")
print(f"Standard Deviation: {median_playtime_hours.std():.2f} hours")
print(f"Min: {median_playtime_hours.min():.2f} hours")
print(f"Max: {median_playtime_hours.max():.2f} hours")
print(f"25th percentile: {median_playtime_hours.quantile(0.25):.2f} hours")
print(f"75th percentile: {median_playtime_hours.quantile(0.75):.2f} hours")

# Price analysis
price_data = df_complete['price']
print(f"\n--- Price Analysis (n={len(df_complete):,}) ---")
print(f"Mean: ${price_data.mean():.2f}")
print(f"Median: ${price_data.median():.2f}")
print(f"Standard Deviation: ${price_data.std():.2f}")
print(f"Min: ${price_data.min():.2f}")
print(f"Max: ${price_data.max():.2f}")
print(f"25th percentile: ${price_data.quantile(0.25):.2f}")
print(f"75th percentile: ${price_data.quantile(0.75):.2f}")
print(f"95th percentile: ${price_data.quantile(0.95):.2f}")
print(f"99th percentile: ${price_data.quantile(0.99):.2f}")

# Free vs Paid games breakdown
free_games = (price_data == 0).sum()
paid_games = (price_data > 0).sum()
print(f"\nFree games: {free_games:,} ({free_games/len(df_complete)*100:.1f}%)")
print(f"Paid games: {paid_games:,} ({paid_games/len(df_complete)*100:.1f}%)")

# Price ranges for paid games only
paid_prices = price_data[price_data > 0]
if not paid_prices.empty:
    print(f"\n--- Paid Games Price Analysis (n={len(paid_prices):,}) ---")
    print(f"Mean: ${paid_prices.mean():.2f}")
    print(f"Median: ${paid_prices.median():.2f}")
    # Half-open bands [low, high): each boolean mask is built once and used
    # for both the count (.sum()) and the share of paid games (.mean()).
    price_bands = [
        ("Under $10", paid_prices < 10),
        ("$10-$30", (paid_prices >= 10) & (paid_prices < 30)),
        ("$30-$60", (paid_prices >= 30) & (paid_prices < 60)),
        ("Over $60", paid_prices >= 60),
    ]
    for band_label, in_band in price_bands:
        print(f"{band_label}: {in_band.sum():,} ({in_band.mean()*100:.1f}%)")

# Correlations on complete dataset
# Series.corr() is called with its default method; the raw (untransformed)
# columns are compared.
correlation_playtime_price = df_complete['average_playtime_forever'].corr(df_complete['price'])
correlation_playtime_reviews = df_complete['average_playtime_forever'].corr(df_complete['num_reviews_total'])
correlation_price_reviews = df_complete['price'].corr(df_complete['num_reviews_total'])

print("\n--- Correlations (Complete Dataset) ---")
print(f"Average playtime vs Price: {correlation_playtime_price:.3f}")
print(f"Average playtime vs Total reviews: {correlation_playtime_reviews:.3f}")
print(f"Price vs Total reviews: {correlation_price_reviews:.3f}")

# Additional insights
# For each threshold the boolean mask gives the count via .sum() and the
# fraction of complete rows via .mean().
print("\n--- Additional Insights (Complete Dataset) ---")
print(f"Games with >100 hours avg playtime: {(playtime_hours > 100).sum():,} ({(playtime_hours > 100).mean()*100:.1f}%)")
print(f"Games with >1000 reviews: {(df_complete['num_reviews_total'] > 1000).sum():,} ({(df_complete['num_reviews_total'] > 1000).mean()*100:.1f}%)")
print(f"Games with >90% positive reviews: {(df_complete['pct_pos_total'] > 90).sum():,} ({(df_complete['pct_pos_total'] > 90).mean()*100:.1f}%)")

# --- 8. Visualization Examples (Complete Dataset) ---
# One 2x2 figure. Every plotting call targets an explicit Axes (ax1..ax4)
# instead of pyplot's implicit current axes, and the figure is rendered once,
# after all panels have been filled.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Distribution of prices
df_complete['price'].plot(kind='hist', bins=50, logy=True,
                          title="Price Distribution (Complete Dataset)", ax=ax1)
ax1.set_xlabel("Price ($)")
ax1.set_ylabel("Count (log scale)")

# Distribution of average playtime (in hours, log scale)
# Non-positive values cannot be placed on a log-scaled x axis.
playtime_hours_plot = playtime_hours[playtime_hours > 0]  # Remove any remaining zeros
playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True,
                         title="Average Playtime Distribution (Complete Dataset)", ax=ax2)
ax2.set_xlabel("Average Playtime (hours, log scale)")
ax2.set_ylabel("Count (log scale)")

# Playtime vs Reviews
ax3.scatter(df_complete['average_playtime_forever'] / 60,
            df_complete['num_reviews_total'], alpha=0.3)
ax3.set_xscale("log")
ax3.set_yscale("log")
ax3.set_xlabel("Average Playtime (hours, log)")
ax3.set_ylabel("Total Reviews (log)")
# Title for the third panel (playtime vs reviews) of the 2x2 figure.
ax3.set_title("Playtime vs Reviews (Complete Dataset)")

# Price vs Reviews (for paid games)
# Free games (price == 0) are excluded because the x axis is log-scaled.
paid_mask = df_complete['price'] > 0
if paid_mask.any():
    # Select rows and column in a single .loc call instead of chained indexing.
    ax4.scatter(df_complete.loc[paid_mask, 'price'],
                df_complete.loc[paid_mask, 'num_reviews_total'], alpha=0.3)
    ax4.set_xscale("log")
    ax4.set_yscale("log")
    ax4.set_xlabel("Price ($, log)")
    ax4.set_ylabel("Total Reviews (log)")
    ax4.set_title("Price vs Reviews - Paid Games (Complete Dataset)")

# Render the whole figure once, after every panel has been drawn.
plt.tight_layout()
plt.show()

# --- 9. Language & Tag exploration (Complete Dataset) ---
# Top supported languages
# Each cell is iterated element by element, so it is assumed to already hold a
# parsed sequence of language names -- TODO confirm against the parsing step,
# which is not visible here (a raw string would be counted per character).
lang_counts = Counter(
    lang for langs in df_complete['supported_languages'] for lang in langs
)
print("\n--- Top supported languages (Complete Dataset) ---")
print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

# Top tags
# Counts how many games carry each tag: iterating a mapping yields its keys,
# and any values stored under them are ignored. Presumably each cell is a
# dict keyed by tag name -- verify against the parsing step.
tag_counts = Counter(tag for tags in df_complete['tags'] for tag in tags)
print("\n--- Top tags (Complete Dataset) ---")
print(pd.Series(tag_counts).sort_values(ascending=False).head(10))