Loading exploratory_data_analysis.py +34 −17 Original line number Diff line number Diff line Loading @@ -72,22 +72,39 @@ print((df['average_playtime_forever'] / 60).describe()) # convert mins → hour print("\n--- Reviews overview ---") print(df[['num_reviews_total', 'pct_pos_total']].describe()) # --- 6. REMOVE ROWS WITH ANY NaN VALUES --- # --- 6. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS --- print("\n" + "="*60) print("REMOVING ROWS WITH ANY NaN VALUES") print("REMOVING ROWS WITH NaN IN KEY COLUMNS") print("="*60) print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns") print(f"Rows with any NaN: {df.isnull().any(axis=1).sum():,}") # Remove all rows that have at least one NaN value df_complete = df.dropna() print(f"Complete dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns") # Key columns we need for analysis key_columns = ['price', 'average_playtime_forever', 'median_playtime_forever'] # Check missing values in key columns print(f"\nMissing values in key columns:") for col in key_columns: missing_count = df[col].isnull().sum() missing_pct = (missing_count / len(df)) * 100 print(f" {col}: {missing_count:,} ({missing_pct:.1f}%)") # Remove rows that have NaN in any of the key columns df_complete = df.dropna(subset=key_columns) print(f"\nFiltered dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns") print(f"Data retention: {len(df_complete)/len(df)*100:.1f}%") # --- 7. ANALYSIS ON COMPLETE DATASET --- # Show how many rows still have NaN in 2-week columns (this is fine) two_week_cols = ['average_playtime_2weeks', 'median_playtime_2weeks'] for col in two_week_cols: if col in df_complete.columns: missing_2week = df_complete[col].isnull().sum() missing_2week_pct = (missing_2week / len(df_complete)) * 100 print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%) - OK to keep") # --- 7. ANALYSIS ON FILTERED DATASET --- print("\n" + "="*60) print("ANALYSIS ON COMPLETE DATASET (NO NaN VALUES)") print("ANALYSIS ON FILTERED DATASET (KEY COLUMNS COMPLETE)") print("="*60) # Analysis for average_playtime_forever Loading Loading @@ -163,18 +180,18 @@ print(f"Games with >100 hours avg playtime: {(playtime_hours > 100).sum():,} ({( print(f"Games with >1000 reviews: {(df_complete['num_reviews_total'] > 1000).sum():,} ({(df_complete['num_reviews_total'] > 1000).mean()*100:.1f}%)") print(f"Games with >90% positive reviews: {(df_complete['pct_pos_total'] > 90).sum():,} ({(df_complete['pct_pos_total'] > 90).mean()*100:.1f}%)") # --- 8. Visualization Examples (Complete Dataset) --- # --- 8. Visualization Examples (Filtered Dataset) --- fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) # Distribution of prices df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Complete Dataset)", ax=ax1) df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Filtered Dataset)", ax=ax1) ax1.set_xlabel("Price ($)") ax1.set_ylabel("Count (log scale)") # Distribution of average playtime (in hours, log scale) playtime_hours_plot = playtime_hours[playtime_hours > 0] # Remove any remaining zeros playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True, title="Average Playtime Distribution (Complete Dataset)", ax=ax2) title="Average Playtime Distribution (Filtered Dataset)", ax=ax2) ax2.set_xlabel("Average Playtime (hours, log scale)") ax2.set_ylabel("Count (log scale)") Loading @@ -184,7 +201,7 @@ ax3.set_xscale("log") ax3.set_yscale("log") ax3.set_xlabel("Average Playtime (hours, log)") ax3.set_ylabel("Total Reviews (log)") ax3.set_title("Playtime vs Reviews (Complete Dataset)") ax3.set_title("Playtime vs Reviews (Filtered Dataset)") # Price vs Reviews (for paid games) paid_mask = df_complete['price'] > 0 Loading @@ -194,18 +211,18 @@ if paid_mask.sum() > 0: ax4.set_yscale("log") ax4.set_xlabel("Price ($, log)") ax4.set_ylabel("Total Reviews (log)") ax4.set_title("Price vs Reviews - Paid Games (Complete Dataset)") ax4.set_title("Price vs Reviews - Paid Games (Filtered Dataset)") plt.tight_layout() plt.show() # --- 9. Language & Tag exploration (Complete Dataset) --- # --- 9. Language & Tag exploration (Filtered Dataset) --- # Top supported languages lang_counts = Counter([lang for langs in df_complete['supported_languages'] for lang in langs]) print("\n--- Top supported languages (Complete Dataset) ---") print("\n--- Top supported languages (Filtered Dataset) ---") print(pd.Series(lang_counts).sort_values(ascending=False).head(10)) # Top tags tag_counts = Counter([tag for tags in df_complete['tags'] for tag in tags.keys()]) print("\n--- Top tags (Complete Dataset) ---") print("\n--- Top tags (Filtered Dataset) ---") print(pd.Series(tag_counts).sort_values(ascending=False).head(10)) Loading
exploratory_data_analysis.py +34 −17 Original line number Diff line number Diff line Loading @@ -72,22 +72,39 @@ print((df['average_playtime_forever'] / 60).describe()) # convert mins → hour print("\n--- Reviews overview ---") print(df[['num_reviews_total', 'pct_pos_total']].describe()) # --- 6. REMOVE ROWS WITH ANY NaN VALUES --- # --- 6. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS --- print("\n" + "="*60) print("REMOVING ROWS WITH ANY NaN VALUES") print("REMOVING ROWS WITH NaN IN KEY COLUMNS") print("="*60) print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns") print(f"Rows with any NaN: {df.isnull().any(axis=1).sum():,}") # Remove all rows that have at least one NaN value df_complete = df.dropna() print(f"Complete dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns") # Key columns we need for analysis key_columns = ['price', 'average_playtime_forever', 'median_playtime_forever'] # Check missing values in key columns print(f"\nMissing values in key columns:") for col in key_columns: missing_count = df[col].isnull().sum() missing_pct = (missing_count / len(df)) * 100 print(f" {col}: {missing_count:,} ({missing_pct:.1f}%)") # Remove rows that have NaN in any of the key columns df_complete = df.dropna(subset=key_columns) print(f"\nFiltered dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns") print(f"Data retention: {len(df_complete)/len(df)*100:.1f}%") # --- 7. ANALYSIS ON COMPLETE DATASET --- # Show how many rows still have NaN in 2-week columns (this is fine) two_week_cols = ['average_playtime_2weeks', 'median_playtime_2weeks'] for col in two_week_cols: if col in df_complete.columns: missing_2week = df_complete[col].isnull().sum() missing_2week_pct = (missing_2week / len(df_complete)) * 100 print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%) - OK to keep") # --- 7. ANALYSIS ON FILTERED DATASET --- print("\n" + "="*60) print("ANALYSIS ON COMPLETE DATASET (NO NaN VALUES)") print("ANALYSIS ON FILTERED DATASET (KEY COLUMNS COMPLETE)") print("="*60) # Analysis for average_playtime_forever Loading Loading @@ -163,18 +180,18 @@ print(f"Games with >100 hours avg playtime: {(playtime_hours > 100).sum():,} ({( print(f"Games with >1000 reviews: {(df_complete['num_reviews_total'] > 1000).sum():,} ({(df_complete['num_reviews_total'] > 1000).mean()*100:.1f}%)") print(f"Games with >90% positive reviews: {(df_complete['pct_pos_total'] > 90).sum():,} ({(df_complete['pct_pos_total'] > 90).mean()*100:.1f}%)") # --- 8. Visualization Examples (Complete Dataset) --- # --- 8. Visualization Examples (Filtered Dataset) --- fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) # Distribution of prices df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Complete Dataset)", ax=ax1) df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Filtered Dataset)", ax=ax1) ax1.set_xlabel("Price ($)") ax1.set_ylabel("Count (log scale)") # Distribution of average playtime (in hours, log scale) playtime_hours_plot = playtime_hours[playtime_hours > 0] # Remove any remaining zeros playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True, title="Average Playtime Distribution (Complete Dataset)", ax=ax2) title="Average Playtime Distribution (Filtered Dataset)", ax=ax2) ax2.set_xlabel("Average Playtime (hours, log scale)") ax2.set_ylabel("Count (log scale)") Loading @@ -184,7 +201,7 @@ ax3.set_xscale("log") ax3.set_yscale("log") ax3.set_xlabel("Average Playtime (hours, log)") ax3.set_ylabel("Total Reviews (log)") ax3.set_title("Playtime vs Reviews (Complete Dataset)") ax3.set_title("Playtime vs Reviews (Filtered Dataset)") # Price vs Reviews (for paid games) paid_mask = df_complete['price'] > 0 Loading @@ -194,18 +211,18 @@ if paid_mask.sum() > 0: ax4.set_yscale("log") ax4.set_xlabel("Price ($, log)") ax4.set_ylabel("Total Reviews (log)") ax4.set_title("Price vs Reviews - Paid Games (Complete Dataset)") ax4.set_title("Price vs Reviews - Paid Games (Filtered Dataset)") plt.tight_layout() plt.show() # --- 9. Language & Tag exploration (Complete Dataset) --- # --- 9. Language & Tag exploration (Filtered Dataset) --- # Top supported languages lang_counts = Counter([lang for langs in df_complete['supported_languages'] for lang in langs]) print("\n--- Top supported languages (Complete Dataset) ---") print("\n--- Top supported languages (Filtered Dataset) ---") print(pd.Series(lang_counts).sort_values(ascending=False).head(10)) # Top tags tag_counts = Counter([tag for tags in df_complete['tags'] for tag in tags.keys()]) print("\n--- Top tags (Complete Dataset) ---") print("\n--- Top tags (Filtered Dataset) ---") print(pd.Series(tag_counts).sort_values(ascending=False).head(10))