Loading exploratory_data_analysis.py +152 −36 Original line number Diff line number Diff line Loading @@ -254,23 +254,30 @@ if games_with_genres > 0: avg_genres_per_game = df['genres_parsed'].apply(lambda x: len(x) if isinstance(x, list) else 0).mean() print(f"\nAverage genres per game: {avg_genres_per_game:.2f}") # --- 8. COMPREHENSIVE MISSING VALUES SUMMARY --- # --- 8. Pseudo-missing values analysis (zeros in numeric columns) --- print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---") num_cols = df.select_dtypes(include=[np.number]).columns zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False) print(zero_counts) # --- 9. COMPREHENSIVE MISSING VALUES SUMMARY --- print("\n" + "="*60) print("COMPREHENSIVE MISSING VALUES SUMMARY") print("="*60) missing_summary = {} # Basic missing values (NaN only) # Basic missing values (NaN only) - ALL columns basic_missing = df.isnull().sum() print(f"\n--- Standard Missing Values (NaN) ---") for col, missing_count in basic_missing.sort_values(ascending=False).head(15).items(): print(f"\n--- Standard Missing Values (NaN) - All Columns ---") for col in df.columns: missing_count = basic_missing[col] missing_pct = (missing_count / len(df)) * 100 print(f"{col:25}: {missing_count:8,} ({missing_pct:5.1f}%)") print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)") missing_summary[col] = {'nan': missing_count, 'nan_pct': missing_pct} # Parsed/processed missing values print(f"\n--- Processed Missing Values ---") print(f"\n--- Processed Missing Values (Empty/Invalid after parsing) ---") parsed_missing = { 'name (empty/whitespace)': total_missing_names, 'estimated_owners (parsed)': df['estimated_owners_parsed'].isnull().sum(), Loading @@ -282,9 +289,9 @@ parsed_missing = { for col, missing_count in parsed_missing.items(): missing_pct = (missing_count / len(df)) * 100 print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)") print(f"{col:35}: {missing_count:8,} ({missing_pct:5.1f}%)") # --- 9. Handle price=0 (from original script) --- # --- 10. Handle price=0 (from original script) --- def clean_price(row): price = row['price'] tags = row['tags'] Loading @@ -298,13 +305,13 @@ def clean_price(row): # Overwrite original price column df['price'] = df.apply(clean_price, axis=1) # --- 10. Clean playtime columns (from original script) --- # --- 11. Clean playtime columns (from original script) --- playtime_cols = ['average_playtime_forever', 'median_playtime_forever', 'average_playtime_2weeks', 'median_playtime_2weeks'] for col in playtime_cols: df[col] = df[col].replace(0, np.nan) # --- 11. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS --- # --- 12. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS --- print("\n" + "="*60) print("FILTERING DATASET FOR COMPLETE KEY COLUMNS") print("="*60) Loading @@ -323,15 +330,106 @@ df_complete = df.dropna(subset=key_columns) print(f"\nFiltered dataset size: {df_complete.shape[0]:,} rows") print(f"Data retention: {len(df_complete)/len(df)*100:.1f}%") # --- 12. ANALYSIS ON FILTERED DATASET --- # --- MISSING VALUES SUMMARY AFTER KEY COLUMNS FILTERING --- print("\n" + "="*40) print("MISSING VALUES SUMMARY AFTER KEY COLUMNS FILTERING") print("="*40) print(f"Dataset size after key columns filtering: {df_complete.shape[0]:,} rows") # Standard missing values in filtered dataset print(f"\n--- Standard Missing Values (NaN) in Filtered Dataset ---") basic_missing_filtered = df_complete.isnull().sum() for col in df_complete.columns: missing_count = basic_missing_filtered[col] missing_pct = (missing_count / len(df_complete)) * 100 if missing_count > 0: # Only show columns with missing values print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)") # Parsed missing values in filtered dataset print(f"\n--- Processed Missing Values in Filtered Dataset ---") parsed_missing_filtered = { 'name (empty/whitespace)': (df_complete['name'].isnull() | (df_complete['name'] == '') | df_complete['name'].apply(lambda x: isinstance(x, str) and x.strip() == '')).sum(), 'estimated_owners (parsed)': df_complete['estimated_owners_parsed'].isnull().sum(), 'tags (empty dict)': df_complete['tags'].apply(lambda x: len(x) == 0 if isinstance(x, dict) else True).sum(), 'genres_parsed (empty list)': df_complete['genres_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum(), 'categories_parsed (empty list)': df_complete['categories_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum(), 'supported_languages (empty list)': df_complete['supported_languages'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum(), } for col, missing_count in parsed_missing_filtered.items(): missing_pct = (missing_count / len(df_complete)) * 100 print(f"{col:35}: {missing_count:8,} ({missing_pct:5.1f}%)") # --- 13. ADDITIONAL FILTERING: Remove games with empty genres, categories, and supported languages --- print("\n" + "="*60) print("ANALYSIS ON FILTERED DATASET") print("ADDITIONAL FILTERING: REMOVE EMPTY GENRES, CATEGORIES, LANGUAGES") print("="*60) print(f"Starting with: {df_complete.shape[0]:,} rows") # Create masks for additional filtering criteria empty_genres_mask = df_complete['genres_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True) empty_categories_mask = df_complete['categories_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True) empty_languages_mask = df_complete['supported_languages'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True) print(f"\nGames to be removed:") print(f" Empty genres: {empty_genres_mask.sum():,} ({empty_genres_mask.mean()*100:.1f}%)") print(f" Empty categories: {empty_categories_mask.sum():,} ({empty_categories_mask.mean()*100:.1f}%)") print(f" Empty supported languages: {empty_languages_mask.sum():,} ({empty_languages_mask.mean()*100:.1f}%)") # Combine all criteria - remove rows where ANY of these are empty combined_mask = empty_genres_mask | empty_categories_mask | empty_languages_mask games_to_remove = combined_mask.sum() print(f" Total unique games to remove: {games_to_remove:,} ({games_to_remove/len(df_complete)*100:.1f}%)") # Apply additional filtering df_fully_complete = df_complete[~combined_mask] print(f"\nFully filtered dataset size: {df_fully_complete.shape[0]:,} rows") print(f"Data retention from original: {len(df_fully_complete)/len(df)*100:.1f}%") print(f"Data retention from key-columns-filtered: {len(df_fully_complete)/len(df_complete)*100:.1f}%") # --- FINAL MISSING VALUES SUMMARY --- print("\n" + "="*40) print("FINAL MISSING VALUES SUMMARY (FULLY FILTERED DATASET)") print("="*40) print(f"Final dataset size: {df_fully_complete.shape[0]:,} rows") # Standard missing values in fully filtered dataset print(f"\n--- Standard Missing Values (NaN) in Fully Filtered Dataset ---") basic_missing_final = df_fully_complete.isnull().sum() any_missing = False for col in df_fully_complete.columns: missing_count = basic_missing_final[col] missing_pct = (missing_count / len(df_fully_complete)) * 100 if missing_count > 0: # Only show columns with missing values print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)") any_missing = True if not any_missing: print("No standard missing values (NaN) in fully filtered dataset!") # Check that our filtering worked print(f"\n--- Verification of Filtering ---") print(f"Games with empty genres: {df_fully_complete['genres_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum()}") print(f"Games with empty categories: {df_fully_complete['categories_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum()}") print(f"Games with empty supported languages: {df_fully_complete['supported_languages'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum()}") print(f"Games with NaN price: {df_fully_complete['price'].isnull().sum()}") print(f"Games with NaN average_playtime_forever: {df_fully_complete['average_playtime_forever'].isnull().sum()}") print(f"Games with NaN median_playtime_forever: {df_fully_complete['median_playtime_forever'].isnull().sum()}") # --- 14. ANALYSIS ON FULLY FILTERED DATASET --- print("\n" + "="*60) print("ANALYSIS ON FULLY FILTERED DATASET") print("="*60) # Analysis for average_playtime_forever playtime_hours = df_complete['average_playtime_forever'] / 60 playtime_hours = df_fully_complete['average_playtime_forever'] / 60 print(f"\n--- Average Playtime Forever Analysis (n={len(df_complete):,}) ---") print(f"\n--- Average Playtime Forever Analysis (n={len(df_fully_complete):,}) ---") print(f"Mean: {playtime_hours.mean():.2f} hours") print(f"Median: {playtime_hours.median():.2f} hours") print(f"Standard Deviation: {playtime_hours.std():.2f} hours") Loading @@ -339,53 +437,71 @@ print(f"95th percentile: {playtime_hours.quantile(0.95):.2f} hours") print(f"99th percentile: {playtime_hours.quantile(0.99):.2f} hours") # Price analysis price_data = df_complete['price'] print(f"\n--- Price Analysis (n={len(df_complete):,}) ---") price_data = df_fully_complete['price'] print(f"\n--- Price Analysis (n={len(df_fully_complete):,}) ---") print(f"Mean: ${price_data.mean():.2f}") print(f"Median: ${price_data.median():.2f}") free_games = (price_data == 0).sum() paid_games = (price_data > 0).sum() print(f"\nFree games: {free_games:,} ({free_games/len(df_complete)*100:.1f}%)") print(f"Paid games: {paid_games:,} ({paid_games/len(df_complete)*100:.1f}%)") # --- 13. Visualization of Missing Values --- print(f"\nFree games: {free_games:,} ({free_games/len(df_fully_complete)*100:.1f}%)") print(f"Paid games: {paid_games:,} ({paid_games/len(df_fully_complete)*100:.1f}%)") # Additional stats on fully filtered dataset print(f"\n--- Additional Statistics (Fully Filtered Dataset) ---") print(f"Average genres per game: {df_fully_complete['genres_parsed'].apply(len).mean():.2f}") print(f"Average categories per game: {df_fully_complete['categories_parsed'].apply(len).mean():.2f}") print(f"Average supported languages per game: {df_fully_complete['supported_languages'].apply(len).mean():.2f}") print(f"Average tags per game: {df_fully_complete['tags'].apply(len).mean():.2f}") print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---") num_cols = df_fully_complete.select_dtypes(include=[np.number]).columns zero_counts = (df_fully_complete[num_cols] == 0).sum().sort_values(ascending=False) print(zero_counts) # --- 15. Visualization of Missing Values --- fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) # Missing values heatmap-style plot missing_data = df.isnull().sum().sort_values(ascending=False).head(20) missing_data.plot(kind='bar', ax=ax1, title="Missing Values by Column (Top 20)") missing_data.plot(kind='bar', ax=ax1, title="Missing Values by Column (Top 20) - Original Dataset") ax1.set_xlabel("Columns") ax1.set_ylabel("Count of Missing Values") ax1.tick_params(axis='x', rotation=45) # Distribution of prices (filtered dataset) df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Filtered Dataset)", ax=ax2) # Distribution of prices (fully filtered dataset) df_fully_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Fully Filtered Dataset)", ax=ax2) ax2.set_xlabel("Price ($)") ax2.set_ylabel("Count (log scale)") # Distribution of playtime playtime_hours_plot = playtime_hours[playtime_hours > 0] playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True, title="Average Playtime Distribution", ax=ax3) title="Average Playtime Distribution (Fully Filtered)", ax=ax3) ax3.set_xlabel("Average Playtime (hours, log scale)") ax3.set_ylabel("Count (log scale)") # Completeness by key parsed columns completeness_data = { 'Names': (len(df) - total_missing_names) / len(df) * 100, 'Estimated Owners': df['estimated_owners_parsed'].notnull().sum() / len(df) * 100, 'Tags': games_with_tags / len(df) * 100, 'Genres': games_with_genres / len(df) * 100, 'Categories': games_with_categories / len(df) * 100, 'Languages': games_with_languages / len(df) * 100, # Data retention visualization retention_data = { 'Original': len(df), 'Key Columns\nFiltered': len(df_complete), 'Fully Filtered\n(+genres/categories/languages)': len(df_fully_complete) } ax4.bar(completeness_data.keys(), completeness_data.values()) ax4.set_title("Data Completeness by Column (%)") ax4.set_ylabel("Completeness (%)") bars = ax4.bar(retention_data.keys(), retention_data.values(), color=['lightcoral', 'lightskyblue', 'lightgreen']) ax4.set_title("Dataset Size After Filtering Steps") ax4.set_ylabel("Number of Games") ax4.tick_params(axis='x', rotation=45) ax4.set_ylim(0, 100) # Add percentage labels on bars for i, (key, value) in enumerate(retention_data.items()): if i == 0: pct = 100.0 else: pct = (value / len(df)) * 100 ax4.text(i, value + len(df)*0.01, f'{value:,}\n({pct:.1f}%)', ha='center', va='bottom') plt.tight_layout() plt.show() Loading Loading
exploratory_data_analysis.py +152 −36 Original line number Diff line number Diff line Loading @@ -254,23 +254,30 @@ if games_with_genres > 0: avg_genres_per_game = df['genres_parsed'].apply(lambda x: len(x) if isinstance(x, list) else 0).mean() print(f"\nAverage genres per game: {avg_genres_per_game:.2f}") # --- 8. COMPREHENSIVE MISSING VALUES SUMMARY --- # --- 8. Pseudo-missing values analysis (zeros in numeric columns) --- print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---") num_cols = df.select_dtypes(include=[np.number]).columns zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False) print(zero_counts) # --- 9. COMPREHENSIVE MISSING VALUES SUMMARY --- print("\n" + "="*60) print("COMPREHENSIVE MISSING VALUES SUMMARY") print("="*60) missing_summary = {} # Basic missing values (NaN only) # Basic missing values (NaN only) - ALL columns basic_missing = df.isnull().sum() print(f"\n--- Standard Missing Values (NaN) ---") for col, missing_count in basic_missing.sort_values(ascending=False).head(15).items(): print(f"\n--- Standard Missing Values (NaN) - All Columns ---") for col in df.columns: missing_count = basic_missing[col] missing_pct = (missing_count / len(df)) * 100 print(f"{col:25}: {missing_count:8,} ({missing_pct:5.1f}%)") print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)") missing_summary[col] = {'nan': missing_count, 'nan_pct': missing_pct} # Parsed/processed missing values print(f"\n--- Processed Missing Values ---") print(f"\n--- Processed Missing Values (Empty/Invalid after parsing) ---") parsed_missing = { 'name (empty/whitespace)': total_missing_names, 'estimated_owners (parsed)': df['estimated_owners_parsed'].isnull().sum(), Loading @@ -282,9 +289,9 @@ parsed_missing = { for col, missing_count in parsed_missing.items(): missing_pct = (missing_count / len(df)) * 100 print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)") print(f"{col:35}: {missing_count:8,} ({missing_pct:5.1f}%)") # --- 9. Handle price=0 (from original script) --- # --- 10. Handle price=0 (from original script) --- def clean_price(row): price = row['price'] tags = row['tags'] Loading @@ -298,13 +305,13 @@ def clean_price(row): # Overwrite original price column df['price'] = df.apply(clean_price, axis=1) # --- 10. Clean playtime columns (from original script) --- # --- 11. Clean playtime columns (from original script) --- playtime_cols = ['average_playtime_forever', 'median_playtime_forever', 'average_playtime_2weeks', 'median_playtime_2weeks'] for col in playtime_cols: df[col] = df[col].replace(0, np.nan) # --- 11. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS --- # --- 12. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS --- print("\n" + "="*60) print("FILTERING DATASET FOR COMPLETE KEY COLUMNS") print("="*60) Loading @@ -323,15 +330,106 @@ df_complete = df.dropna(subset=key_columns) print(f"\nFiltered dataset size: {df_complete.shape[0]:,} rows") print(f"Data retention: {len(df_complete)/len(df)*100:.1f}%") # --- 12. ANALYSIS ON FILTERED DATASET --- # --- MISSING VALUES SUMMARY AFTER KEY COLUMNS FILTERING --- print("\n" + "="*40) print("MISSING VALUES SUMMARY AFTER KEY COLUMNS FILTERING") print("="*40) print(f"Dataset size after key columns filtering: {df_complete.shape[0]:,} rows") # Standard missing values in filtered dataset print(f"\n--- Standard Missing Values (NaN) in Filtered Dataset ---") basic_missing_filtered = df_complete.isnull().sum() for col in df_complete.columns: missing_count = basic_missing_filtered[col] missing_pct = (missing_count / len(df_complete)) * 100 if missing_count > 0: # Only show columns with missing values print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)") # Parsed missing values in filtered dataset print(f"\n--- Processed Missing Values in Filtered Dataset ---") parsed_missing_filtered = { 'name (empty/whitespace)': (df_complete['name'].isnull() | (df_complete['name'] == '') | df_complete['name'].apply(lambda x: isinstance(x, str) and x.strip() == '')).sum(), 'estimated_owners (parsed)': df_complete['estimated_owners_parsed'].isnull().sum(), 'tags (empty dict)': df_complete['tags'].apply(lambda x: len(x) == 0 if isinstance(x, dict) else True).sum(), 'genres_parsed (empty list)': df_complete['genres_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum(), 'categories_parsed (empty list)': df_complete['categories_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum(), 'supported_languages (empty list)': df_complete['supported_languages'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum(), } for col, missing_count in parsed_missing_filtered.items(): missing_pct = (missing_count / len(df_complete)) * 100 print(f"{col:35}: {missing_count:8,} ({missing_pct:5.1f}%)") # --- 13. ADDITIONAL FILTERING: Remove games with empty genres, categories, and supported languages --- print("\n" + "="*60) print("ANALYSIS ON FILTERED DATASET") print("ADDITIONAL FILTERING: REMOVE EMPTY GENRES, CATEGORIES, LANGUAGES") print("="*60) print(f"Starting with: {df_complete.shape[0]:,} rows") # Create masks for additional filtering criteria empty_genres_mask = df_complete['genres_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True) empty_categories_mask = df_complete['categories_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True) empty_languages_mask = df_complete['supported_languages'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True) print(f"\nGames to be removed:") print(f" Empty genres: {empty_genres_mask.sum():,} ({empty_genres_mask.mean()*100:.1f}%)") print(f" Empty categories: {empty_categories_mask.sum():,} ({empty_categories_mask.mean()*100:.1f}%)") print(f" Empty supported languages: {empty_languages_mask.sum():,} ({empty_languages_mask.mean()*100:.1f}%)") # Combine all criteria - remove rows where ANY of these are empty combined_mask = empty_genres_mask | empty_categories_mask | empty_languages_mask games_to_remove = combined_mask.sum() print(f" Total unique games to remove: {games_to_remove:,} ({games_to_remove/len(df_complete)*100:.1f}%)") # Apply additional filtering df_fully_complete = df_complete[~combined_mask] print(f"\nFully filtered dataset size: {df_fully_complete.shape[0]:,} rows") print(f"Data retention from original: {len(df_fully_complete)/len(df)*100:.1f}%") print(f"Data retention from key-columns-filtered: {len(df_fully_complete)/len(df_complete)*100:.1f}%") # --- FINAL MISSING VALUES SUMMARY --- print("\n" + "="*40) print("FINAL MISSING VALUES SUMMARY (FULLY FILTERED DATASET)") print("="*40) print(f"Final dataset size: {df_fully_complete.shape[0]:,} rows") # Standard missing values in fully filtered dataset print(f"\n--- Standard Missing Values (NaN) in Fully Filtered Dataset ---") basic_missing_final = df_fully_complete.isnull().sum() any_missing = False for col in df_fully_complete.columns: missing_count = basic_missing_final[col] missing_pct = (missing_count / len(df_fully_complete)) * 100 if missing_count > 0: # Only show columns with missing values print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)") any_missing = True if not any_missing: print("No standard missing values (NaN) in fully filtered dataset!") # Check that our filtering worked print(f"\n--- Verification of Filtering ---") print(f"Games with empty genres: {df_fully_complete['genres_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum()}") print(f"Games with empty categories: {df_fully_complete['categories_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum()}") print(f"Games with empty supported languages: {df_fully_complete['supported_languages'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum()}") print(f"Games with NaN price: {df_fully_complete['price'].isnull().sum()}") print(f"Games with NaN average_playtime_forever: {df_fully_complete['average_playtime_forever'].isnull().sum()}") print(f"Games with NaN median_playtime_forever: {df_fully_complete['median_playtime_forever'].isnull().sum()}") # --- 14. ANALYSIS ON FULLY FILTERED DATASET --- print("\n" + "="*60) print("ANALYSIS ON FULLY FILTERED DATASET") print("="*60) # Analysis for average_playtime_forever playtime_hours = df_complete['average_playtime_forever'] / 60 playtime_hours = df_fully_complete['average_playtime_forever'] / 60 print(f"\n--- Average Playtime Forever Analysis (n={len(df_complete):,}) ---") print(f"\n--- Average Playtime Forever Analysis (n={len(df_fully_complete):,}) ---") print(f"Mean: {playtime_hours.mean():.2f} hours") print(f"Median: {playtime_hours.median():.2f} hours") print(f"Standard Deviation: {playtime_hours.std():.2f} hours") Loading @@ -339,53 +437,71 @@ print(f"95th percentile: {playtime_hours.quantile(0.95):.2f} hours") print(f"99th percentile: {playtime_hours.quantile(0.99):.2f} hours") # Price analysis price_data = df_complete['price'] print(f"\n--- Price Analysis (n={len(df_complete):,}) ---") price_data = df_fully_complete['price'] print(f"\n--- Price Analysis (n={len(df_fully_complete):,}) ---") print(f"Mean: ${price_data.mean():.2f}") print(f"Median: ${price_data.median():.2f}") free_games = (price_data == 0).sum() paid_games = (price_data > 0).sum() print(f"\nFree games: {free_games:,} ({free_games/len(df_complete)*100:.1f}%)") print(f"Paid games: {paid_games:,} ({paid_games/len(df_complete)*100:.1f}%)") # --- 13. Visualization of Missing Values --- print(f"\nFree games: {free_games:,} ({free_games/len(df_fully_complete)*100:.1f}%)") print(f"Paid games: {paid_games:,} ({paid_games/len(df_fully_complete)*100:.1f}%)") # Additional stats on fully filtered dataset print(f"\n--- Additional Statistics (Fully Filtered Dataset) ---") print(f"Average genres per game: {df_fully_complete['genres_parsed'].apply(len).mean():.2f}") print(f"Average categories per game: {df_fully_complete['categories_parsed'].apply(len).mean():.2f}") print(f"Average supported languages per game: {df_fully_complete['supported_languages'].apply(len).mean():.2f}") print(f"Average tags per game: {df_fully_complete['tags'].apply(len).mean():.2f}") print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---") num_cols = df_fully_complete.select_dtypes(include=[np.number]).columns zero_counts = (df_fully_complete[num_cols] == 0).sum().sort_values(ascending=False) print(zero_counts) # --- 15. Visualization of Missing Values --- fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) # Missing values heatmap-style plot missing_data = df.isnull().sum().sort_values(ascending=False).head(20) missing_data.plot(kind='bar', ax=ax1, title="Missing Values by Column (Top 20)") missing_data.plot(kind='bar', ax=ax1, title="Missing Values by Column (Top 20) - Original Dataset") ax1.set_xlabel("Columns") ax1.set_ylabel("Count of Missing Values") ax1.tick_params(axis='x', rotation=45) # Distribution of prices (filtered dataset) df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Filtered Dataset)", ax=ax2) # Distribution of prices (fully filtered dataset) df_fully_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Fully Filtered Dataset)", ax=ax2) ax2.set_xlabel("Price ($)") ax2.set_ylabel("Count (log scale)") # Distribution of playtime playtime_hours_plot = playtime_hours[playtime_hours > 0] playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True, title="Average Playtime Distribution", ax=ax3) title="Average Playtime Distribution (Fully Filtered)", ax=ax3) ax3.set_xlabel("Average Playtime (hours, log scale)") ax3.set_ylabel("Count (log scale)") # Completeness by key parsed columns completeness_data = { 'Names': (len(df) - total_missing_names) / len(df) * 100, 'Estimated Owners': df['estimated_owners_parsed'].notnull().sum() / len(df) * 100, 'Tags': games_with_tags / len(df) * 100, 'Genres': games_with_genres / len(df) * 100, 'Categories': games_with_categories / len(df) * 100, 'Languages': games_with_languages / len(df) * 100, # Data retention visualization retention_data = { 'Original': len(df), 'Key Columns\nFiltered': len(df_complete), 'Fully Filtered\n(+genres/categories/languages)': len(df_fully_complete) } ax4.bar(completeness_data.keys(), completeness_data.values()) ax4.set_title("Data Completeness by Column (%)") ax4.set_ylabel("Completeness (%)") bars = ax4.bar(retention_data.keys(), retention_data.values(), color=['lightcoral', 'lightskyblue', 'lightgreen']) ax4.set_title("Dataset Size After Filtering Steps") ax4.set_ylabel("Number of Games") ax4.tick_params(axis='x', rotation=45) ax4.set_ylim(0, 100) # Add percentage labels on bars for i, (key, value) in enumerate(retention_data.items()): if i == 0: pct = 100.0 else: pct = (value / len(df)) * 100 ax4.text(i, value + len(df)*0.01, f'{value:,}\n({pct:.1f}%)', ha='center', va='bottom') plt.tight_layout() plt.show() Loading