# exploratory_data_analysis.py -- excerpt recovered from a diff view (+24 -3).
# Only the post-change side of each hunk is kept. Statement nesting was lost in
# the capture, so every statement below is written at module level.

# --- Hunk @@ -227,7 +227,7 @@ (diff context line: `if games_with_languages > 0:`) ---
# NOTE(review): the two statements below may belong inside that `if` branch;
# the capture does not show their indentation -- confirm against the full file.
avg_languages_per_game = df['supported_languages'].apply(
    lambda x: len(x) if isinstance(x, list) else 0
).mean()
print(f"\nAverage supported languages per game: {avg_languages_per_game:.2f}")

# --- 9. GENRES ANALYSIS (from original script) ---
print("\n--- GENRES ANALYSIS ---")
original_nulls = df['genres'].isnull().sum()
print(f"Original NULL values in genres: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)")

# (File lines 234-253 lie between the two hunks and are not shown in the diff.)

# --- Hunk @@ -254,13 +254,34 @@ (diff context line: `if games_with_genres > 0:`) ---
# NOTE(review): as above, these two statements may sit inside that `if` branch.
avg_genres_per_game = df['genres_parsed'].apply(
    lambda x: len(x) if isinstance(x, list) else 0
).mean()
print(f"\nAverage genres per game: {avg_genres_per_game:.2f}")

# --- 10. Pseudo-missing values analysis (zeros in numeric columns) ---
# Runs before the zero -> NaN cleaning below, so the counts reflect raw zeros.
print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---")
num_cols = df.select_dtypes(include=[np.number]).columns
zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False)
print(zero_counts)

# NOTE(review): section numbers are out of sequence with execution order
# (9, 10, 7, 8, 11). Steps 7 and 8 execute after the zero-count report above --
# confirm that this is the intended order, then renumber accordingly.

# --- 7. Handle price=0 (BEFORE comprehensive summary) ---
def _has_free_to_play_tag(tags):
    """Return True when *tags* is a dict with a "free to play" key.

    The key comparison is case-insensitive. Anything that is not a dict
    (NaN, None, list, str, ...) yields False, and non-string keys are ignored
    instead of raising on ``.lower()``.
    """
    return isinstance(tags, dict) and any(
        isinstance(tag, str) and tag.lower() == "free to play" for tag in tags
    )


def clean_price(row):
    """Return the cleaned price for one record.

    Args:
        row: Mapping-like record exposing ``'price'`` and ``'tags'``.

    Returns:
        ``np.nan`` when the price is 0 and the record is not tagged
        "free to play" (the zero is treated as a missing value); otherwise the
        original price, so a genuine free-to-play title keeps its 0.
    """
    price = row['price']
    if price == 0 and not _has_free_to_play_tag(row['tags']):
        return np.nan  # treat as missing
    return price


# Overwrite original price column.
# Vectorized equivalent of ``df.apply(clean_price, axis=1)``: it only touches
# the two columns involved and also works when ``df`` has no rows.
is_free_to_play = df['tags'].map(_has_free_to_play_tag).astype(bool)
df['price'] = df['price'].mask((df['price'] == 0) & ~is_free_to_play)

# --- 8. Clean playtime columns (BEFORE comprehensive summary) ---
# 0 playtime might mean missing if no one played / no data
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
for col in playtime_cols:
    df[col] = df[col].replace(0, np.nan)

# --- 11. COMPREHENSIVE MISSING VALUES SUMMARY ---
print("\n" + "="*60)
print("COMPREHENSIVE MISSING VALUES SUMMARY")
print("="*60)
# exploratory_data_analysis.py -- excerpt recovered from a diff view (+24 -3).
# Only the post-change side of each hunk is kept. Statement nesting was lost in
# the capture, so every statement below is written at module level.

# --- Hunk @@ -227,7 +227,7 @@ (diff context line: `if games_with_languages > 0:`) ---
# NOTE(review): the two statements below may belong inside that `if` branch;
# the capture does not show their indentation -- confirm against the full file.
avg_languages_per_game = df['supported_languages'].apply(
    lambda x: len(x) if isinstance(x, list) else 0
).mean()
print(f"\nAverage supported languages per game: {avg_languages_per_game:.2f}")

# --- 9. GENRES ANALYSIS (from original script) ---
print("\n--- GENRES ANALYSIS ---")
original_nulls = df['genres'].isnull().sum()
print(f"Original NULL values in genres: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)")

# (File lines 234-253 lie between the two hunks and are not shown in the diff.)

# --- Hunk @@ -254,13 +254,34 @@ (diff context line: `if games_with_genres > 0:`) ---
# NOTE(review): as above, these two statements may sit inside that `if` branch.
avg_genres_per_game = df['genres_parsed'].apply(
    lambda x: len(x) if isinstance(x, list) else 0
).mean()
print(f"\nAverage genres per game: {avg_genres_per_game:.2f}")

# --- 10. Pseudo-missing values analysis (zeros in numeric columns) ---
# Runs before the zero -> NaN cleaning below, so the counts reflect raw zeros.
print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---")
num_cols = df.select_dtypes(include=[np.number]).columns
zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False)
print(zero_counts)

# NOTE(review): section numbers are out of sequence with execution order
# (9, 10, 7, 8, 11). Steps 7 and 8 execute after the zero-count report above --
# confirm that this is the intended order, then renumber accordingly.

# --- 7. Handle price=0 (BEFORE comprehensive summary) ---
def _has_free_to_play_tag(tags):
    """Return True when *tags* is a dict with a "free to play" key.

    The key comparison is case-insensitive. Anything that is not a dict
    (NaN, None, list, str, ...) yields False, and non-string keys are ignored
    instead of raising on ``.lower()``.
    """
    return isinstance(tags, dict) and any(
        isinstance(tag, str) and tag.lower() == "free to play" for tag in tags
    )


def clean_price(row):
    """Return the cleaned price for one record.

    Args:
        row: Mapping-like record exposing ``'price'`` and ``'tags'``.

    Returns:
        ``np.nan`` when the price is 0 and the record is not tagged
        "free to play" (the zero is treated as a missing value); otherwise the
        original price, so a genuine free-to-play title keeps its 0.
    """
    price = row['price']
    if price == 0 and not _has_free_to_play_tag(row['tags']):
        return np.nan  # treat as missing
    return price


# Overwrite original price column.
# Vectorized equivalent of ``df.apply(clean_price, axis=1)``: it only touches
# the two columns involved and also works when ``df`` has no rows.
is_free_to_play = df['tags'].map(_has_free_to_play_tag).astype(bool)
df['price'] = df['price'].mask((df['price'] == 0) & ~is_free_to_play)

# --- 8. Clean playtime columns (BEFORE comprehensive summary) ---
# 0 playtime might mean missing if no one played / no data
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
for col in playtime_cols:
    df[col] = df[col].replace(0, np.nan)

# --- 11. COMPREHENSIVE MISSING VALUES SUMMARY ---
print("\n" + "="*60)
print("COMPREHENSIVE MISSING VALUES SUMMARY")
print("="*60)