# --- Parsing helpers for array/range columns ---

def parse_categories(x):
    """Parse categories using the same logic as genres."""
    return parse_genres(x)


def parse_supported_languages(x):
    """Parse supported languages from various formats.

    Accepts a Python-literal list string (e.g. "['English', 'French']"),
    a plain comma-separated string, an actual list, or NaN.

    Returns:
        list[str]: language names; empty list when nothing parseable.
    """
    if pd.isna(x):
        return []
    if isinstance(x, str):
        if x.startswith('[') and x.endswith(']'):
            try:
                return ast.literal_eval(x)
            # FIX: was a bare `except:` that swallowed everything (including
            # KeyboardInterrupt). ast.literal_eval on a str raises
            # ValueError or SyntaxError for malformed literals.
            except (ValueError, SyntaxError):
                # Not a valid literal: treat it as a comma-separated
                # string inside brackets and strip stray quotes.
                return [lang.strip().strip('"').strip("'")
                        for lang in x.strip('[]').split(',') if lang.strip()]
        elif ',' in x:
            return [lang.strip() for lang in x.split(',') if lang.strip()]
        else:
            return [x.strip()] if x.strip() else []
    elif isinstance(x, list):
        return x
    else:
        return []


def parse_estimated_owners(x):
    """Parse estimated owners - clean and validate the range format.

    Returns:
        str | None: the cleaned range string (e.g. "50000000 - 100000000"),
        or None for missing/empty/'unknown' values.
    """
    if pd.isna(x):
        return None
    if isinstance(x, str):
        x = x.strip()
        if x == '' or x.lower() == 'unknown':
            return None
        # Expected format: "50000000 - 100000000" or similar
        if ' - ' in x:
            return x
        # Normalize "a-b" to "a - b"
        elif '-' in x:
            return x.replace('-', ' - ')
        else:
            return x if x else None
    # Non-string, non-NaN scalar: stringify truthy values.
    return str(x) if x else None
# --- 3. Analyze TAGS ---
print(f"\n--- TAGS ANALYSIS ---")

# Raw NaN entries: anything that never became a parsed dict.
null_mask = df['tags'].apply(lambda v: False if isinstance(v, dict) else pd.isna(v))
original_nulls = null_mask.sum()
print(f"Original NULL values: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)")

# Dicts that parsed but carry no tags (non-dicts count as empty too).
empty_mask = df['tags'].apply(lambda v: len(v) == 0 if isinstance(v, dict) else True)
empty_dicts = empty_mask.sum()
print(f"Empty tag dictionaries after parsing: {empty_dicts:,} ({empty_dicts/len(df)*100:.1f}%)")

tagged_mask = df['tags'].apply(lambda v: isinstance(v, dict) and len(v) > 0)
games_with_tags = tagged_mask.sum()
print(f"Games with at least one tag: {games_with_tags:,} ({games_with_tags/len(df)*100:.1f}%)")

if games_with_tags > 0:
    # Flatten every tag name across all games into one list.
    all_tags = [name for entry in df['tags'] if isinstance(entry, dict)
                for name in entry.keys()]
    if all_tags:
        tag_counts = Counter(all_tags)
        print(f"\nTop 10 most common tags:")
        for tag, count in tag_counts.most_common(10):
            print(f"  '{tag}': {count:,}")
    avg_tags_per_game = df['tags'].apply(
        lambda v: len(v) if isinstance(v, dict) else 0).mean()
    print(f"\nAverage tags per game: {avg_tags_per_game:.2f}")
# --- 4. Analyze NAME ---
# Counts the different flavors of "missing" game names: NaN, empty string,
# and whitespace-only, then shows a small sample of problematic rows.
print(f"\n--- NAME ANALYSIS ---")

original_nulls = df['name'].isnull().sum()
print(f"Original NULL values: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)")

empty_strings = (df['name'] == '').sum()
print(f"Empty strings: {empty_strings:,} ({empty_strings/len(df)*100:.1f}%)")

# FIX: exclude exact '' here — those rows are already counted in
# empty_strings; counting them again double-counted them in
# total_missing_names below.
whitespace_only = df['name'].apply(
    lambda x: isinstance(x, str) and x != '' and x.strip() == '').sum()
print(f"Whitespace-only strings: {whitespace_only:,} ({whitespace_only/len(df)*100:.1f}%)")

# The three categories are now disjoint, so the sum is a true total.
total_missing_names = original_nulls + empty_strings + whitespace_only
print(f"Total missing/empty names: {total_missing_names:,} ({total_missing_names/len(df)*100:.1f}%)")

valid_names = len(df) - total_missing_names
print(f"Valid names: {valid_names:,} ({valid_names/len(df)*100:.1f}%)")

# Show some examples of problematic names if any
if total_missing_names > 0:
    print(f"\nSample of games with missing/empty names:")
    problem_mask = (df['name'].isnull()
                    | (df['name'] == '')
                    | df['name'].apply(lambda x: isinstance(x, str) and x.strip() == ''))
    problem_sample = df[problem_mask][['appid', 'name']].head(5)
    for idx, row in problem_sample.iterrows():
        print(f"  AppID {row['appid']}: name='{row['name']}'")
# --- 5. Analyze CATEGORIES ---
print(f"\n--- CATEGORIES ANALYSIS ---")

original_nulls = df['categories'].isnull().sum()
print(f"Original NULL values in categories: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)")

# Parsed lists that came out empty (non-lists count as empty too).
empty_arrays = df['categories_parsed'].apply(
    lambda v: len(v) == 0 if isinstance(v, list) else True).sum()
print(f"Empty category arrays after parsing: {empty_arrays:,} ({empty_arrays/len(df)*100:.1f}%)")

games_with_categories = df['categories_parsed'].apply(
    lambda v: isinstance(v, list) and len(v) > 0).sum()
print(f"Games with at least one category: {games_with_categories:,} ({games_with_categories/len(df)*100:.1f}%)")

if games_with_categories > 0:
    # Flatten every category across all games into one list.
    all_categories = [cat for entry in df['categories_parsed']
                      if isinstance(entry, list) for cat in entry]
    if all_categories:
        category_counts = Counter(all_categories)
        print(f"\nTop 10 most common categories:")
        for category, count in category_counts.most_common(10):
            print(f"  '{category}': {count:,}")
    avg_categories_per_game = df['categories_parsed'].apply(
        lambda v: len(v) if isinstance(v, list) else 0).mean()
    print(f"\nAverage categories per game: {avg_categories_per_game:.2f}")
# --- 6. Analyze SUPPORTED_LANGUAGES ---
print(f"\n--- SUPPORTED_LANGUAGES ANALYSIS ---")

# Raw NaN entries: anything that never became a parsed list.
original_nulls = df['supported_languages'].apply(
    lambda v: False if isinstance(v, list) else pd.isna(v)).sum()
print(f"Original NULL values: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)")

empty_arrays = df['supported_languages'].apply(
    lambda v: len(v) == 0 if isinstance(v, list) else True).sum()
print(f"Empty language arrays after parsing: {empty_arrays:,} ({empty_arrays/len(df)*100:.1f}%)")

games_with_languages = df['supported_languages'].apply(
    lambda v: isinstance(v, list) and len(v) > 0).sum()
print(f"Games with at least one supported language: {games_with_languages:,} ({games_with_languages/len(df)*100:.1f}%)")

if games_with_languages > 0:
    # Flatten every language across all games into one list.
    all_languages = [lang for entry in df['supported_languages']
                     if isinstance(entry, list) for lang in entry]
    if all_languages:
        language_counts = Counter(all_languages)
        print(f"\nTop 10 most supported languages:")
        for language, count in language_counts.most_common(10):
            print(f"  '{language}': {count:,}")
    avg_languages_per_game = df['supported_languages'].apply(
        lambda v: len(v) if isinstance(v, list) else 0).mean()
    print(f"\nAverage supported languages per game: {avg_languages_per_game:.2f}")
GENRES ANALYSIS (from original script) --- print(f"\n--- GENRES ANALYSIS ---") original_nulls = df['genres'].isnull().sum() print(f"Original NULL values in genres: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)") Loading @@ -64,45 +238,53 @@ if 'genres' in df.columns: games_with_genres = df['genres_parsed'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False).sum() print(f"Games with at least one genre: {games_with_genres:,} ({games_with_genres/len(df)*100:.1f}%)") # Show some examples of empty/problematic entries empty_mask = df['genres_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True) if empty_mask.sum() > 0: print(f"\nSample of games with empty genres:") empty_sample = df[empty_mask][['name', 'genres', 'genres_parsed']].head(3) for idx, row in empty_sample.iterrows(): print(f" '{row['name']}': original='{row['genres']}', parsed={row['genres_parsed']}") # Count all genres immediately all_genres_debug = [] if games_with_genres > 0: # Count all genres all_genres = [] for genres_list in df['genres_parsed']: if isinstance(genres_list, list): all_genres_debug.extend(genres_list) all_genres.extend(genres_list) if all_genres_debug: genre_counts_debug = Counter(all_genres_debug) print(f"\nGenre counts (full dataset, before filtering):") for genre, count in genre_counts_debug.most_common(15): print(f" {genre}: {count:,}") if all_genres: genre_counts = Counter(all_genres) print(f"\nTop 10 most common genres:") for genre, count in genre_counts.most_common(10): print(f" '{genre}': {count:,}") # Show genre distribution stats avg_genres_per_game = df['genres_parsed'].apply(lambda x: len(x) if isinstance(x, list) else 0).mean() print(f"\nAverage genres per game: {avg_genres_per_game:.2f}") # Show games with most genres max_genres = df['genres_parsed'].apply(lambda x: len(x) if isinstance(x, list) else 0).max() print(f"Maximum genres on a single game: {max_genres}") # --- 8. 
COMPREHENSIVE MISSING VALUES SUMMARY --- print("\n" + "="*60) print("COMPREHENSIVE MISSING VALUES SUMMARY") print("="*60) if max_genres > 0: multi_genre_games = df[df['genres_parsed'].apply(lambda x: len(x) if isinstance(x, list) else 0) == max_genres] print(f"Game(s) with {max_genres} genres:") for idx, row in multi_genre_games[['name', 'genres_parsed']].head(2).iterrows(): print(f" '{row['name']}': {row['genres_parsed']}") else: print("No genres found after parsing!") else: print("No 'genres' column found in dataset!") missing_summary = {} # --- 2. Handle price=0 --- # Basic missing values (NaN only) basic_missing = df.isnull().sum() print(f"\n--- Standard Missing Values (NaN) ---") for col, missing_count in basic_missing.sort_values(ascending=False).head(15).items(): missing_pct = (missing_count / len(df)) * 100 print(f"{col:25}: {missing_count:8,} ({missing_pct:5.1f}%)") missing_summary[col] = {'nan': missing_count, 'nan_pct': missing_pct} # Parsed/processed missing values print(f"\n--- Processed Missing Values ---") parsed_missing = { 'name (empty/whitespace)': total_missing_names, 'estimated_owners (parsed)': df['estimated_owners_parsed'].isnull().sum(), 'tags (empty dict)': df['tags'].apply(lambda x: len(x) == 0 if isinstance(x, dict) else True).sum(), 'genres_parsed (empty list)': df['genres_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum(), 'categories_parsed (empty list)': df['categories_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum(), 'supported_languages (empty list)': df['supported_languages'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum(), } for col, missing_count in parsed_missing.items(): missing_pct = (missing_count / len(df)) * 100 print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)") # --- 9. 
Handle price=0 (from original script) --- def clean_price(row): price = row['price'] tags = row['tags'] Loading @@ -116,47 +298,21 @@ def clean_price(row): # Overwrite original price column df['price'] = df.apply(clean_price, axis=1) # --- 3. Clean playtime columns (overwrite originals) --- # 0 playtime might mean missing if no one played / no data # --- 10. Clean playtime columns (from original script) --- playtime_cols = ['average_playtime_forever', 'median_playtime_forever', 'average_playtime_2weeks', 'median_playtime_2weeks'] for col in playtime_cols: df[col] = df[col].replace(0, np.nan) # --- 4. Missingness exploration --- print("\n--- Dataset size ---") print(df.shape) print("\n--- Missing values (NaN only) ---") print(df.isna().sum().sort_values(ascending=False)) print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---") num_cols = df.select_dtypes(include=[np.number]).columns zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False) print(zero_counts) # --- 5. Basic EDA --- print("\n--- Price overview ---") print(df['price'].describe()) print("\n--- Playtime overview (hours) ---") print((df['average_playtime_forever'] / 60).describe()) # convert mins → hours print("\n--- Reviews overview ---") print(df[['num_reviews_total', 'pct_pos_total']].describe()) # --- 6. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS --- # --- 11. 
REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS --- print("\n" + "="*60) print("REMOVING ROWS WITH NaN IN KEY COLUMNS") print("FILTERING DATASET FOR COMPLETE KEY COLUMNS") print("="*60) print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns") # Key columns we need for analysis key_columns = ['price', 'average_playtime_forever', 'median_playtime_forever'] # Check missing values in key columns print(f"\nMissing values in key columns:") print(f"\nMissing values in key columns before filtering:") for col in key_columns: missing_count = df[col].isnull().sum() missing_pct = (missing_count / len(df)) * 100 Loading @@ -164,141 +320,75 @@ for col in key_columns: # Remove rows that have NaN in any of the key columns df_complete = df.dropna(subset=key_columns) print(f"\nFiltered dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns") print(f"\nFiltered dataset size: {df_complete.shape[0]:,} rows") print(f"Data retention: {len(df_complete)/len(df)*100:.1f}%") # Show how many rows still have NaN in 2-week columns (this is fine) two_week_cols = ['average_playtime_2weeks', 'median_playtime_2weeks'] for col in two_week_cols: if col in df_complete.columns: missing_2week = df_complete[col].isnull().sum() missing_2week_pct = (missing_2week / len(df_complete)) * 100 print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%)") # --- 7. ANALYSIS ON FILTERED DATASET --- # --- 12. 
ANALYSIS ON FILTERED DATASET --- print("\n" + "="*60) print("ANALYSIS ON FILTERED DATASET (KEY COLUMNS COMPLETE)") print("ANALYSIS ON FILTERED DATASET") print("="*60) # Analysis for average_playtime_forever playtime_hours = df_complete['average_playtime_forever'] / 60 # convert to hours playtime_hours = df_complete['average_playtime_forever'] / 60 print(f"\n--- Average Playtime Forever Analysis (n={len(df_complete):,}) ---") print(f"Mean: {playtime_hours.mean():.2f} hours") print(f"Median: {playtime_hours.median():.2f} hours") print(f"Standard Deviation: {playtime_hours.std():.2f} hours") print(f"Min: {playtime_hours.min():.2f} hours") print(f"Max: {playtime_hours.max():.2f} hours") print(f"25th percentile: {playtime_hours.quantile(0.25):.2f} hours") print(f"75th percentile: {playtime_hours.quantile(0.75):.2f} hours") print(f"95th percentile: {playtime_hours.quantile(0.95):.2f} hours") print(f"99th percentile: {playtime_hours.quantile(0.99):.2f} hours") # Analysis for median_playtime_forever median_playtime_hours = df_complete['median_playtime_forever'] / 60 print(f"\n--- Median Playtime Forever Analysis (n={len(df_complete):,}) ---") print(f"Mean of medians: {median_playtime_hours.mean():.2f} hours") print(f"Median of medians: {median_playtime_hours.median():.2f} hours") print(f"Standard Deviation: {median_playtime_hours.std():.2f} hours") print(f"Min: {median_playtime_hours.min():.2f} hours") print(f"Max: {median_playtime_hours.max():.2f} hours") print(f"25th percentile: {median_playtime_hours.quantile(0.25):.2f} hours") print(f"75th percentile: {median_playtime_hours.quantile(0.75):.2f} hours") # Price analysis price_data = df_complete['price'] print(f"\n--- Price Analysis (n={len(df_complete):,}) ---") print(f"Mean: ${price_data.mean():.2f}") print(f"Median: ${price_data.median():.2f}") print(f"Standard Deviation: ${price_data.std():.2f}") print(f"Min: ${price_data.min():.2f}") print(f"Max: ${price_data.max():.2f}") print(f"25th percentile: 
${price_data.quantile(0.25):.2f}") print(f"75th percentile: ${price_data.quantile(0.75):.2f}") print(f"95th percentile: ${price_data.quantile(0.95):.2f}") print(f"99th percentile: ${price_data.quantile(0.99):.2f}") # Free vs Paid games breakdown free_games = (price_data == 0).sum() paid_games = (price_data > 0).sum() print(f"\nFree games: {free_games:,} ({free_games/len(df_complete)*100:.1f}%)") print(f"Paid games: {paid_games:,} ({paid_games/len(df_complete)*100:.1f}%)") # Price ranges for paid games only paid_prices = price_data[price_data > 0] if len(paid_prices) > 0: print(f"\n--- Paid Games Price Analysis (n={len(paid_prices):,}) ---") print(f"Mean: ${paid_prices.mean():.2f}") print(f"Median: ${paid_prices.median():.2f}") print(f"Under $10: {(paid_prices < 10).sum():,} ({(paid_prices < 10).mean()*100:.1f}%)") print(f"$10-$30: {((paid_prices >= 10) & (paid_prices < 30)).sum():,} ({((paid_prices >= 10) & (paid_prices < 30)).mean()*100:.1f}%)") print(f"$30-$60: {((paid_prices >= 30) & (paid_prices < 60)).sum():,} ({((paid_prices >= 30) & (paid_prices < 60)).mean()*100:.1f}%)") print(f"Over $60: {(paid_prices >= 60).sum():,} ({(paid_prices >= 60).mean()*100:.1f}%)") # Correlations on complete dataset correlation_playtime_price = df_complete['average_playtime_forever'].corr(df_complete['price']) correlation_playtime_reviews = df_complete['average_playtime_forever'].corr(df_complete['num_reviews_total']) correlation_price_reviews = df_complete['price'].corr(df_complete['num_reviews_total']) print(f"\n--- Correlations (Complete Dataset) ---") print(f"Average playtime vs Price: {correlation_playtime_price:.3f}") print(f"Average playtime vs Total reviews: {correlation_playtime_reviews:.3f}") print(f"Price vs Total reviews: {correlation_price_reviews:.3f}") # Additional insights print(f"\n--- Additional Insights (Complete Dataset) ---") print(f"Games with >100 hours avg playtime: {(playtime_hours > 100).sum():,} ({(playtime_hours > 100).mean()*100:.1f}%)") print(f"Games 
with >1000 reviews: {(df_complete['num_reviews_total'] > 1000).sum():,} ({(df_complete['num_reviews_total'] > 1000).mean()*100:.1f}%)") print(f"Games with >90% positive reviews: {(df_complete['pct_pos_total'] > 90).sum():,} ({(df_complete['pct_pos_total'] > 90).mean()*100:.1f}%)") # --- 8. Visualization Examples (Filtered Dataset) --- # --- 13. Visualization of Missing Values --- fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) # Distribution of prices df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Filtered Dataset)", ax=ax1) ax1.set_xlabel("Price ($)") ax1.set_ylabel("Count (log scale)") # Missing values heatmap-style plot missing_data = df.isnull().sum().sort_values(ascending=False).head(20) missing_data.plot(kind='bar', ax=ax1, title="Missing Values by Column (Top 20)") ax1.set_xlabel("Columns") ax1.set_ylabel("Count of Missing Values") ax1.tick_params(axis='x', rotation=45) # Distribution of average playtime (in hours, log scale) playtime_hours_plot = playtime_hours[playtime_hours > 0] # Remove any remaining zeros playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True, title="Average Playtime Distribution (Filtered Dataset)", ax=ax2) ax2.set_xlabel("Average Playtime (hours, log scale)") # Distribution of prices (filtered dataset) df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Filtered Dataset)", ax=ax2) ax2.set_xlabel("Price ($)") ax2.set_ylabel("Count (log scale)") # Playtime vs Reviews ax3.scatter(df_complete['average_playtime_forever']/60, df_complete['num_reviews_total'], alpha=0.3) ax3.set_xscale("log") ax3.set_yscale("log") ax3.set_xlabel("Average Playtime (hours, log)") ax3.set_ylabel("Total Reviews (log)") ax3.set_title("Playtime vs Reviews (Filtered Dataset)") # Price vs Reviews (for paid games) paid_mask = df_complete['price'] > 0 if paid_mask.sum() > 0: ax4.scatter(df_complete[paid_mask]['price'], df_complete[paid_mask]['num_reviews_total'], 
alpha=0.3) ax4.set_xscale("log") ax4.set_yscale("log") ax4.set_xlabel("Price ($, log)") ax4.set_ylabel("Total Reviews (log)") ax4.set_title("Price vs Reviews - Paid Games (Filtered Dataset)") # Distribution of playtime playtime_hours_plot = playtime_hours[playtime_hours > 0] playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True, title="Average Playtime Distribution", ax=ax3) ax3.set_xlabel("Average Playtime (hours, log scale)") ax3.set_ylabel("Count (log scale)") # Completeness by key parsed columns completeness_data = { 'Names': (len(df) - total_missing_names) / len(df) * 100, 'Estimated Owners': df['estimated_owners_parsed'].notnull().sum() / len(df) * 100, 'Tags': games_with_tags / len(df) * 100, 'Genres': games_with_genres / len(df) * 100, 'Categories': games_with_categories / len(df) * 100, 'Languages': games_with_languages / len(df) * 100, } ax4.bar(completeness_data.keys(), completeness_data.values()) ax4.set_title("Data Completeness by Column (%)") ax4.set_ylabel("Completeness (%)") ax4.tick_params(axis='x', rotation=45) ax4.set_ylim(0, 100) plt.tight_layout() plt.show() # --- 9. Language & Tag exploration (Filtered Dataset) --- # Top supported languages lang_counts = Counter([lang for langs in df_complete['supported_languages'] for lang in langs]) print("\n--- Top supported languages (Filtered Dataset) ---") print(pd.Series(lang_counts).sort_values(ascending=False).head(10)) # Top tags tag_counts = Counter([tag for tags in df_complete['tags'] for tag in tags.keys()]) print("\n--- Top tags (Filtered Dataset) ---") print(pd.Series(tag_counts).sort_values(ascending=False).head(10)) print("which columns we have:") print(df.columns) print("\nAnalysis complete! Dataset columns available:") print(list(df.columns)) Loading
exploratory_data_analysis.py +275 −185 Original line number Diff line number Diff line Loading @@ -7,12 +7,12 @@ from collections import Counter # Load dataset (example CSV) df = pd.read_csv("./data/games_03_2025_reduced.csv") # --- 1. Parse JSON/array columns --- # supported_languages might be a string like "['English', 'French']" if df['supported_languages'].dtype == object: df['supported_languages'] = df['supported_languages'].apply( lambda x: ast.literal_eval(x) if pd.notna(x) and x.startswith('[') else [] ) print("="*80) print("COMPREHENSIVE MISSING VALUES ANALYSIS") print("="*80) # --- 1. Parse JSON/array columns and analyze missing values --- print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns") def parse_tags(x): """ Loading Loading @@ -47,14 +47,188 @@ def parse_genres(x): else: return [] df['tags'] = df['tags'].apply(parse_tags) def parse_categories(x): """Parse categories using the same logic as genres""" return parse_genres(x) # Parse genres early in the pipeline if 'genres' in df.columns: def parse_supported_languages(x): """Parse supported languages from various formats""" if pd.isna(x): return [] if isinstance(x, str): if x.startswith('[') and x.endswith(']'): try: return ast.literal_eval(x) except: # If it's just a comma-separated string return [lang.strip().strip('"').strip("'") for lang in x.strip('[]').split(',') if lang.strip()] elif ',' in x: return [lang.strip() for lang in x.split(',') if lang.strip()] else: return [x.strip()] if x.strip() else [] elif isinstance(x, list): return x else: return [] def parse_estimated_owners(x): """Parse estimated owners - clean and validate the range format""" if pd.isna(x): return None if isinstance(x, str): x = x.strip() if x == '' or x.lower() == 'unknown': return None # Expected format: "50000000 - 100000000" or similar if ' - ' in x: return x # Handle other potential formats elif '-' in x: return x.replace('-', ' - ') else: return x if x else None return str(x) if x else None # 
Parse all columns df['supported_languages'] = df['supported_languages'].apply(parse_supported_languages) df['tags'] = df['tags'].apply(parse_tags) df['genres_parsed'] = df['genres'].apply(parse_genres) df['categories_parsed'] = df['categories'].apply(parse_categories) df['estimated_owners_parsed'] = df['estimated_owners'].apply(parse_estimated_owners) # Check for missing/empty genres print(f"\n--- GENRE MISSING VALUES ANALYSIS ---") print("\n" + "="*60) print("DETAILED MISSING VALUES ANALYSIS BY COLUMN") print("="*60) # --- 2. Analyze ESTIMATED_OWNERS --- print(f"\n--- ESTIMATED_OWNERS ANALYSIS ---") original_nulls = df['estimated_owners'].isnull().sum() print(f"Original NULL values: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)") empty_strings = (df['estimated_owners'] == '').sum() print(f"Empty strings: {empty_strings:,} ({empty_strings/len(df)*100:.1f}%)") parsed_nulls = df['estimated_owners_parsed'].isnull().sum() print(f"NULL after parsing: {parsed_nulls:,} ({parsed_nulls/len(df)*100:.1f}%)") valid_owners = df['estimated_owners_parsed'].notnull().sum() print(f"Valid estimated_owners: {valid_owners:,} ({valid_owners/len(df)*100:.1f}%)") # Show unique patterns in estimated_owners if valid_owners > 0: print(f"\nUnique estimated_owners patterns (top 10):") owners_counts = df['estimated_owners_parsed'].value_counts().head(10) for pattern, count in owners_counts.items(): print(f" '{pattern}': {count:,}") # --- 3. 
Analyze TAGS --- print(f"\n--- TAGS ANALYSIS ---") original_nulls = df['tags'].apply(lambda x: pd.isna(x) if not isinstance(x, dict) else False).sum() print(f"Original NULL values: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)") empty_dicts = df['tags'].apply(lambda x: len(x) == 0 if isinstance(x, dict) else True).sum() print(f"Empty tag dictionaries after parsing: {empty_dicts:,} ({empty_dicts/len(df)*100:.1f}%)") games_with_tags = df['tags'].apply(lambda x: len(x) > 0 if isinstance(x, dict) else False).sum() print(f"Games with at least one tag: {games_with_tags:,} ({games_with_tags/len(df)*100:.1f}%)") if games_with_tags > 0: # Count all tags all_tags = [] for tag_dict in df['tags']: if isinstance(tag_dict, dict): all_tags.extend(tag_dict.keys()) if all_tags: tag_counts = Counter(all_tags) print(f"\nTop 10 most common tags:") for tag, count in tag_counts.most_common(10): print(f" '{tag}': {count:,}") avg_tags_per_game = df['tags'].apply(lambda x: len(x) if isinstance(x, dict) else 0).mean() print(f"\nAverage tags per game: {avg_tags_per_game:.2f}") # --- 4. 
Analyze NAME --- print(f"\n--- NAME ANALYSIS ---") original_nulls = df['name'].isnull().sum() print(f"Original NULL values: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)") empty_strings = (df['name'] == '').sum() print(f"Empty strings: {empty_strings:,} ({empty_strings/len(df)*100:.1f}%)") whitespace_only = df['name'].apply(lambda x: isinstance(x, str) and x.strip() == '').sum() print(f"Whitespace-only strings: {whitespace_only:,} ({whitespace_only/len(df)*100:.1f}%)") total_missing_names = original_nulls + empty_strings + whitespace_only print(f"Total missing/empty names: {total_missing_names:,} ({total_missing_names/len(df)*100:.1f}%)") valid_names = len(df) - total_missing_names print(f"Valid names: {valid_names:,} ({valid_names/len(df)*100:.1f}%)") # Show some examples of problematic names if any if total_missing_names > 0: print(f"\nSample of games with missing/empty names:") problem_mask = df['name'].isnull() | (df['name'] == '') | df['name'].apply(lambda x: isinstance(x, str) and x.strip() == '') problem_sample = df[problem_mask][['appid', 'name']].head(5) for idx, row in problem_sample.iterrows(): print(f" AppID {row['appid']}: name='{row['name']}'") # --- 5. 
Analyze CATEGORIES --- print(f"\n--- CATEGORIES ANALYSIS ---") original_nulls = df['categories'].isnull().sum() print(f"Original NULL values in categories: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)") empty_arrays = df['categories_parsed'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum() print(f"Empty category arrays after parsing: {empty_arrays:,} ({empty_arrays/len(df)*100:.1f}%)") games_with_categories = df['categories_parsed'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False).sum() print(f"Games with at least one category: {games_with_categories:,} ({games_with_categories/len(df)*100:.1f}%)") if games_with_categories > 0: # Count all categories all_categories = [] for categories_list in df['categories_parsed']: if isinstance(categories_list, list): all_categories.extend(categories_list) if all_categories: category_counts = Counter(all_categories) print(f"\nTop 10 most common categories:") for category, count in category_counts.most_common(10): print(f" '{category}': {count:,}") avg_categories_per_game = df['categories_parsed'].apply(lambda x: len(x) if isinstance(x, list) else 0).mean() print(f"\nAverage categories per game: {avg_categories_per_game:.2f}") # --- 6. 
Analyze SUPPORTED_LANGUAGES --- print(f"\n--- SUPPORTED_LANGUAGES ANALYSIS ---") original_nulls = df['supported_languages'].apply(lambda x: pd.isna(x) if not isinstance(x, list) else False).sum() print(f"Original NULL values: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)") empty_arrays = df['supported_languages'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True).sum() print(f"Empty language arrays after parsing: {empty_arrays:,} ({empty_arrays/len(df)*100:.1f}%)") games_with_languages = df['supported_languages'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False).sum() print(f"Games with at least one supported language: {games_with_languages:,} ({games_with_languages/len(df)*100:.1f}%)") if games_with_languages > 0: # Count all languages all_languages = [] for languages_list in df['supported_languages']: if isinstance(languages_list, list): all_languages.extend(languages_list) if all_languages: language_counts = Counter(all_languages) print(f"\nTop 10 most supported languages:") for language, count in language_counts.most_common(10): print(f" '{language}': {count:,}") avg_languages_per_game = df['supported_languages'].apply(lambda x: len(x) if isinstance(x, list) else 0).mean() print(f"\nAverage supported languages per game: {avg_languages_per_game:.2f}") # --- 7. 
# --- 7. GENRES ANALYSIS (from original script) ---
# Summarises NULL/empty genres, previews problematic rows, then counts genre
# frequencies. The flattened diff carried two identical passes building
# `all_genres_debug` and `all_genres` from the same column; they are merged
# into a single pass feeding one Counter (same printed output, half the work).
print(f"\n--- GENRES ANALYSIS ---")

if 'genres' in df.columns:
    original_nulls = df['genres'].isnull().sum()
    print(f"Original NULL values in genres: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)")

    games_with_genres = df['genres_parsed'].apply(
        lambda v: len(v) > 0 if isinstance(v, list) else False
    ).sum()
    print(f"Games with at least one genre: {games_with_genres:,} ({games_with_genres/len(df)*100:.1f}%)")

    # Show some examples of empty/problematic entries.
    empty_mask = df['genres_parsed'].apply(
        lambda v: len(v) == 0 if isinstance(v, list) else True
    )
    if empty_mask.sum() > 0:
        print(f"\nSample of games with empty genres:")
        empty_sample = df[empty_mask][['name', 'genres', 'genres_parsed']].head(3)
        for idx, row in empty_sample.iterrows():
            print(f"  '{row['name']}': original='{row['genres']}', parsed={row['genres_parsed']}")

    if games_with_genres > 0:
        # Single flattening pass over all parsed genre lists.
        all_genres = []
        for genres_list in df['genres_parsed']:
            if isinstance(genres_list, list):
                all_genres.extend(genres_list)

        if all_genres:
            genre_counts = Counter(all_genres)

            print(f"\nGenre counts (full dataset, before filtering):")
            for genre, count in genre_counts.most_common(15):
                print(f"  {genre}: {count:,}")

            print(f"\nTop 10 most common genres:")
            for genre, count in genre_counts.most_common(10):
                print(f"  '{genre}': {count:,}")

            # Distribution stats: how many genres a typical game carries.
            genre_lengths = df['genres_parsed'].apply(
                lambda v: len(v) if isinstance(v, list) else 0
            )
            avg_genres_per_game = genre_lengths.mean()
            print(f"\nAverage genres per game: {avg_genres_per_game:.2f}")

            max_genres = genre_lengths.max()
            print(f"Maximum genres on a single game: {max_genres}")

            if max_genres > 0:
                multi_genre_games = df[genre_lengths == max_genres]
                print(f"Game(s) with {max_genres} genres:")
                for idx, row in multi_genre_games[['name', 'genres_parsed']].head(2).iterrows():
                    print(f"  '{row['name']}': {row['genres_parsed']}")
        else:
            print("No genres found after parsing!")
else:
    print("No 'genres' column found in dataset!")

# --- 8. COMPREHENSIVE MISSING VALUES SUMMARY ---
print("\n" + "="*60)
print("COMPREHENSIVE MISSING VALUES SUMMARY")
print("="*60)

missing_summary = {}

# Basic missing values (NaN only), worst 15 columns first.
basic_missing = df.isnull().sum()
print(f"\n--- Standard Missing Values (NaN) ---")
for col, missing_count in basic_missing.sort_values(ascending=False).head(15).items():
    missing_pct = (missing_count / len(df)) * 100
    print(f"{col:25}: {missing_count:8,} ({missing_pct:5.1f}%)")
    missing_summary[col] = {'nan': missing_count, 'nan_pct': missing_pct}

# Parsed/processed missing values — emptiness after the parse helpers ran.
# NOTE(review): `total_missing_names` is computed in an earlier section not
# visible here — presumably a count of blank/whitespace-only names; verify.
print(f"\n--- Processed Missing Values ---")
parsed_missing = {
    'name (empty/whitespace)': total_missing_names,
    'estimated_owners (parsed)': df['estimated_owners_parsed'].isnull().sum(),
    'tags (empty dict)': df['tags'].apply(
        lambda v: len(v) == 0 if isinstance(v, dict) else True).sum(),
    'genres_parsed (empty list)': df['genres_parsed'].apply(
        lambda v: len(v) == 0 if isinstance(v, list) else True).sum(),
    'categories_parsed (empty list)': df['categories_parsed'].apply(
        lambda v: len(v) == 0 if isinstance(v, list) else True).sum(),
    'supported_languages (empty list)': df['supported_languages'].apply(
        lambda v: len(v) == 0 if isinstance(v, list) else True).sum(),
}
for col, missing_count in parsed_missing.items():
    missing_pct = (missing_count / len(df)) * 100
    print(f"{col:30}: {missing_count:8,} ({missing_pct:5.1f}%)")
# --- 9. Handle price=0 (from original script) ---
def clean_price(row):
    """Return a cleaned price for a single dataframe row.

    NOTE(review): the middle of this function is elided in the diff view —
    only the first two statements and the call site are visible. The logic
    below follows the section's stated intent (a zero price is treated as
    missing unless the game's tags indicate it is genuinely free-to-play);
    confirm against version control before relying on it.
    """
    price = row['price']
    tags = row['tags']
    if price == 0:
        # Keep 0 only for games the community actually tags as free.
        if isinstance(tags, dict) and 'Free to Play' in tags:
            return 0.0
        return np.nan
    return price


# Overwrite original price column
df['price'] = df.apply(clean_price, axis=1)

# --- 10. Clean playtime columns (from original script) ---
# A playtime of 0 most likely means "no data" (nobody played), so it is
# converted to NaN rather than treated as a real measurement.
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
for col in playtime_cols:
    df[col] = df[col].replace(0, np.nan)

# --- Missingness exploration ---
print("\n--- Dataset size ---")
print(df.shape)

print("\n--- Missing values (NaN only) ---")
print(df.isna().sum().sort_values(ascending=False))

# Zeros in numeric columns can be pseudo-missing (see playtime handling above).
print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---")
num_cols = df.select_dtypes(include=[np.number]).columns
zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False)
print(zero_counts)

# --- Basic EDA ---
print("\n--- Price overview ---")
print(df['price'].describe())

print("\n--- Playtime overview (hours) ---")
print((df['average_playtime_forever'] / 60).describe())  # convert mins → hours

print("\n--- Reviews overview ---")
print(df[['num_reviews_total', 'pct_pos_total']].describe())
# --- 11. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS ---
# Builds `df_complete`: the subset of rows that have all three key metrics
# (price and both "forever" playtime columns) present. Two-week playtime may
# still be NaN afterwards — that is expected and reported, not filtered.
print("\n" + "="*60)
print("FILTERING DATASET FOR COMPLETE KEY COLUMNS")
print("="*60)

print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns")

# Key columns we need for analysis.
key_columns = ['price', 'average_playtime_forever', 'median_playtime_forever']

print(f"\nMissing values in key columns before filtering:")
for col in key_columns:
    missing_count = df[col].isnull().sum()
    missing_pct = (missing_count / len(df)) * 100
    # NOTE(review): the per-column print was elided in the diff view; this
    # line reproduces the format used elsewhere in the script — confirm.
    print(f"{col}: {missing_count:,} ({missing_pct:.1f}%)")

# Drop every row missing any of the key columns.
df_complete = df.dropna(subset=key_columns)

print(f"\nFiltered dataset size: {df_complete.shape[0]:,} rows")
print(f"Data retention: {len(df_complete)/len(df)*100:.1f}%")

# Show how many rows still have NaN in the 2-week columns (this is fine).
two_week_cols = ['average_playtime_2weeks', 'median_playtime_2weeks']
for col in two_week_cols:
    if col in df_complete.columns:
        missing_2week = df_complete[col].isnull().sum()
        missing_2week_pct = (missing_2week / len(df_complete)) * 100
        print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%)")
# --- 12. ANALYSIS ON FILTERED DATASET ---
# Descriptive statistics on `df_complete` (rows with complete key columns),
# then a 2×2 summary figure and a final language/tag frequency report.
# Fixes vs the flattened source: the diff retained BOTH the old and the new
# plot layouts (each axis assigned twice, overdrawing); only the current
# layout is kept. `Counter` now consumes generator expressions instead of
# throwaway lists, and dicts are iterated directly instead of via `.keys()`.
print("\n" + "="*60)
print("ANALYSIS ON FILTERED DATASET")
print("="*60)

# Average playtime, converted minutes → hours.
playtime_hours = df_complete['average_playtime_forever'] / 60
print(f"\n--- Average Playtime Forever Analysis (n={len(df_complete):,}) ---")
print(f"Mean: {playtime_hours.mean():.2f} hours")
print(f"Median: {playtime_hours.median():.2f} hours")
print(f"Standard Deviation: {playtime_hours.std():.2f} hours")
print(f"Min: {playtime_hours.min():.2f} hours")
print(f"Max: {playtime_hours.max():.2f} hours")
print(f"25th percentile: {playtime_hours.quantile(0.25):.2f} hours")
print(f"75th percentile: {playtime_hours.quantile(0.75):.2f} hours")
print(f"95th percentile: {playtime_hours.quantile(0.95):.2f} hours")
print(f"99th percentile: {playtime_hours.quantile(0.99):.2f} hours")

# Median playtime (per-game median, also minutes → hours).
median_playtime_hours = df_complete['median_playtime_forever'] / 60
print(f"\n--- Median Playtime Forever Analysis (n={len(df_complete):,}) ---")
print(f"Mean of medians: {median_playtime_hours.mean():.2f} hours")
print(f"Median of medians: {median_playtime_hours.median():.2f} hours")
print(f"Standard Deviation: {median_playtime_hours.std():.2f} hours")
print(f"Min: {median_playtime_hours.min():.2f} hours")
print(f"Max: {median_playtime_hours.max():.2f} hours")
print(f"25th percentile: {median_playtime_hours.quantile(0.25):.2f} hours")
print(f"75th percentile: {median_playtime_hours.quantile(0.75):.2f} hours")

# Price statistics.
price_data = df_complete['price']
print(f"\n--- Price Analysis (n={len(df_complete):,}) ---")
print(f"Mean: ${price_data.mean():.2f}")
print(f"Median: ${price_data.median():.2f}")
print(f"Standard Deviation: ${price_data.std():.2f}")
print(f"Min: ${price_data.min():.2f}")
print(f"Max: ${price_data.max():.2f}")
print(f"25th percentile: ${price_data.quantile(0.25):.2f}")
print(f"75th percentile: ${price_data.quantile(0.75):.2f}")
print(f"95th percentile: ${price_data.quantile(0.95):.2f}")
print(f"99th percentile: ${price_data.quantile(0.99):.2f}")

# Free vs Paid games breakdown.
free_games = (price_data == 0).sum()
paid_games = (price_data > 0).sum()
print(f"\nFree games: {free_games:,} ({free_games/len(df_complete)*100:.1f}%)")
print(f"Paid games: {paid_games:,} ({paid_games/len(df_complete)*100:.1f}%)")

# Price ranges for paid games only.
paid_prices = price_data[price_data > 0]
if len(paid_prices) > 0:
    print(f"\n--- Paid Games Price Analysis (n={len(paid_prices):,}) ---")
    print(f"Mean: ${paid_prices.mean():.2f}")
    print(f"Median: ${paid_prices.median():.2f}")
    print(f"Under $10: {(paid_prices < 10).sum():,} ({(paid_prices < 10).mean()*100:.1f}%)")
    print(f"$10-$30: {((paid_prices >= 10) & (paid_prices < 30)).sum():,} "
          f"({((paid_prices >= 10) & (paid_prices < 30)).mean()*100:.1f}%)")
    print(f"$30-$60: {((paid_prices >= 30) & (paid_prices < 60)).sum():,} "
          f"({((paid_prices >= 30) & (paid_prices < 60)).mean()*100:.1f}%)")
    print(f"Over $60: {(paid_prices >= 60).sum():,} ({(paid_prices >= 60).mean()*100:.1f}%)")

# Pairwise Pearson correlations on the complete dataset.
correlation_playtime_price = df_complete['average_playtime_forever'].corr(df_complete['price'])
correlation_playtime_reviews = df_complete['average_playtime_forever'].corr(df_complete['num_reviews_total'])
correlation_price_reviews = df_complete['price'].corr(df_complete['num_reviews_total'])
print(f"\n--- Correlations (Complete Dataset) ---")
print(f"Average playtime vs Price: {correlation_playtime_price:.3f}")
print(f"Average playtime vs Total reviews: {correlation_playtime_reviews:.3f}")
print(f"Price vs Total reviews: {correlation_price_reviews:.3f}")

# Additional insights.
print(f"\n--- Additional Insights (Complete Dataset) ---")
print(f"Games with >100 hours avg playtime: {(playtime_hours > 100).sum():,} "
      f"({(playtime_hours > 100).mean()*100:.1f}%)")
print(f"Games with >1000 reviews: {(df_complete['num_reviews_total'] > 1000).sum():,} "
      f"({(df_complete['num_reviews_total'] > 1000).mean()*100:.1f}%)")
print(f"Games with >90% positive reviews: {(df_complete['pct_pos_total'] > 90).sum():,} "
      f"({(df_complete['pct_pos_total'] > 90).mean()*100:.1f}%)")

# --- 13. Visualization of Missing Values ---
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# ax1: worst 20 columns by NaN count (full dataset).
missing_data = df.isnull().sum().sort_values(ascending=False).head(20)
missing_data.plot(kind='bar', ax=ax1, title="Missing Values by Column (Top 20)")
ax1.set_xlabel("Columns")
ax1.set_ylabel("Count of Missing Values")
ax1.tick_params(axis='x', rotation=45)

# ax2: price distribution on the filtered dataset (log count axis).
df_complete['price'].plot(kind='hist', bins=50, logy=True,
                          title="Price Distribution (Filtered Dataset)", ax=ax2)
ax2.set_xlabel("Price ($)")
ax2.set_ylabel("Count (log scale)")

# ax3: average playtime distribution (log-log), zeros excluded.
playtime_hours_plot = playtime_hours[playtime_hours > 0]
playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True,
                         title="Average Playtime Distribution", ax=ax3)
ax3.set_xlabel("Average Playtime (hours, log scale)")
ax3.set_ylabel("Count (log scale)")

# ax4: completeness (%) per key parsed column.
# NOTE(review): `total_missing_names`, `games_with_tags`, `games_with_genres`
# and `games_with_categories` are computed in earlier sections not fully
# visible here — verify they are in scope at this point.
completeness_data = {
    'Names': (len(df) - total_missing_names) / len(df) * 100,
    'Estimated Owners': df['estimated_owners_parsed'].notnull().sum() / len(df) * 100,
    'Tags': games_with_tags / len(df) * 100,
    'Genres': games_with_genres / len(df) * 100,
    'Categories': games_with_categories / len(df) * 100,
    'Languages': games_with_languages / len(df) * 100,
}
ax4.bar(completeness_data.keys(), completeness_data.values())
ax4.set_title("Data Completeness by Column (%)")
ax4.set_ylabel("Completeness (%)")
ax4.tick_params(axis='x', rotation=45)
ax4.set_ylim(0, 100)

plt.tight_layout()
plt.show()

# --- 14. Language & Tag exploration (Filtered Dataset) ---
lang_counts = Counter(lang for langs in df_complete['supported_languages'] for lang in langs)
print("\n--- Top supported languages (Filtered Dataset) ---")
print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

tag_counts = Counter(tag for tags in df_complete['tags'] for tag in tags)
print("\n--- Top tags (Filtered Dataset) ---")
print(pd.Series(tag_counts).sort_values(ascending=False).head(10))

print("\nAnalysis complete! Dataset columns available:")
print(list(df.columns))