# NOTE: diff context preceding this hunk (tail of parse_tags, truncated in this view):
#         except (ValueError, SyntaxError):
#             return {}


def _clean_genre_items(items):
    """Return the non-empty, whitespace-stripped string form of each item.

    ``None`` entries are skipped; every other item is converted with ``str``
    so the result is always hashable and safe to feed to ``Counter``.
    """
    stripped = (str(item).strip() for item in items if item is not None)
    return [name for name in stripped if name]


def parse_genres(x):
    """Parse a raw ``genres`` cell into a list of genre names.

    Accepted inputs:
      * a list/tuple -- items are normalised and returned as a new list;
      * a string holding a Python list literal, e.g. "['Action', 'Indie']";
      * a bracketed but unquoted string, e.g. "[Action, Indie]";
      * a comma-separated string, e.g. "Action, Indie";
      * a single genre name, e.g. "Action".

    Any other value (None, NaN, numbers, ...) yields an empty list.

    Returns:
        A new list of non-empty, whitespace-stripped strings.
    """
    # Sequences are handled before any missing-value test: pd.isna() on a
    # list returns an element-wise array, and using that array in an ``if``
    # raises "truth value is ambiguous" for lists with more than one item.
    if isinstance(x, (list, tuple)):
        return _clean_genre_items(x)
    if not isinstance(x, str):
        return []

    text = x.strip()
    if not text:
        return []

    if text.startswith('[') and text.endswith(']'):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a valid Python literal (e.g. unquoted names); split by hand below.
            parsed = None
        if isinstance(parsed, list):
            return _clean_genre_items(parsed)
        # If it's just a comma-separated string wrapped in brackets
        return _clean_genre_items(
            part.strip().strip('"').strip("'")
            for part in text.strip('[]').split(',')
        )

    # Comma-separated string, or a single genre when there is no comma.
    return _clean_genre_items(text.split(','))


df['tags'] = df['tags'].apply(parse_tags)

# Parse genres early in the pipeline
if 'genres' in df.columns:
    df['genres_parsed'] = df['genres'].apply(parse_genres)

    # parse_genres always returns a list, so a single length pass feeds
    # every statistic below.
    genres_per_game = df['genres_parsed'].apply(len)
    # Avoid dividing by zero in the percentages when the frame is empty.
    row_count = len(df) or 1

    # Check for missing/empty genres
    print("\n--- GENRE MISSING VALUES ANALYSIS ---")
    original_nulls = df['genres'].isnull().sum()
    print(
        f"Original NULL values in genres: {original_nulls:,} "
        f"({original_nulls / row_count * 100:.1f}%)"
    )

    empty_mask = genres_per_game == 0
    empty_arrays = empty_mask.sum()
    print(
        f"Empty genre arrays after parsing: {empty_arrays:,} "
        f"({empty_arrays / row_count * 100:.1f}%)"
    )

    games_with_genres = (genres_per_game > 0).sum()
    print(
        f"Games with at least one genre: {games_with_genres:,} "
        f"({games_with_genres / row_count * 100:.1f}%)"
    )

    # Show some examples of empty/problematic entries
    if empty_arrays > 0:
        print("\nSample of games with empty genres:")
        empty_sample = df.loc[empty_mask, ['name', 'genres', 'genres_parsed']].head(3)
        for _, row in empty_sample.iterrows():
            print(f" '{row['name']}': original='{row['genres']}', parsed={row['genres_parsed']}")

    # Count all genres immediately
    all_genres_debug = [genre for genres in df['genres_parsed'] for genre in genres]

    if all_genres_debug:
        genre_counts_debug = Counter(all_genres_debug)
        print("\nGenre counts (full dataset, before filtering):")
        for genre, count in genre_counts_debug.most_common(15):
            print(f" {genre}: {count:,}")

        # Show genre distribution stats
        avg_genres_per_game = genres_per_game.mean()
        print(f"\nAverage genres per game: {avg_genres_per_game:.2f}")

        # Show games with most genres
        max_genres = genres_per_game.max()
        print(f"Maximum genres on a single game: {max_genres}")
        if max_genres > 0:
            multi_genre_games = df[genres_per_game == max_genres]
            print(f"Game(s) with {max_genres} genres:")
            for _, row in multi_genre_games[['name', 'genres_parsed']].head(2).iterrows():
                print(f" '{row['name']}': {row['genres_parsed']}")
    else:
        print("No genres found after parsing!")
else:
    print("No 'genres' column found in dataset!")

# --- 2. Handle price=0 ---
# NOTE: diff context following this hunk (clean_price is truncated in this view):
# def clean_price(row):
#     price = row['price']
# NOTE: diff context preceding this hunk (tail of parse_tags, truncated in this view):
#         except (ValueError, SyntaxError):
#             return {}


def _clean_genre_items(items):
    """Return the non-empty, whitespace-stripped string form of each item.

    ``None`` entries are skipped; every other item is converted with ``str``
    so the result is always hashable and safe to feed to ``Counter``.
    """
    stripped = (str(item).strip() for item in items if item is not None)
    return [name for name in stripped if name]


def parse_genres(x):
    """Parse a raw ``genres`` cell into a list of genre names.

    Accepted inputs:
      * a list/tuple -- items are normalised and returned as a new list;
      * a string holding a Python list literal, e.g. "['Action', 'Indie']";
      * a bracketed but unquoted string, e.g. "[Action, Indie]";
      * a comma-separated string, e.g. "Action, Indie";
      * a single genre name, e.g. "Action".

    Any other value (None, NaN, numbers, ...) yields an empty list.

    Returns:
        A new list of non-empty, whitespace-stripped strings.
    """
    # Sequences are handled before any missing-value test: pd.isna() on a
    # list returns an element-wise array, and using that array in an ``if``
    # raises "truth value is ambiguous" for lists with more than one item.
    if isinstance(x, (list, tuple)):
        return _clean_genre_items(x)
    if not isinstance(x, str):
        return []

    text = x.strip()
    if not text:
        return []

    if text.startswith('[') and text.endswith(']'):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a valid Python literal (e.g. unquoted names); split by hand below.
            parsed = None
        if isinstance(parsed, list):
            return _clean_genre_items(parsed)
        # If it's just a comma-separated string wrapped in brackets
        return _clean_genre_items(
            part.strip().strip('"').strip("'")
            for part in text.strip('[]').split(',')
        )

    # Comma-separated string, or a single genre when there is no comma.
    return _clean_genre_items(text.split(','))


df['tags'] = df['tags'].apply(parse_tags)

# Parse genres early in the pipeline
if 'genres' in df.columns:
    df['genres_parsed'] = df['genres'].apply(parse_genres)

    # parse_genres always returns a list, so a single length pass feeds
    # every statistic below.
    genres_per_game = df['genres_parsed'].apply(len)
    # Avoid dividing by zero in the percentages when the frame is empty.
    row_count = len(df) or 1

    # Check for missing/empty genres
    print("\n--- GENRE MISSING VALUES ANALYSIS ---")
    original_nulls = df['genres'].isnull().sum()
    print(
        f"Original NULL values in genres: {original_nulls:,} "
        f"({original_nulls / row_count * 100:.1f}%)"
    )

    empty_mask = genres_per_game == 0
    empty_arrays = empty_mask.sum()
    print(
        f"Empty genre arrays after parsing: {empty_arrays:,} "
        f"({empty_arrays / row_count * 100:.1f}%)"
    )

    games_with_genres = (genres_per_game > 0).sum()
    print(
        f"Games with at least one genre: {games_with_genres:,} "
        f"({games_with_genres / row_count * 100:.1f}%)"
    )

    # Show some examples of empty/problematic entries
    if empty_arrays > 0:
        print("\nSample of games with empty genres:")
        empty_sample = df.loc[empty_mask, ['name', 'genres', 'genres_parsed']].head(3)
        for _, row in empty_sample.iterrows():
            print(f" '{row['name']}': original='{row['genres']}', parsed={row['genres_parsed']}")

    # Count all genres immediately
    all_genres_debug = [genre for genres in df['genres_parsed'] for genre in genres]

    if all_genres_debug:
        genre_counts_debug = Counter(all_genres_debug)
        print("\nGenre counts (full dataset, before filtering):")
        for genre, count in genre_counts_debug.most_common(15):
            print(f" {genre}: {count:,}")

        # Show genre distribution stats
        avg_genres_per_game = genres_per_game.mean()
        print(f"\nAverage genres per game: {avg_genres_per_game:.2f}")

        # Show games with most genres
        max_genres = genres_per_game.max()
        print(f"Maximum genres on a single game: {max_genres}")
        if max_genres > 0:
            multi_genre_games = df[genres_per_game == max_genres]
            print(f"Game(s) with {max_genres} genres:")
            for _, row in multi_genre_games[['name', 'genres_parsed']].head(2).iterrows():
                print(f" '{row['name']}': {row['genres_parsed']}")
    else:
        print("No genres found after parsing!")
else:
    print("No 'genres' column found in dataset!")

# --- 2. Handle price=0 ---
# NOTE: diff context following this hunk (clean_price is truncated in this view):
# def clean_price(row):
#     price = row['price']