Commit 9268ed9e authored by danielczinege
Browse files

feat: add genres information - missing and counts

parent ffabdda0
Loading
Loading
Loading
Loading
+73 −0
Original line number Diff line number Diff line
@@ -27,8 +27,81 @@ def parse_tags(x):
    except (ValueError, SyntaxError):
        return {}

def parse_genres(x):
    """Parse a raw ``genres`` cell into a list of genres.

    Accepted inputs:

    * an existing ``list`` -- returned unchanged;
    * a string holding a Python list literal, e.g. ``"['Action', 'Indie']"``;
    * a bracketed but unquoted string, e.g. ``"[Action, Indie]"``;
    * a comma-separated string, e.g. ``"Action, Indie"``;
    * a single genre name, e.g. ``"Action"``.

    Anything else (``None``, ``NaN``, numbers, ...) yields an empty list.

    Args:
        x: Raw cell value.

    Returns:
        list: The parsed genres; empty when nothing usable is found.
    """
    # Lists are handled first: pd.isna() on a list returns an element-wise
    # array, and using that array as an `if` condition raises ValueError.
    if isinstance(x, list):
        return x
    # Missing values (None / NaN / pd.NA) are never strings, so they are
    # rejected here together with every other unsupported type.
    if not isinstance(x, str):
        return []

    # Strip once up front so surrounding whitespace cannot hide the brackets.
    text = x.strip()
    if not text:
        return []

    if text.startswith('[') and text.endswith(']'):
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal (e.g. unquoted names): split by hand, drop
            # stray quotes, and discard entries that end up empty.
            cleaned = (
                part.strip().strip('"').strip("'")
                for part in text.strip('[]').split(',')
            )
            return [genre for genre in cleaned if genre]

    if ',' in text:
        return [part.strip() for part in text.split(',') if part.strip()]
    return [text]

# Replace each raw 'tags' cell with the value returned by parse_tags.
df['tags'] = df['tags'].apply(parse_tags)

# Parse genres early in the pipeline and print diagnostics about missing
# values and genre frequencies.
if 'genres' in df.columns:
    df['genres_parsed'] = df['genres'].apply(parse_genres)

    # Per-row genre count, computed once and reused by every statistic below.
    # Anything that is not a list counts as zero genres.
    genres_per_game = df['genres_parsed'].apply(
        lambda genres: len(genres) if isinstance(genres, list) else 0
    )
    empty_mask = genres_per_game == 0
    # Percentages divide by the row count; fall back to 1 on an empty dataset
    # (every count is 0 there) instead of evaluating 0/0.
    total_rows = max(len(df), 1)

    # Check for missing/empty genres
    print("\n--- GENRE MISSING VALUES ANALYSIS ---")
    original_nulls = df['genres'].isnull().sum()
    print(f"Original NULL values in genres: {original_nulls:,} ({original_nulls/total_rows*100:.1f}%)")

    empty_arrays = empty_mask.sum()
    print(f"Empty genre arrays after parsing: {empty_arrays:,} ({empty_arrays/total_rows*100:.1f}%)")

    games_with_genres = (genres_per_game > 0).sum()
    print(f"Games with at least one genre: {games_with_genres:,} ({games_with_genres/total_rows*100:.1f}%)")

    # Show some examples of empty/problematic entries
    if empty_arrays > 0:
        print("\nSample of games with empty genres:")
        empty_sample = df[empty_mask][['name', 'genres', 'genres_parsed']].head(3)
        for _, row in empty_sample.iterrows():
            print(f"  '{row['name']}': original='{row['genres']}', parsed={row['genres_parsed']}")

    # Flatten every per-game list into one list so genres can be counted
    all_genres_debug = [
        genre
        for genres_list in df['genres_parsed']
        if isinstance(genres_list, list)
        for genre in genres_list
    ]

    if all_genres_debug:
        genre_counts_debug = Counter(all_genres_debug)
        print("\nGenre counts (full dataset, before filtering):")
        for genre, count in genre_counts_debug.most_common(15):
            print(f"  {genre}: {count:,}")

        # Show genre distribution stats
        avg_genres_per_game = genres_per_game.mean()
        print(f"\nAverage genres per game: {avg_genres_per_game:.2f}")

        # Show games with most genres
        max_genres = genres_per_game.max()
        print(f"Maximum genres on a single game: {max_genres}")

        if max_genres > 0:
            multi_genre_games = df[genres_per_game == max_genres]
            print(f"Game(s) with {max_genres} genres:")
            for _, row in multi_genre_games[['name', 'genres_parsed']].head(2).iterrows():
                print(f"  '{row['name']}': {row['genres_parsed']}")
    else:
        print("No genres found after parsing!")
else:
    print("No 'genres' column found in dataset!")

# --- 2. Handle price=0 ---
def clean_price(row):
    price = row['price']