Commit 788de70d authored by danielczinege's avatar danielczinege
Browse files

feat: look for "Free to Play" in both tags and genres

parent c1f6141a
Loading
Loading
Loading
Loading
+19 −24
Original line number Diff line number Diff line
@@ -262,16 +262,31 @@ print(zero_counts)

# --- 7. Handle price=0 (BEFORE comprehensive summary) ---
def clean_price(row):
    """
    Return the cleaned price for a single row.

    A price of 0 is only trusted when the row is labelled 'Free to Play'
    (case-insensitive) in either its tags or its genres. Otherwise the zero
    is treated as missing data and replaced with NaN. Any price that is not
    equal to 0 (including NaN) is returned unchanged.

    Parameters:
        row: mapping-like row (for example the Series passed by
            ``df.apply(..., axis=1)``) providing 'price', 'tags' and
            'genres_parsed'. Tags are inspected only when they are a dict
            (the keys are the tag names); genres only when they are a list.

    Returns:
        The original price, 0 for a confirmed free-to-play row, or np.nan
        for a zero price without a free-to-play label.
    """
    price = row['price']
    tags = row['tags']
    genres = row['genres_parsed']

    # Guard clause: only zero prices need validating (NaN != 0 is True, so
    # missing prices pass straight through).
    if price != 0:
        return price

    # Collect every label once instead of running separate, duplicated
    # checks for tags and genres.
    labels = []
    if isinstance(tags, dict):
        labels.extend(tags)  # iterating a dict yields its keys (tag names)
    if isinstance(genres, list):
        labels.extend(genres)

    # Non-string labels (None, NaN, numbers) are ignored rather than raising
    # AttributeError on .lower().
    is_free_to_play = any(
        isinstance(label, str) and label.lower() == "free to play"
        for label in labels
    )

    # Zero price without a free-to-play label is treated as missing data.
    return 0 if is_free_to_play else np.nan

# Overwrite the original price column with the cleaned values: clean_price is
# applied to every row (axis=1) and its return value becomes the new price.
df['price'] = df.apply(clean_price, axis=1)

# --- 8. Clean playtime columns (BEFORE comprehensive summary) ---
@@ -312,26 +327,6 @@ for col, missing_count in parsed_missing.items():
    missing_pct = (missing_count / len(df)) * 100
    print(f"{col:35}: {missing_count:8,} ({missing_pct:5.1f}%)")

# --- 10. Handle price=0 (from original script) ---
def clean_price(row):
    """
    Return the cleaned price for a single row.

    A price of 0 is kept only when the row is labelled 'Free to Play'
    (case-insensitive) in its tags or, when available, its genres. Any other
    zero price is treated as missing and replaced with NaN. Prices that are
    not equal to 0 (including NaN) are returned unchanged.

    Parameters:
        row: mapping-like row (for example the Series passed by
            ``df.apply(..., axis=1)``) providing 'price' and 'tags', and
            optionally 'genres_parsed'. Tags are inspected only when they
            are a dict (the keys are the tag names); genres only when they
            are a list.

    Returns:
        The original price, 0 for a confirmed free-to-play row, or np.nan
        for a zero price without a free-to-play label.
    """
    price = row['price']
    tags = row['tags']
    # .get() keeps rows without a 'genres_parsed' entry working: they simply
    # fall back to the tag-only check.
    genres = row.get('genres_parsed')

    # Only zero prices need validating.
    if price != 0:
        return price

    labels = []
    if isinstance(tags, dict):
        labels.extend(tags)  # iterating a dict yields its keys (tag names)
    if isinstance(genres, list):
        labels.extend(genres)

    # Non-string labels are ignored rather than raising on .lower().
    is_free_to_play = any(
        isinstance(label, str) and label.lower() == "free to play"
        for label in labels
    )

    # Zero price without a free-to-play label is treated as missing data.
    return 0 if is_free_to_play else np.nan

# Overwrite original price column by applying clean_price to every row (axis=1).
# NOTE(review): the same assignment already runs in step 7 above, so this looks
# like a redundant second pass — confirm it is still needed.
df['price'] = df.apply(clean_price, axis=1)

# --- 11. Clean playtime columns (from original script) ---
# Zeros in each playtime column are replaced with NaN, one column at a time.
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
for col in playtime_cols:
    df[col] = df[col].replace(to_replace=0, value=np.nan)

# --- 12. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS ---
# Section banner: a blank line, a 60-character rule, then the section title.
print(f"\n{'=' * 60}")
print("FILTERING DATASET FOR COMPLETE KEY COLUMNS")