Commit c1f6141a authored by danielczinege's avatar danielczinege
Browse files

fix: do price and playtime cleaning before summary

parent eaebbd90
Loading
Loading
Loading
Loading
+24 −3
Original line number Diff line number Diff line
@@ -227,7 +227,7 @@ if games_with_languages > 0:
        avg_languages_per_game = df['supported_languages'].apply(lambda x: len(x) if isinstance(x, list) else 0).mean()
        print(f"\nAverage supported languages per game: {avg_languages_per_game:.2f}")

# --- 7. GENRES ANALYSIS (from original script) ---
# --- 9. GENRES ANALYSIS (from original script) ---
print(f"\n--- GENRES ANALYSIS ---")
# Count the null entries in the 'genres' column.
original_nulls = df['genres'].isnull().sum()
# Report the count with thousands separators and as a percentage of all rows in df.
print(f"Original NULL values in genres: {original_nulls:,} ({original_nulls/len(df)*100:.1f}%)")
@@ -254,13 +254,34 @@ if games_with_genres > 0:
        avg_genres_per_game = df['genres_parsed'].apply(lambda x: len(x) if isinstance(x, list) else 0).mean()
        print(f"\nAverage genres per game: {avg_genres_per_game:.2f}")

# --- 8. Pseudo-missing values analysis (zeros in numeric columns) ---
# --- 10. Pseudo-missing values analysis (zeros in numeric columns) ---
print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---")
# Only numeric columns can hold a literal 0, so restrict the tally to them and
# list the columns with the most zeros first.
num_cols = df.select_dtypes(include=np.number).columns
zero_counts = df[num_cols].eq(0).sum().sort_values(ascending=False)
print(zero_counts)

# --- 9. COMPREHENSIVE MISSING VALUES SUMMARY ---
# --- 7. Handle price=0 (BEFORE comprehensive summary) ---
def clean_price(row):
    """Return the row's price, turning an unexplained zero into NaN.

    ``row`` must support ``row['price']`` and ``row['tags']``. A price of 0 is
    kept only when ``tags`` is a dict that has a key equal to "free to play"
    (compared case-insensitively); any other zero price is returned as
    ``np.nan``. Non-zero prices are returned unchanged.
    """
    amount = row['price']
    tag_map = row['tags']

    # Anything other than an exact zero is taken at face value.
    if amount != 0:
        return amount

    if isinstance(tag_map, dict):
        for tag_name in tag_map:
            if tag_name.lower() == "free to play":
                return 0  # true free to play

    return np.nan  # zero price without the tag: treat as missing

# Replace the price column in place with the cleaned, row-wise result.
df['price'] = df.apply(clean_price, axis=1)

# --- 8. Clean playtime columns (BEFORE comprehensive summary) ---
# A playtime of 0 may simply mean "no data" (nobody played / nothing recorded),
# so zeros are stored as NaN instead of as real measurements.
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
df[playtime_cols] = df[playtime_cols].replace(0, np.nan)

# --- 11. COMPREHENSIVE MISSING VALUES SUMMARY ---
# Banner: a blank line, then the title framed by two 60-character rules.
print(
    "\n" + "=" * 60,
    "COMPREHENSIVE MISSING VALUES SUMMARY",
    "=" * 60,
    sep="\n",
)