Commit eaed9908 authored by danielczinege's avatar danielczinege
Browse files

feat: remove all NaNs

parent 86b13362
Loading
Loading
Loading
Loading
+137 −25
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@ import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
from collections import Counter

# Load dataset (example CSV)
df = pd.read_csv("./data/games_03_2025_reduced.csv")
@@ -39,17 +40,17 @@ def clean_price(row):
            return np.nan  # treat as missing
    return price

# NOTE(review): this span was a rendered diff containing both the removed
# `*_clean`-column lines and their replacements; resolved here to the
# post-commit version, which overwrites the original columns in place.

# Overwrite original price column with the cleaned values
# (clean_price is defined above; returns np.nan for unparseable prices).
df['price'] = df.apply(clean_price, axis=1)

# --- 3. Clean playtime columns (overwrite originals) ---
# 0 playtime might mean missing if no one played / no data,
# so treat zeros as missing rather than as true zero playtime.
playtime_cols = ['average_playtime_forever', 'median_playtime_forever',
                 'average_playtime_2weeks', 'median_playtime_2weeks']
for col in playtime_cols:
    df[col] = df[col].replace(0, np.nan)

# --- 4. Missingness exploration ---
print("\n--- Dataset size ---")
print(df.shape)
# NOTE(review): this span was a rendered diff (``@@`` hunk markers plus both
# removed and added lines); resolved here to the post-commit version so the
# script is valid Python. The ``num_cols`` assignment, which the diff showed
# only in the hunk-header context, is restated so this section stands alone
# (re-assignment is harmless if it also appears earlier in the file).
num_cols = df.select_dtypes(include=[np.number]).columns
zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False)
print(zero_counts)

# --- 5. Basic EDA ---
print("\n--- Price overview ---")
print(df['price'].describe())

print("\n--- Playtime overview (hours) ---")
print((df['average_playtime_forever'] / 60).describe())  # convert mins → hours

print("\n--- Reviews overview ---")
print(df[['num_reviews_total', 'pct_pos_total']].describe())

# --- 6. REMOVE ROWS WITH ANY NaN VALUES ---
print("\n" + "="*60)
print("REMOVING ROWS WITH ANY NaN VALUES")
print("="*60)

print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns")
print(f"Rows with any NaN: {df.isnull().any(axis=1).sum():,}")

# Remove all rows that have at least one NaN value.
# NOTE: zero playtimes were converted to NaN above, so dropna() also drops
# every game nobody played — the "complete" set is biased toward played games.
df_complete = df.dropna()
print(f"Complete dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns")
print(f"Data retention: {len(df_complete)/len(df)*100:.1f}%")

# --- 7. ANALYSIS ON COMPLETE DATASET ---
print("\n" + "="*60)
print("ANALYSIS ON COMPLETE DATASET (NO NaN VALUES)")
print("="*60)

# Analysis for average_playtime_forever
playtime_hours = df_complete['average_playtime_forever'] / 60  # convert to hours

print(f"\n--- Average Playtime Forever Analysis (n={len(df_complete):,}) ---")
print(f"Mean: {playtime_hours.mean():.2f} hours")
print(f"Median: {playtime_hours.median():.2f} hours")
print(f"Standard Deviation: {playtime_hours.std():.2f} hours")
print(f"Min: {playtime_hours.min():.2f} hours")
print(f"Max: {playtime_hours.max():.2f} hours")
print(f"25th percentile: {playtime_hours.quantile(0.25):.2f} hours")
print(f"75th percentile: {playtime_hours.quantile(0.75):.2f} hours")
print(f"95th percentile: {playtime_hours.quantile(0.95):.2f} hours")
print(f"99th percentile: {playtime_hours.quantile(0.99):.2f} hours")

# Analysis for median_playtime_forever
median_playtime_hours = df_complete['median_playtime_forever'] / 60

print(f"\n--- Median Playtime Forever Analysis (n={len(df_complete):,}) ---")
print(f"Mean of medians: {median_playtime_hours.mean():.2f} hours")
print(f"Median of medians: {median_playtime_hours.median():.2f} hours")
print(f"Standard Deviation: {median_playtime_hours.std():.2f} hours")
print(f"Min: {median_playtime_hours.min():.2f} hours")
print(f"Max: {median_playtime_hours.max():.2f} hours")
print(f"25th percentile: {median_playtime_hours.quantile(0.25):.2f} hours")
print(f"75th percentile: {median_playtime_hours.quantile(0.75):.2f} hours")

# Price analysis
price_data = df_complete['price']

print(f"\n--- Price Analysis (n={len(df_complete):,}) ---")
print(f"Mean: ${price_data.mean():.2f}")
print(f"Median: ${price_data.median():.2f}")
print(f"Standard Deviation: ${price_data.std():.2f}")
print(f"Min: ${price_data.min():.2f}")
print(f"Max: ${price_data.max():.2f}")
print(f"25th percentile: ${price_data.quantile(0.25):.2f}")
print(f"75th percentile: ${price_data.quantile(0.75):.2f}")
print(f"95th percentile: ${price_data.quantile(0.95):.2f}")
print(f"99th percentile: ${price_data.quantile(0.99):.2f}")

# Free vs Paid games breakdown (price == 0 means free-to-play here)
free_games = (price_data == 0).sum()
paid_games = (price_data > 0).sum()
print(f"\nFree games: {free_games:,} ({free_games/len(df_complete)*100:.1f}%)")
print(f"Paid games: {paid_games:,} ({paid_games/len(df_complete)*100:.1f}%)")

# Price ranges for paid games only
paid_prices = price_data[price_data > 0]
if len(paid_prices) > 0:
    print(f"\n--- Paid Games Price Analysis (n={len(paid_prices):,}) ---")
    print(f"Mean: ${paid_prices.mean():.2f}")
    print(f"Median: ${paid_prices.median():.2f}")
    print(f"Under $10: {(paid_prices < 10).sum():,} ({(paid_prices < 10).mean()*100:.1f}%)")
    print(f"$10-$30: {((paid_prices >= 10) & (paid_prices < 30)).sum():,} ({((paid_prices >= 10) & (paid_prices < 30)).mean()*100:.1f}%)")
    print(f"$30-$60: {((paid_prices >= 30) & (paid_prices < 60)).sum():,} ({((paid_prices >= 30) & (paid_prices < 60)).mean()*100:.1f}%)")
    print(f"Over $60: {(paid_prices >= 60).sum():,} ({(paid_prices >= 60).mean()*100:.1f}%)")

# Correlations on complete dataset (Pearson, pairwise on NaN-free rows)
correlation_playtime_price = df_complete['average_playtime_forever'].corr(df_complete['price'])
correlation_playtime_reviews = df_complete['average_playtime_forever'].corr(df_complete['num_reviews_total'])
correlation_price_reviews = df_complete['price'].corr(df_complete['num_reviews_total'])

print(f"\n--- Correlations (Complete Dataset) ---")
print(f"Average playtime vs Price: {correlation_playtime_price:.3f}")
print(f"Average playtime vs Total reviews: {correlation_playtime_reviews:.3f}")
print(f"Price vs Total reviews: {correlation_price_reviews:.3f}")

# Additional insights
print(f"\n--- Additional Insights (Complete Dataset) ---")
print(f"Games with >100 hours avg playtime: {(playtime_hours > 100).sum():,} ({(playtime_hours > 100).mean()*100:.1f}%)")
print(f"Games with >1000 reviews: {(df_complete['num_reviews_total'] > 1000).sum():,} ({(df_complete['num_reviews_total'] > 1000).mean()*100:.1f}%)")
print(f"Games with >90% positive reviews: {(df_complete['pct_pos_total'] > 90).sum():,} ({(df_complete['pct_pos_total'] > 90).mean()*100:.1f}%)")

# --- 8. Visualization Examples (Complete Dataset) ---
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Distribution of prices
df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Complete Dataset)", ax=ax1)
ax1.set_xlabel("Price ($)")
ax1.set_ylabel("Count (log scale)")

# Distribution of average playtime (in hours, log scale)
playtime_hours_plot = playtime_hours[playtime_hours > 0]  # Remove any remaining zeros
playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True, 
                        title="Average Playtime Distribution (Complete Dataset)", ax=ax2)
ax2.set_xlabel("Average Playtime (hours, log scale)")
ax2.set_ylabel("Count (log scale)")

# Playtime vs Reviews
ax3.scatter(df_complete['average_playtime_forever']/60, df_complete['num_reviews_total'], alpha=0.3)
ax3.set_xscale("log")
ax3.set_yscale("log")
ax3.set_xlabel("Average Playtime (hours, log)")
ax3.set_ylabel("Total Reviews (log)")
ax3.set_title("Playtime vs Reviews (Complete Dataset)")

# Price vs Reviews (for paid games; log axes cannot show free games at 0)
paid_mask = df_complete['price'] > 0
if paid_mask.sum() > 0:
    ax4.scatter(df_complete[paid_mask]['price'], df_complete[paid_mask]['num_reviews_total'], alpha=0.3)
    ax4.set_xscale("log")
    ax4.set_yscale("log")
    ax4.set_xlabel("Price ($, log)")
    ax4.set_ylabel("Total Reviews (log)")
    ax4.set_title("Price vs Reviews - Paid Games (Complete Dataset)")

plt.tight_layout()
plt.show()

# --- 9. Language & Tag exploration (Complete Dataset) ---
# Top supported languages
# assumes 'supported_languages' holds iterables of language names — TODO confirm upstream parsing
lang_counts = Counter([lang for langs in df_complete['supported_languages'] for lang in langs])
print("\n--- Top supported languages (Complete Dataset) ---")
print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

# Top tags
# assumes 'tags' holds dict-like values keyed by tag name — TODO confirm upstream parsing
tag_counts = Counter([tag for tags in df_complete['tags'] for tag in tags.keys()])
print("\n--- Top tags (Complete Dataset) ---")
print(pd.Series(tag_counts).sort_values(ascending=False).head(10))