Commit 86b13362 authored by danielczinege's avatar danielczinege
Browse files

feat: initial EDA

parent 95e481b7
Loading
Loading
Loading
Loading
+99 −0
Original line number Diff line number Diff line
import ast
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load dataset (example CSV)
df = pd.read_csv("./data/games_03_2025_reduced.csv")

# --- 1. Parse JSON/array columns ---
def _parse_languages(value):
    """Turn one raw ``supported_languages`` cell into a list.

    A cell is expected to hold a list literal such as
    "['English', 'French']". Everything else -- NaN, non-string values,
    text that does not start with '[', or a literal that fails to parse --
    yields an empty list, so a single bad row cannot abort the script.

    Args:
        value: the raw cell value read from the CSV.

    Returns:
        The parsed list, or ``[]`` when the cell holds no usable list.
    """
    if isinstance(value, list):
        # Already parsed (e.g. this section was re-run on the same frame).
        return value
    if not isinstance(value, str):
        # NaN and any other non-string value: nothing to parse.
        return []
    text = value.strip()
    if not text.startswith('['):
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Malformed literal: treat the cell as having no data.
        return []
    return parsed if isinstance(parsed, list) else []


# Applied whatever dtype the column was read as: checking for `object` only
# would leave the raw values in place when the column comes back with another
# dtype (an all-NaN float column, or a dedicated string dtype), and the
# language counting further down expects every cell to be a list.
df['supported_languages'] = df['supported_languages'].apply(_parse_languages)

def parse_tags(x):
    """
    Parse one raw ``tags`` cell into a ``{tag: count}`` dictionary.

    Handles NaN values, dictionary-like strings ("{'tag': count}"),
    and empty list strings ("[]").

    The result is always a dict. Anything that is not a dict literal
    (missing values, non-string cells, list or number literals, text that
    cannot be parsed) becomes an empty dict, so callers can rely on
    ``.keys()`` being available.

    Args:
        x: the raw cell value; a dict is passed through unchanged.

    Returns:
        The parsed dictionary, or ``{}`` when the cell holds no dict.
    """
    if isinstance(x, dict):
        # Already parsed; makes the function safe to apply twice.
        return x
    if not isinstance(x, str):
        # NaN / None / numbers / lists carry no tag dictionary.
        return {}
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals such as "{[1]: 2}" (unhashable key).
        return {}
    # "[]" and any other non-dict literal fall through to the empty dict.
    return parsed if isinstance(parsed, dict) else {}

# Replace every raw `tags` cell with the value parse_tags returns for it
# (applied element-wise over the column).
df['tags'] = df['tags'].apply(parse_tags)

# --- 2. Handle price=0 ---
def clean_price(row):
    """Return the price to use for one row, treating unexplained zeros as missing.

    Args:
        row: a mapping-like row providing ``'price'`` and ``'tags'``.

    Returns:
        The original price when it is non-zero; ``0`` when the price is zero
        and one of the tag names equals "free to play" (case-insensitive);
        ``np.nan`` for any other zero price.
    """
    listed_price, game_tags = row['price'], row['tags']

    if listed_price != 0:
        return listed_price

    # A zero price only counts as genuine when the tags say so.
    if isinstance(game_tags, dict):
        for tag_name in game_tags:
            if tag_name.lower() == "free to play":
                return 0  # true free to play

    return np.nan  # treat as missing

# Row-wise pass (axis=1): clean_price is called with each row and its result
# is stored in the new `price_clean` column; the `price` column is not modified.
df['price_clean'] = df.apply(clean_price, axis=1)


# --- 4. Clean playtime columns ---
# A playtime of 0 may just mean "nobody played / no data", so each playtime
# column gets a "<name>_clean" companion in which zeros become NaN.
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
for col in playtime_cols:
    df[f"{col}_clean"] = df[col].replace(to_replace=0, value=np.nan)

# --- 5. Missingness exploration ---
# Report the frame's shape, then per-column NaN counts, then per-column zero
# counts for the numeric columns (zeros may be disguised missing values).
print("\n--- Dataset size ---")
print(df.shape)

print("\n--- Missing values (NaN only) ---")
print(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---")
num_cols = df.select_dtypes(include=[np.number]).columns
zero_counts = (
    df[num_cols]
    .eq(0)
    .sum()
    .sort_values(ascending=False)
)
print(zero_counts)

# --- 6. Basic EDA ---
# Summary statistics for the cleaned price, the cleaned lifetime average
# playtime, and the two review columns.
print("\n--- Price overview ---")
print(df['price_clean'].describe())

print("\n--- Playtime overview (hours) ---")
# convert mins → hours
print(df['average_playtime_forever_clean'].div(60).describe())

print("\n--- Reviews overview ---")
print(df.loc[:, ['num_reviews_total', 'pct_pos_total']].describe())

# --- 7. Visualization Examples ---
# Distribution of prices: 50-bin histogram of the non-missing cleaned prices
# with a logarithmic count axis.
df['price_clean'].dropna().plot.hist(
    bins=50,
    logy=True,
    title="Price distribution",
)
plt.xlabel("Price ($)")
plt.show()

# Playtime vs Reviews: scatter on log-log axes.
plt.scatter(
    df['average_playtime_forever_clean'] / 60,
    df['num_reviews_total'],
    alpha=0.3,
)
plt.title("Playtime vs Reviews")
plt.xlabel("Average Playtime (hours, log)")
plt.ylabel("Total Reviews (log)")
plt.xscale("log")
plt.yscale("log")
plt.show()

# --- 8. Language & Tag exploration ---
# Top supported languages
# Only list cells are counted: iterating an unparsed string would tally
# single characters, and iterating NaN would raise.
lang_counts = Counter()
for langs in df['supported_languages']:
    if isinstance(langs, list):
        lang_counts.update(langs)
print("\n--- Top supported languages ---")
print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

# Top tags
# Same isinstance guard clean_price uses: only dict cells contribute.
tag_counts = Counter()
for tags in df['tags']:
    if isinstance(tags, dict):
        # Pass the keys view, not the dict: Counter.update() on a mapping
        # would add the stored values instead of counting one per game.
        tag_counts.update(tags.keys())
print("\n--- Top tags ---")
print(pd.Series(tag_counts).sort_values(ascending=False).head(10))