"""Exploratory data analysis of the games CSV export.

Loads ``./data/games_03_2025_reduced.csv``, parses the stringified
``supported_languages`` / ``tags`` columns, derives ``*_clean`` columns in
which ambiguous zeros are replaced by NaN, prints summary tables and shows
two plots.
"""
import ast
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def parse_languages(x):
    """Parse one ``supported_languages`` cell into a list.

    Args:
        x: Raw cell value. Expected to be a string such as
            ``"['English', 'French']"``; may also be NaN or an
            already-parsed list.

    Returns:
        The parsed list, or ``[]`` when the value is missing, is not a
        string starting with ``[``, cannot be parsed as a Python literal,
        or does not evaluate to a list.
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str) or not x.startswith('['):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # Malformed cell: treat like a missing value instead of aborting
        # the whole run (same policy as parse_tags).
        return []
    return parsed if isinstance(parsed, list) else []


def parse_tags(x):
    """Parse one ``tags`` cell into a dict.

    Args:
        x: Raw cell value. Expected to be a dictionary-like string such as
            ``"{'tag': count}"``; may also be NaN, the empty-list string
            ``"[]"``, or an already-parsed dict.

    Returns:
        The parsed dict, or ``{}`` when the value is missing, cannot be
        parsed as a Python literal, or does not evaluate to a dict.
    """
    if isinstance(x, dict):
        return x
    if not isinstance(x, str):
        # NaN and any other non-string cell carry no tag information.
        return {}
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals with unhashable dict keys.
        return {}
    # "[]" and any other non-dict literal must not leak through: the tag
    # counting further down iterates the value as a dict.
    return parsed if isinstance(parsed, dict) else {}


def clean_price(row):
    """Return the price of ``row`` with ambiguous zeros mapped to NaN.

    Args:
        row: Mapping-like row providing ``'price'`` and ``'tags'``.

    Returns:
        The original price when it is non-zero (NaN passes through);
        ``0`` when the price is zero and the tags contain "free to play"
        (case-insensitive); ``np.nan`` for any other zero price.
    """
    price = row['price']
    tags = row['tags']
    if price != 0:
        return price
    # str() guards against non-string tag keys coming out of literal_eval.
    if isinstance(tags, dict) and any(
        str(t).lower() == "free to play" for t in tags
    ):
        return 0  # true free to play
    return np.nan  # treat as missing


# Load dataset (example CSV)
df = pd.read_csv("./data/games_03_2025_reduced.csv")

# --- 1. Parse JSON/array columns ---
# supported_languages might be a string like "['English', 'French']".
# Applied unconditionally so that a non-object column (e.g. all NaN) also
# ends up holding lists for the language counting below.
df['supported_languages'] = df['supported_languages'].apply(parse_languages)
df['tags'] = df['tags'].apply(parse_tags)

# --- 2. Handle price=0 ---
df['price_clean'] = df.apply(clean_price, axis=1)

# --- 3. Clean playtime columns ---
# 0 playtime might mean missing if no one played / no data
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
for col in playtime_cols:
    df[col + "_clean"] = df[col].replace(0, np.nan)

# --- 4. Missingness exploration ---
print("\n--- Dataset size ---")
print(df.shape)

print("\n--- Missing values (NaN only) ---")
print(df.isna().sum().sort_values(ascending=False))

print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---")
num_cols = df.select_dtypes(include=[np.number]).columns
zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False)
print(zero_counts)

# --- 5. Basic EDA ---
print("\n--- Price overview ---")
print(df['price_clean'].describe())

print("\n--- Playtime overview (hours) ---")
print((df['average_playtime_forever_clean'] / 60).describe())  # convert mins → hours

print("\n--- Reviews overview ---")
print(df[['num_reviews_total', 'pct_pos_total']].describe())

# --- 6. Visualization Examples ---
# Distribution of prices
df['price_clean'].dropna().plot(
    kind='hist', bins=50, logy=True, title="Price distribution"
)
plt.xlabel("Price ($)")
plt.show()

# Playtime vs Reviews
plt.scatter(
    df['average_playtime_forever_clean'] / 60,
    df['num_reviews_total'],
    alpha=0.3,
)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Average Playtime (hours, log)")
plt.ylabel("Total Reviews (log)")
plt.title("Playtime vs Reviews")
plt.show()

# --- 7. Language & Tag exploration ---
# Top supported languages
lang_counts = Counter(
    lang for langs in df['supported_languages'] for lang in langs
)
print("\n--- Top supported languages ---")
print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

# Top tags
tag_counts = Counter(tag for tags in df['tags'] for tag in tags)
print("\n--- Top tags ---")
print(pd.Series(tag_counts).sort_values(ascending=False).head(10))
"""Exploratory data analysis of the games CSV export.

Loads ``./data/games_03_2025_reduced.csv``, parses the stringified
``supported_languages`` / ``tags`` columns, derives ``*_clean`` columns in
which ambiguous zeros are replaced by NaN, prints summary tables and shows
two plots.
"""
import ast
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def parse_languages(x):
    """Parse one ``supported_languages`` cell into a list.

    Args:
        x: Raw cell value. Expected to be a string such as
            ``"['English', 'French']"``; may also be NaN or an
            already-parsed list.

    Returns:
        The parsed list, or ``[]`` when the value is missing, is not a
        string starting with ``[``, cannot be parsed as a Python literal,
        or does not evaluate to a list.
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str) or not x.startswith('['):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # Malformed cell: treat like a missing value instead of aborting
        # the whole run (same policy as parse_tags).
        return []
    return parsed if isinstance(parsed, list) else []


def parse_tags(x):
    """Parse one ``tags`` cell into a dict.

    Args:
        x: Raw cell value. Expected to be a dictionary-like string such as
            ``"{'tag': count}"``; may also be NaN, the empty-list string
            ``"[]"``, or an already-parsed dict.

    Returns:
        The parsed dict, or ``{}`` when the value is missing, cannot be
        parsed as a Python literal, or does not evaluate to a dict.
    """
    if isinstance(x, dict):
        return x
    if not isinstance(x, str):
        # NaN and any other non-string cell carry no tag information.
        return {}
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals with unhashable dict keys.
        return {}
    # "[]" and any other non-dict literal must not leak through: the tag
    # counting further down iterates the value as a dict.
    return parsed if isinstance(parsed, dict) else {}


def clean_price(row):
    """Return the price of ``row`` with ambiguous zeros mapped to NaN.

    Args:
        row: Mapping-like row providing ``'price'`` and ``'tags'``.

    Returns:
        The original price when it is non-zero (NaN passes through);
        ``0`` when the price is zero and the tags contain "free to play"
        (case-insensitive); ``np.nan`` for any other zero price.
    """
    price = row['price']
    tags = row['tags']
    if price != 0:
        return price
    # str() guards against non-string tag keys coming out of literal_eval.
    if isinstance(tags, dict) and any(
        str(t).lower() == "free to play" for t in tags
    ):
        return 0  # true free to play
    return np.nan  # treat as missing


# Load dataset (example CSV)
df = pd.read_csv("./data/games_03_2025_reduced.csv")

# --- 1. Parse JSON/array columns ---
# supported_languages might be a string like "['English', 'French']".
# Applied unconditionally so that a non-object column (e.g. all NaN) also
# ends up holding lists for the language counting below.
df['supported_languages'] = df['supported_languages'].apply(parse_languages)
df['tags'] = df['tags'].apply(parse_tags)

# --- 2. Handle price=0 ---
df['price_clean'] = df.apply(clean_price, axis=1)

# --- 3. Clean playtime columns ---
# 0 playtime might mean missing if no one played / no data
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
for col in playtime_cols:
    df[col + "_clean"] = df[col].replace(0, np.nan)

# --- 4. Missingness exploration ---
print("\n--- Dataset size ---")
print(df.shape)

print("\n--- Missing values (NaN only) ---")
print(df.isna().sum().sort_values(ascending=False))

print("\n--- Potential pseudo-missing values (zeros in numeric columns) ---")
num_cols = df.select_dtypes(include=[np.number]).columns
zero_counts = (df[num_cols] == 0).sum().sort_values(ascending=False)
print(zero_counts)

# --- 5. Basic EDA ---
print("\n--- Price overview ---")
print(df['price_clean'].describe())

print("\n--- Playtime overview (hours) ---")
print((df['average_playtime_forever_clean'] / 60).describe())  # convert mins → hours

print("\n--- Reviews overview ---")
print(df[['num_reviews_total', 'pct_pos_total']].describe())

# --- 6. Visualization Examples ---
# Distribution of prices
df['price_clean'].dropna().plot(
    kind='hist', bins=50, logy=True, title="Price distribution"
)
plt.xlabel("Price ($)")
plt.show()

# Playtime vs Reviews
plt.scatter(
    df['average_playtime_forever_clean'] / 60,
    df['num_reviews_total'],
    alpha=0.3,
)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Average Playtime (hours, log)")
plt.ylabel("Total Reviews (log)")
plt.title("Playtime vs Reviews")
plt.show()

# --- 7. Language & Tag exploration ---
# Top supported languages
lang_counts = Counter(
    lang for langs in df['supported_languages'] for lang in langs
)
print("\n--- Top supported languages ---")
print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

# Top tags
tag_counts = Counter(tag for tags in df['tags'] for tag in tags)
print("\n--- Top tags ---")
print(pd.Series(tag_counts).sort_values(ascending=False).head(10))