Commit ffabdda0 authored by danielczinege's avatar danielczinege
Browse files

fix: don't remove rows that have NaN in 2-week playtimes (too many NaNs)

parent eaed9908
Loading
Loading
Loading
Loading
+34 −17
Original line number Diff line number Diff line
@@ -72,22 +72,39 @@ print((df['average_playtime_forever'] / 60).describe()) # convert mins → hour
# NOTE(review): this chunk is a rendered git diff (see the "@@ -72,22 +72,39 @@"
# hunk header above); pre-change and post-change lines appear interleaved
# without +/- markers, so several statements occur in both variants below.
print("\n--- Reviews overview ---")
print(df[['num_reviews_total', 'pct_pos_total']].describe())

# (old/new variants of the same section-header comment — both rendered by the diff)
# --- 6. REMOVE ROWS WITH ANY NaN VALUES ---
# --- 6. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS ---
print("\n" + "="*60)
# NOTE(review): old and new variants of the same banner line; if executed
# as-is, both would print.
print("REMOVING ROWS WITH ANY NaN VALUES")
print("REMOVING ROWS WITH NaN IN KEY COLUMNS")
print("="*60)

print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns")
print(f"Rows with any NaN: {df.isnull().any(axis=1).sum():,}")

# Remove all rows that have at least one NaN value
# NOTE(review): pre-change code — dropna() with no subset dropped rows with a
# NaN in ANY column; superseded by the subset-based dropna further down (the
# later assignment to df_complete wins if both lines run).
df_complete = df.dropna()
print(f"Complete dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns")
# Key columns we need for analysis
key_columns = ['price', 'average_playtime_forever', 'median_playtime_forever']

# Check missing values in key columns
# NOTE(review): f-string below has no placeholders (lint F541); harmless.
print(f"\nMissing values in key columns:")
for col in key_columns:
    missing_count = df[col].isnull().sum()
    missing_pct = (missing_count / len(df)) * 100
    print(f"  {col}: {missing_count:,} ({missing_pct:.1f}%)")

# Remove rows that have NaN in any of the key columns
df_complete = df.dropna(subset=key_columns)
print(f"\nFiltered dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns")
print(f"Data retention: {len(df_complete)/len(df)*100:.1f}%")

# --- 7. ANALYSIS ON COMPLETE DATASET ---
# Show how many rows still have NaN in 2-week columns (this is fine)
two_week_cols = ['average_playtime_2weeks', 'median_playtime_2weeks']
for col in two_week_cols:
    if col in df_complete.columns:
        missing_2week = df_complete[col].isnull().sum()
        missing_2week_pct = (missing_2week / len(df_complete)) * 100
        print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%) - OK to keep")

# --- 7. ANALYSIS ON FILTERED DATASET ---
print("\n" + "="*60)
# (old/new variants of the same banner line)
print("ANALYSIS ON COMPLETE DATASET (NO NaN VALUES)")
print("ANALYSIS ON FILTERED DATASET (KEY COLUMNS COMPLETE)")
print("="*60)

# Analysis for average_playtime_forever
@@ -163,18 +180,18 @@ print(f"Games with >100 hours avg playtime: {(playtime_hours > 100).sum():,} ({(
# NOTE(review): rendered diff — old/new variants interleaved (no +/- markers).
print(f"Games with >1000 reviews: {(df_complete['num_reviews_total'] > 1000).sum():,} ({(df_complete['num_reviews_total'] > 1000).mean()*100:.1f}%)")
print(f"Games with >90% positive reviews: {(df_complete['pct_pos_total'] > 90).sum():,} ({(df_complete['pct_pos_total'] > 90).mean()*100:.1f}%)")

# --- 8. Visualization Examples (Complete Dataset) ---
# --- 8. Visualization Examples (Filtered Dataset) ---
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Distribution of prices
# NOTE(review): old and new variants of the same plot call (only the title changed).
df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Complete Dataset)", ax=ax1)
df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Filtered Dataset)", ax=ax1)
ax1.set_xlabel("Price ($)")
ax1.set_ylabel("Count (log scale)")

# Distribution of average playtime (in hours, log scale)
playtime_hours_plot = playtime_hours[playtime_hours > 0]  # Remove any remaining zeros
# NOTE(review): the two continuation lines below are the old and new variants
# of the same `title=` argument; only one belongs in the final source — as
# rendered here the span is not valid Python.
playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True, 
                        title="Average Playtime Distribution (Complete Dataset)", ax=ax2)
                        title="Average Playtime Distribution (Filtered Dataset)", ax=ax2)
ax2.set_xlabel("Average Playtime (hours, log scale)")
ax2.set_ylabel("Count (log scale)")
@@ -184,7 +201,7 @@ ax3.set_xscale("log")
# NOTE(review): rendered diff fragment — the two set_title calls are the old
# and new variants of the same line; the second call would override the first.
ax3.set_yscale("log")
ax3.set_xlabel("Average Playtime (hours, log)")
ax3.set_ylabel("Total Reviews (log)")
ax3.set_title("Playtime vs Reviews (Complete Dataset)")
ax3.set_title("Playtime vs Reviews (Filtered Dataset)")

# Price vs Reviews (for paid games)
paid_mask = df_complete['price'] > 0
@@ -194,18 +211,18 @@ if paid_mask.sum() > 0:
    ax4.set_yscale("log")
    ax4.set_xlabel("Price ($, log)")
    ax4.set_ylabel("Total Reviews (log)")
    ax4.set_title("Price vs Reviews - Paid Games (Complete Dataset)")
    ax4.set_title("Price vs Reviews - Paid Games (Filtered Dataset)")

plt.tight_layout()
plt.show()

# --- 9. Language & Tag exploration (Complete Dataset) ---
# --- 9. Language & Tag exploration (Filtered Dataset) ---
# Top supported languages
lang_counts = Counter([lang for langs in df_complete['supported_languages'] for lang in langs])
# NOTE(review): old/new variants of the same heading print (rendered diff);
# if run as-is, both headings would print.
print("\n--- Top supported languages (Complete Dataset) ---")
print("\n--- Top supported languages (Filtered Dataset) ---")
print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

# Top tags
tag_counts = Counter([tag for tags in df_complete['tags'] for tag in tags.keys()])
print("\n--- Top tags (Complete Dataset) ---")
print("\n--- Top tags (Filtered Dataset) ---")
print(pd.Series(tag_counts).sort_values(ascending=False).head(10))