fix: don't remove rows that have NaN in 2-week playtimes (too many NaNs) (ffabdda0) · Commits · Diana Valková / Visualization Project

exploratory_data_analysis.py

+34 −17

Original line number	Diff line number	Diff line
		@@ -72,22 +72,39 @@ print((df['average_playtime_forever'] / 60).describe()) # convert mins → hour
		print("\n--- Reviews overview ---")
		print(df[['num_reviews_total', 'pct_pos_total']].describe())

		# --- 6. REMOVE ROWS WITH ANY NaN VALUES ---
		# --- 6. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS ---
		print("\n" + "="*60)
		print("REMOVING ROWS WITH ANY NaN VALUES")
		print("REMOVING ROWS WITH NaN IN KEY COLUMNS")
		print("="*60)

		print(f"Original dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns")
		print(f"Rows with any NaN: {df.isnull().any(axis=1).sum():,}")

		# Remove all rows that have at least one NaN value
		df_complete = df.dropna()
		print(f"Complete dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns")
		# Key columns we need for analysis
		key_columns = ['price', 'average_playtime_forever', 'median_playtime_forever']

		# Check missing values in key columns
		print(f"\nMissing values in key columns:")
		for col in key_columns:
		missing_count = df[col].isnull().sum()
		missing_pct = (missing_count / len(df)) * 100
		print(f" {col}: {missing_count:,} ({missing_pct:.1f}%)")

		# Remove rows that have NaN in any of the key columns
		df_complete = df.dropna(subset=key_columns)
		print(f"\nFiltered dataset size: {df_complete.shape[0]:,} rows, {df_complete.shape[1]} columns")
		print(f"Data retention: {len(df_complete)/len(df)*100:.1f}%")

		# --- 7. ANALYSIS ON COMPLETE DATASET ---
		# Show how many rows still have NaN in 2-week columns (this is fine)
		two_week_cols = ['average_playtime_2weeks', 'median_playtime_2weeks']
		for col in two_week_cols:
		if col in df_complete.columns:
		missing_2week = df_complete[col].isnull().sum()
		missing_2week_pct = (missing_2week / len(df_complete)) * 100
		print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%) - OK to keep")

		# --- 7. ANALYSIS ON FILTERED DATASET ---
		print("\n" + "="*60)
		print("ANALYSIS ON COMPLETE DATASET (NO NaN VALUES)")
		print("ANALYSIS ON FILTERED DATASET (KEY COLUMNS COMPLETE)")
		print("="*60)

		# Analysis for average_playtime_forever
		@@ -163,18 +180,18 @@ print(f"Games with >100 hours avg playtime: {(playtime_hours > 100).sum():,} ({(
		print(f"Games with >1000 reviews: {(df_complete['num_reviews_total'] > 1000).sum():,} ({(df_complete['num_reviews_total'] > 1000).mean()*100:.1f}%)")
		print(f"Games with >90% positive reviews: {(df_complete['pct_pos_total'] > 90).sum():,} ({(df_complete['pct_pos_total'] > 90).mean()*100:.1f}%)")

		# --- 8. Visualization Examples (Complete Dataset) ---
		# --- 8. Visualization Examples (Filtered Dataset) ---
		fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

		# Distribution of prices
		df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Complete Dataset)", ax=ax1)
		df_complete['price'].plot(kind='hist', bins=50, logy=True, title="Price Distribution (Filtered Dataset)", ax=ax1)
		ax1.set_xlabel("Price ($)")
		ax1.set_ylabel("Count (log scale)")

		# Distribution of average playtime (in hours, log scale)
		playtime_hours_plot = playtime_hours[playtime_hours > 0] # Remove any remaining zeros
		playtime_hours_plot.plot(kind='hist', bins=50, logx=True, logy=True,
		title="Average Playtime Distribution (Complete Dataset)", ax=ax2)
		title="Average Playtime Distribution (Filtered Dataset)", ax=ax2)
		ax2.set_xlabel("Average Playtime (hours, log scale)")
		ax2.set_ylabel("Count (log scale)")

		@@ -184,7 +201,7 @@ ax3.set_xscale("log")
		ax3.set_yscale("log")
		ax3.set_xlabel("Average Playtime (hours, log)")
		ax3.set_ylabel("Total Reviews (log)")
		ax3.set_title("Playtime vs Reviews (Complete Dataset)")
		ax3.set_title("Playtime vs Reviews (Filtered Dataset)")

		# Price vs Reviews (for paid games)
		paid_mask = df_complete['price'] > 0
		@@ -194,18 +211,18 @@ if paid_mask.sum() > 0:
		ax4.set_yscale("log")
		ax4.set_xlabel("Price ($, log)")
		ax4.set_ylabel("Total Reviews (log)")
		ax4.set_title("Price vs Reviews - Paid Games (Complete Dataset)")
		ax4.set_title("Price vs Reviews - Paid Games (Filtered Dataset)")

		plt.tight_layout()
		plt.show()

		# --- 9. Language & Tag exploration (Complete Dataset) ---
		# --- 9. Language & Tag exploration (Filtered Dataset) ---
		# Top supported languages
		lang_counts = Counter([lang for langs in df_complete['supported_languages'] for lang in langs])
		print("\n--- Top supported languages (Complete Dataset) ---")
		print("\n--- Top supported languages (Filtered Dataset) ---")
		print(pd.Series(lang_counts).sort_values(ascending=False).head(10))

		# Top tags
		tag_counts = Counter([tag for tags in df_complete['tags'] for tag in tags.keys()])
		print("\n--- Top tags (Complete Dataset) ---")
		print("\n--- Top tags (Filtered Dataset) ---")
		print(pd.Series(tag_counts).sort_values(ascending=False).head(10))