feat: look for free to play in tags and also genres (788de70d) · Commits · Diana Valková / Visualization Project

exploratory_data_analysis.py

+19 −24

Original line number	Diff line number	Diff line
		@@ -262,16 +262,31 @@ print(zero_counts)

		# --- 7. Handle price=0 (BEFORE comprehensive summary) ---
		def _has_free_to_play_label(labels):
		    """Return True if any string in *labels* equals 'free to play', ignoring case."""
		    # Non-string entries (e.g. None/NaN placeholders) can never match and
		    # must not reach .lower(), which would raise AttributeError.
		    return any(
		        isinstance(label, str) and label.lower() == "free to play"
		        for label in labels
		    )


		def clean_price(row):
		    """
		    Return the cleaned price for a single row.

		    A price of 0 is kept only when 'Free to Play' (case-insensitive) is
		    present in either the row's tags or its parsed genres. Any other zero
		    price is treated as missing and becomes NaN. Non-zero prices are
		    returned unchanged.

		    Args:
		        row: Mapping-like row with 'price', 'tags' and 'genres_parsed'
		            entries (for example the Series passed by
		            ``df.apply(..., axis=1)``). Tags are inspected only when they
		            are a dict (its keys are the labels) and genres only when they
		            are a list.

		    Returns:
		        The original price, 0 for a genuine free-to-play title, or
		        ``np.nan`` for a zero price without a free-to-play label.
		    """
		    price = row['price']
		    tags = row['tags']
		    genres = row['genres_parsed']

		    if price != 0:
		        return price

		    # Iterating a dict yields its keys, which are the tag names.
		    is_free_in_tags = isinstance(tags, dict) and _has_free_to_play_label(tags)
		    is_free_in_genres = isinstance(genres, list) and _has_free_to_play_label(genres)

		    # Zero price without a free-to-play label is treated as missing data.
		    return 0 if is_free_in_tags or is_free_in_genres else np.nan

		# Overwrite the original price column with the cleaned values;
		# axis=1 makes apply() pass each row to clean_price.
		df['price'] = df.apply(clean_price, axis=1)

		# --- 8. Clean playtime columns (BEFORE comprehensive summary) ---
		@@ -312,26 +327,6 @@ for col, missing_count in parsed_missing.items():
		missing_pct = (missing_count / len(df)) * 100
		print(f"{col:35}: {missing_count:8,} ({missing_pct:5.1f}%)")

		# --- 10. Handle price=0 (from original script) ---
		def clean_price(row):
		    """
		    Return the cleaned price for a single row.

		    A price of 0 is kept only when 'Free to Play' (case-insensitive) is
		    present in the row's tags or, when available, its parsed genres. Any
		    other zero price is treated as missing and becomes NaN. Non-zero
		    prices are returned unchanged.

		    Args:
		        row: Mapping-like row with 'price' and 'tags' entries and an
		            optional 'genres_parsed' entry. Tags are inspected only when
		            they are a dict (its keys are the labels) and genres only
		            when they are a list.

		    Returns:
		        The original price, 0 for a genuine free-to-play title, or
		        ``np.nan`` for a zero price without a free-to-play label.
		    """
		    price = row['price']
		    tags = row['tags']
		    if price != 0:
		        return price

		    # .get keeps this function usable on rows that have no
		    # 'genres_parsed' entry, as the tag-only version was.
		    genres = row.get('genres_parsed')

		    labels = []
		    if isinstance(tags, dict):
		        labels.extend(tags)  # dict iteration yields the tag names
		    if isinstance(genres, list):
		        labels.extend(genres)

		    # Non-string labels (e.g. None/NaN) are skipped instead of crashing .lower().
		    is_free = any(
		        isinstance(label, str) and label.lower() == "free to play"
		        for label in labels
		    )
		    return 0 if is_free else np.nan  # unexplained zero price -> missing

		# Overwrite the original price column with the cleaned values;
		# axis=1 makes apply() pass each row to clean_price.
		df['price'] = df.apply(clean_price, axis=1)

		# --- 11. Clean playtime columns (from original script) ---
		# In each listed playtime column, replace 0 with NaN so that zero
		# playtime is handled as a missing value from here on.
		playtime_cols = ['average_playtime_forever', 'median_playtime_forever',
		'average_playtime_2weeks', 'median_playtime_2weeks']
		for col in playtime_cols:
		df[col] = df[col].replace(0, np.nan)

		# --- 12. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS ---
		print("\n" + "="*60)
		print("FILTERING DATASET FOR COMPLETE KEY COLUMNS")