# exploratory_data_analysis.py -- excerpt (sections 7-12), reconstructed from
# a flattened diff view.  Names such as ``df``, ``np``, ``zero_counts`` and
# ``parsed_missing`` are defined earlier in the file, outside this excerpt.

print(zero_counts)


# --- 7. Handle price=0 (BEFORE comprehensive summary) ---
def _has_free_to_play_label(labels):
    """Return True if any string in *labels* equals "free to play" (case-insensitive)."""
    # Non-string entries (None, NaN, numbers) are skipped instead of raising
    # AttributeError on ``.lower()``.
    return any(
        isinstance(label, str) and label.lower() == "free to play"
        for label in labels
    )


def clean_price(row):
    """Return the cleaned price for a single row.

    A price of 0 is kept only when the row is labelled "Free to Play"
    (case-insensitive) in either ``tags`` or ``genres_parsed``.  Any other
    zero price is treated as missing data and becomes NaN.  Non-zero prices
    (including NaN, which never compares equal to 0) are returned unchanged.

    Args:
        row: Mapping-like row (e.g. the Series passed by
            ``df.apply(..., axis=1)``) providing the keys ``'price'``,
            ``'tags'`` and ``'genres_parsed'``.

    Returns:
        The original price, ``0`` for a confirmed free-to-play row, or
        ``np.nan`` for a zero price that is not marked as free to play.
    """
    price = row['price']
    tags = row['tags']
    genres = row['genres_parsed']

    if price != 0:
        return price

    # Tags are only inspected when they are a dict (its keys are the labels);
    # genres only when they are a list.  Any other type counts as "not marked".
    is_free_in_tags = isinstance(tags, dict) and _has_free_to_play_label(tags)
    is_free_in_genres = isinstance(genres, list) and _has_free_to_play_label(genres)

    if is_free_in_tags or is_free_in_genres:
        return 0  # It's a genuinely free-to-play game

    # Price is 0 but the row is not marked as F2P: treat it as missing data.
    return np.nan


# Overwrite original price column with cleaned data.
# NOTE: this must run exactly once.  Re-applying an older, tags-only variant
# of clean_price later in the script would turn the zero prices of games
# marked free-to-play only in their genres back into NaN.
df['price'] = df.apply(clean_price, axis=1)

# --- 8. Clean playtime columns (BEFORE comprehensive summary) ---
# A playtime of 0 is treated as "no data" rather than a real measurement.
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
for col in playtime_cols:
    df[col] = df[col].replace(0, np.nan)

# ... (lines between the two visible hunks are not part of this excerpt) ...

# Per-column missing-value report; ``parsed_missing`` maps column name to
# its missing count.
total_rows = len(df)
for col, missing_count in parsed_missing.items():
    # Guard against an empty DataFrame so the report cannot divide by zero.
    missing_pct = (missing_count / total_rows) * 100 if total_rows else 0.0
    print(f"{col:35}: {missing_count:8,} ({missing_pct:5.1f}%)")

# --- 12. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS ---
print("\n" + "="*60)
print("FILTERING DATASET FOR COMPLETE KEY COLUMNS")
# exploratory_data_analysis.py -- excerpt (sections 7-12), reconstructed from
# a flattened diff view.  Names such as ``df``, ``np``, ``zero_counts`` and
# ``parsed_missing`` are defined earlier in the file, outside this excerpt.

print(zero_counts)


# --- 7. Handle price=0 (BEFORE comprehensive summary) ---
def _has_free_to_play_label(labels):
    """Return True if any string in *labels* equals "free to play" (case-insensitive)."""
    # Non-string entries (None, NaN, numbers) are skipped instead of raising
    # AttributeError on ``.lower()``.
    return any(
        isinstance(label, str) and label.lower() == "free to play"
        for label in labels
    )


def clean_price(row):
    """Return the cleaned price for a single row.

    A price of 0 is kept only when the row is labelled "Free to Play"
    (case-insensitive) in either ``tags`` or ``genres_parsed``.  Any other
    zero price is treated as missing data and becomes NaN.  Non-zero prices
    (including NaN, which never compares equal to 0) are returned unchanged.

    Args:
        row: Mapping-like row (e.g. the Series passed by
            ``df.apply(..., axis=1)``) providing the keys ``'price'``,
            ``'tags'`` and ``'genres_parsed'``.

    Returns:
        The original price, ``0`` for a confirmed free-to-play row, or
        ``np.nan`` for a zero price that is not marked as free to play.
    """
    price = row['price']
    tags = row['tags']
    genres = row['genres_parsed']

    if price != 0:
        return price

    # Tags are only inspected when they are a dict (its keys are the labels);
    # genres only when they are a list.  Any other type counts as "not marked".
    is_free_in_tags = isinstance(tags, dict) and _has_free_to_play_label(tags)
    is_free_in_genres = isinstance(genres, list) and _has_free_to_play_label(genres)

    if is_free_in_tags or is_free_in_genres:
        return 0  # It's a genuinely free-to-play game

    # Price is 0 but the row is not marked as F2P: treat it as missing data.
    return np.nan


# Overwrite original price column with cleaned data.
# NOTE: this must run exactly once.  Re-applying an older, tags-only variant
# of clean_price later in the script would turn the zero prices of games
# marked free-to-play only in their genres back into NaN.
df['price'] = df.apply(clean_price, axis=1)

# --- 8. Clean playtime columns (BEFORE comprehensive summary) ---
# A playtime of 0 is treated as "no data" rather than a real measurement.
playtime_cols = [
    'average_playtime_forever',
    'median_playtime_forever',
    'average_playtime_2weeks',
    'median_playtime_2weeks',
]
for col in playtime_cols:
    df[col] = df[col].replace(0, np.nan)

# ... (lines between the two visible hunks are not part of this excerpt) ...

# Per-column missing-value report; ``parsed_missing`` maps column name to
# its missing count.
total_rows = len(df)
for col, missing_count in parsed_missing.items():
    # Guard against an empty DataFrame so the report cannot divide by zero.
    missing_pct = (missing_count / total_rows) * 100 if total_rows else 0.0
    print(f"{col:35}: {missing_count:8,} ({missing_pct:5.1f}%)")

# --- 12. REMOVE ROWS WITH NaN VALUES IN KEY COLUMNS ---
print("\n" + "="*60)
print("FILTERING DATASET FOR COMPLETE KEY COLUMNS")