Loading exploratory_data_analysis.py +4 −1 Original line number Diff line number Diff line Loading @@ -173,7 +173,7 @@ for col in two_week_cols: if col in df_complete.columns: missing_2week = df_complete[col].isnull().sum() missing_2week_pct = (missing_2week / len(df_complete)) * 100 print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%) - OK to keep") print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%)") # --- 7. ANALYSIS ON FILTERED DATASET --- print("\n" + "="*60) Loading Loading @@ -299,3 +299,6 @@ print(pd.Series(lang_counts).sort_values(ascending=False).head(10)) tag_counts = Counter([tag for tags in df_complete['tags'] for tag in tags.keys()]) print("\n--- Top tags (Filtered Dataset) ---") print(pd.Series(tag_counts).sort_values(ascending=False).head(10)) print("which columns we have:") print(df.columns) Loading
exploratory_data_analysis.py +4 −1 Original line number Diff line number Diff line Loading @@ -173,7 +173,7 @@ for col in two_week_cols: if col in df_complete.columns: missing_2week = df_complete[col].isnull().sum() missing_2week_pct = (missing_2week / len(df_complete)) * 100 print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%) - OK to keep") print(f"Remaining NaN in {col}: {missing_2week:,} ({missing_2week_pct:.1f}%)") # --- 7. ANALYSIS ON FILTERED DATASET --- print("\n" + "="*60) Loading Loading @@ -299,3 +299,6 @@ print(pd.Series(lang_counts).sort_values(ascending=False).head(10)) tag_counts = Counter([tag for tags in df_complete['tags'] for tag in tags.keys()]) print("\n--- Top tags (Filtered Dataset) ---") print(pd.Series(tag_counts).sort_values(ascending=False).head(10)) print("which columns we have:") print(df.columns)