import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv(r"D:\B_Data_Anlysist_Project\Python_Projects\01_Handling_Null_Value\walmart_sales_with_nulls_value_dataset.csv")
df.head()
df.shape
df.info()
df.isnull().sum()
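To prioritise which columns need attention, a quick sketch of missingness as a percentage (nothing beyond the isnull() counts above, just normalised and sorted):
null_pct = (df.isnull().mean() * 100).round(1).sort_values(ascending=False)
null_pct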
df_copy = df.copy()  # a true copy, so the raw df stays intact for the before/after comparison
df_copy.head()
df_copy["Order ID"].info()
df_copy["Order ID"].isnull().sum()
df_copy[df_copy["Order ID"].isnull()]
df_copy["Order ID"] = df_copy["Order ID"].fillna("Unknown")
df_copy[df_copy["Order ID"] == "Unknown"]
"Unknown" to retain row integrity without creating misleading data.df_copy["Order ID"].info()
df_copy["Customer ID"].nunique()
df_copy["Customer ID"].info()
df_copy[df_copy["Customer ID"].isnull()]
df_copy["Customer ID"] = df_copy["Customer ID"].fillna("Unknown")
df_copy[df_copy["Customer ID"] == "Unknown"]
"Unknown" to retain row integrity without creating misleading assumptions.df_copy["Customer ID"].info()
df_copy["Customer Name"].nunique()
df_copy["Customer Name"].info()
df_copy[df_copy["Customer Name"].isnull()]
df_copy["Customer Name"] = df_copy["Customer Name"].fillna("Unknown")
df_copy[df_copy["Customer Name"] == "Unknown"]
df_copy["Customer Name"].info()
df_copy[["Order ID", "Customer ID", "Customer Name"]].info()
df_date = df_copy.copy()  # checkpoint copy for the date-handling stage
df_date.head()
df_date["Order Date"] = pd.to_datetime(df_date["Order Date"], dayfirst= True)
df_date.head()
df_date["Year"] = df_date["Order Date"].dt.year
df_date.head()
df_date["Month"] = df_date["Order Date"].dt.month
df_date.head()
df_date[(df_date["Year"] == 2023) & (df_date["Month"] == 7)].count()
df_date[(df_date["Year"] == 2023) & (df_date["Month"] == 8)].count()
df_date[(df_date["Year"] == 2023) & (df_date["Month"] == 9)].count()
df_date[(df_date["Year"] == 2023) & (df_date["Month"] == 10)].count()
df_date[(df_date["Year"] == 2023) & (df_date["Month"] == 11)].count()
df_date[(df_date["Year"] == 2023) & (df_date["Month"] == 12)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 1)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 2)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 3)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 4)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 5)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 6)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 7)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 8)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 9)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 10)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 11)].count()
df_date[(df_date["Year"] == 2024) & (df_date["Month"] == 12)].count()
df_date[(df_date["Year"] == 2025) & (df_date["Month"] == 1)].count()
df_date[(df_date["Year"] == 2025) & (df_date["Month"] == 2)].count()
df_date[(df_date["Year"] == 2025) & (df_date["Month"] == 3)].count()
df_date[(df_date["Year"] == 2025) & (df_date["Month"] == 4)].count()
df_date[(df_date["Year"] == 2025) & (df_date["Month"] == 5)].count()
df_date[(df_date["Year"] == 2025) & (df_date["Month"] == 6)].count()
df_per = df_date.groupby("Year").agg({"Month" : "count"})/df_date["Year"].count()*100
round(df_per)
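The same yearly share can be read directly from value_counts (a sketch equivalent to df_per above; NaN years are excluded either way):
(df_date["Year"].value_counts(normalize=True) * 100).round()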
df_date[df_date["Order Date"].isnull()]
df_date.at[6, "Order Date"] = "01-01-2024"
df_date.at[89, "Order Date"] = "20-04-2024"
df_date.at[10, "Order Date"] = "15-02-2024"
df_date.at[74, "Order Date"] = "12-11-2024"
df_date.at[12, "Order Date"] = "16-12-2024"
df_date.at[45, "Order Date"] = "11-04-2024"
df_date.at[43, "Order Date"] = "23-09-2024"
df_date.at[62, "Order Date"] = "01-10-2024"
df_date.at[19, "Order Date"] = "30-08-2023"
df_date.at[26, "Order Date"] = "13-07-2023"
df_date.at[36, "Order Date"] = "28-10-2023"
df_date.at[48, "Order Date"] = "01-08-2023"
df_date.at[53, "Order Date"] = "24-02-2025"
df_date.at[57, "Order Date"] = "15-05-2025"
df_date.at[99, "Order Date"] = "23-01-2025"
df_date.loc[[6,10,12,19,26,36,43,45,48,53,57,62,74,89,99]]  # label-based lookup, matching the .at assignments
df_date.drop("Year", axis= 1, inplace= True)
df_date.drop("Month", axis= 1, inplace= True)
df_date.head()
"Order Date" from object to datetime64[ns] using:df_date["Order Date"] = pd.to_datetime(df_date["Order Date"], dayfirst= True)Year and Month to analyze time-based distribution:df_date["Year"] = df_date["Order Date"].dt.yeardf_date["Month"] = df_date["Order Date"].dt.monthdf_date[(df_date["Year"] == 2025) & (df_date["Month"] == 1)].count()df_date[(df_date["Year"] == 2025) & (df_date["Month"] == 1)].count()df_date.groupby("Year").agg({"Month" : "count"})/df_date["Year"].count()*100df_date.at[99, "Order Date"] = "2025-01-23"df_date["Order Date"].info()
df_cat = df_date.copy()  # checkpoint copy for the categorical-column stage
df_cat.head()
df_cat["City"].info()
df_cat["City"].nunique()
df_cat["City"].isnull().sum()
df_cat[df_cat["City"].isnull()]
df_cat["City"] = df_cat["City"].fillna("Unknown")
df_cat[df_cat["City"] == "Unknown"]
Inspecting the City column, all 88 filled values are unique: no city name repeats anywhere in the dataset. Because of this uniqueness, there is no meaningful way to relate or group the missing values with other categorical columns such as Region or Category, so filling the 12 missing city values with "Unknown" is the most appropriate and logical choice; it preserves data integrity without distorting analytical results.
df_cat["City"].info()
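A quick programmatic check of that uniqueness claim (a sketch): among the originally filled cities, the highest repeat count should be 1.
filled_cities = df_cat.loc[df_cat["City"] != "Unknown", "City"]
filled_cities.value_counts().max() == 1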
df_cat["Region"].info()
df_cat["Region"].isnull().sum()
df_cat["Region"].unique()
df_cat.groupby("Region").agg({"Region" : "count"})
df_cat[df_cat["Region"].isnull()]
df_cat.drop(index= 64, inplace= True)
df_cat[df_cat["Region"].isnull()]
df_cat.at[35,"Region"] = "West"
df_cat.at[54,"Region"] = "West"
df_cat.at[84,"Region"] = "South"
df_cat.loc[[35, 54, 84]]  # label-based lookup; the .at fills used labels, and positions shifted after dropping row 64
df_cat["Region"].info()
df_cat["Category"].info()
df_cat["Category"].isnull().sum()
df_cat["Category"].unique()
df_cat.groupby("Category").agg({"Region" : "count"})
df_cat[(df_cat["Category"] == "Furniture") & (df_cat["Region"] == "Central")].count()
df_cat[(df_cat["Category"] == "Furniture") & (df_cat["Region"] == "South")].count()
df_cat[(df_cat["Category"] == "Furniture") & (df_cat["Region"] == "East")].count()
df_cat[(df_cat["Category"] == "Furniture") & (df_cat["Region"] == "West")].count()
df_cat[(df_cat["Category"] == "Office Supplies") & (df_cat["Region"] == "Central")].count()
df_cat[(df_cat["Category"] == "Office Supplies") & (df_cat["Region"] == "South")].count()
df_cat[(df_cat["Category"] == "Office Supplies") & (df_cat["Region"] == "East")].count()
df_cat[(df_cat["Category"] == "Office Supplies") & (df_cat["Region"] == "West")].count()
df_cat[(df_cat["Category"] == "Technology") & (df_cat["Region"] == "Central")].count()
df_cat[(df_cat["Category"] == "Technology") & (df_cat["Region"] == "South")].count()
df_cat[(df_cat["Category"] == "Technology") & (df_cat["Region"] == "East")].count()
df_cat[(df_cat["Category"] == "Technology") & (df_cat["Region"] == "West")].count()
df_cat[df_cat["Category"].isnull()]
df_cat.at[11, "Category"] = "Furniture"
df_cat.at[3, "Category"] = "Furniture"
df_cat.at[49, "Category"] = "Furniture"
df_cat.at[55, "Category"] = "Office Supplies"
df_cat.at[53, "Category"] = "Office Supplies"
df_cat.at[67, "Category"] = "Technology"
df_cat.at[91, "Category"] = "Technology"
df_cat.loc[[3, 11, 49, 53, 55, 67, 91]]  # the labels filled above; positions shifted after the earlier row drop
The Category column (which includes Furniture, Office Supplies, and Technology) is tightly linked with the Region column, so imputing a missing Category should take into account the region in which the sale occurred. I first used groupby() to check how many times each category appears in the dataset, then cross-checked Category against Region to identify patterns, and finally assigned df_cat.at[index, "Category"] = ... for each missing row based on its region. In short, the missing Category values were filled using both frequency analysis and the region-category relationship.
df_cat["Category"].info()
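The manual region-based assignments generalise to a mode lookup; a sketch of that scalable form (a no-op at this point, since Category is already filled):
region_mode = df_cat.groupby("Region")["Category"].agg(lambda s: s.mode().iat[0])
mask = df_cat["Category"].isnull()
df_cat.loc[mask, "Category"] = df_cat.loc[mask, "Region"].map(region_mode)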
df_cat[["City","Region","Category"]].info()
df_num = df_cat.copy()  # checkpoint copy for the numeric-column stage
df_num.head()
df_num["Quantity"].info()
df_num["Quantity"].count()
df_num["Quantity"].isnull().sum()
df_num["Quantity"].max()
df_num["Quantity"].min()
df_num["Quantity"].mean()
df_num["Quantity"].sum()
df_num[df_num["Quantity"].isnull()]
df_num[(df_num["Category"] == "Office Supplies") & (df_num["Region"] == "Central") & (df_num["Order Date"] >= "2024-12-1")]
df_num.at[24, "Quantity"] = 3
df_num_31 = df_num[(df_num["Category"] == "Office Supplies") & (df_num["Region"] == "East") & (df_num["Order Date"] >= "2024-1-1") & (df_num["Order Date"] <= "2024-12-31")]
df_num_31
df_num.at[31, "Quantity"] = df_num_31["Quantity"].mean()
df_num_39 = df_num[(df_num["Category"] == "Furniture") & (df_num["Region"] == "South") & (df_num["Order Date"] >= "2025-1-1")]
df_num_39
df_num.at[39, "Quantity"] = df_num_39["Quantity"].mean()
df_num_40 = df_num[(df_num["Category"] == "Furniture") & (df_num["Region"] == "West") & (df_num["Order Date"] >= "2024-5-1") & (df_num["Order Date"] <= "2024-8-1")]
df_num_40
df_num.at[40, "Quantity"] = df_num_40["Quantity"].mean()
df_num[(df_num["Category"] == "Technology") & (df_num["Region"] == "Central") & (df_num["Order Date"] >= "2025-1-1") & (df_num["Order Date"] <= "2025-3-1")]
df_num.at[78, "Quantity"] = 1
df_num.drop(index= 96, inplace= True)
df_num.loc[[24, 31, 39, 40, 78]]  # label-based lookup for the filled rows; positions shifted after the row drops
df_num["Quantity"].mean()
df_num["Quantity"].sum()
Profiled the Quantity column using info() and count() to identify null values and min(), max(), mean(), and sum() to understand the data distribution, then isolated the missing rows with df[df["Quantity"].isnull()]. For each missing value I filtered on Category + Region + a date range and used either the group mean or a manual assignment (like 3 and 1) when the matching group was too small for accurate averaging, which is a practical and reasonable decision. Index 96 was dropped as it lacked sufficient reference values across the other columns.
df_num["Quantity"].info()
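The repeated filter-then-mean pattern above can be captured in one helper (a sketch; contextual_mean is a hypothetical name, not part of the original notebook):
def contextual_mean(frame, category, region, start, end, column):
    # Mean of `column` over rows sharing the same Category, Region and date window.
    subset = frame[(frame["Category"] == category)
                   & (frame["Region"] == region)
                   & (frame["Order Date"].between(start, end))]
    return subset[column].mean()

contextual_mean(df_num, "Office Supplies", "East", "2024-01-01", "2024-12-31", "Quantity")  # the group mean used for index 31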
df_num["Sales"].info()
df_num["Sales"].count()
df_num["Sales"].isnull().sum()
df_num["Sales"].min()
df_num["Sales"].max()
df_num["Sales"].mean()
df_num["Sales"].sum()
df_num[df_num["Sales"].isnull()]
df_num_16 = df_num[(df_num["Category"] == "Furniture") & (df_num["Region"] == "Central") & (df_num["Order Date"] >= "2024-1-1") & (df_num["Order Date"] <= "2024-2-28")]
df_num_16
df_num.at[16, "Sales"] = df_num_16["Sales"].mean()
df_num_30 = df_num[(df_num["Category"] == "Furniture") & (df_num["Region"] == "East") & (df_num["Order Date"] >= "2025-1-1") & (df_num["Order Date"] <= "2025-5-28")]
df_num_30
df_num.at[30, "Sales"] = df_num_30["Sales"].mean()
df_num_52 = df_num[(df_num["Category"] == "Furniture") & (df_num["Region"] == "West") & (df_num["Order Date"] >= "2024-3-1") & (df_num["Order Date"] <= "2024-5-30")]
df_num_52
df_num.at[52, "Sales"] = df_num_52["Sales"].mean()
df_num_33 = df_num[(df_num["Category"] == "Technology") & (df_num["Region"] == "West") & (df_num["Order Date"] >= "2024-8-1")]
df_num_33
df_num.at[33, "Sales"] = df_num_33["Sales"].mean()
df_num_61 = df_num[(df_num["Category"] == "Technology") & (df_num["Region"] == "South")]
df_num_61
df_num.drop(index= 61, inplace= True)
df_num_94 = df_num[(df_num["Category"] == "Technology") & (df_num["Region"] == "Central") & (df_num["Order Date"] >= "2023-9-1") & (df_num["Order Date"] <= "2023-12-30")]
df_num_94
df_num.at[94, "Sales"] = 511.76
df_num[(df_num["Category"] == "Office Supplies") & (df_num["Region"] == "West") & (df_num["Order Date"] >= "2024-12-1") & ((df_num["Order Date"] <= "2024-12-30"))]
df_num.at[46, "Sales"] = 830
df_num[(df_num["Category"] == "Office Supplies") & (df_num["Region"] == "Central") & (df_num["Order Date"] >= "2025-1-1")]
df_num.drop(index= 42, inplace= True)
df_num.drop(index= 51, inplace= True)
df_num.loc[[16, 30, 33, 46, 52, 94]]  # label-based lookup for the filled rows (label 94, not position 92)
df_num["Sales"].max()
df_num["Sales"].min()
df_num["Sales"].count()
df_num["Sales"].sum()
df_num["Sales"].mean()
Profiled the Sales column with info(), count(), isnull().sum(), min(), max(), mean(), and sum() to understand the data distribution and missing values. Using df[df["Sales"].isnull()], I isolated the rows with missing Sales and reviewed their associated Category (e.g., Furniture, Technology), Region (e.g., Central, West), and Order Date windows for contextual clues, then used .at[] to assign the group mean to the correct index, drop(index=..., inplace=True) where no reliable reference group existed, and a manual value such as df.at[94, "Sales"] = 511.76 when the reference group was very small. A final check with info() confirms all null values have been handled and the column is ready for further analysis: the missing Sales values were handled using contextual imputation grounded in similar data behavior (Category + Region + Order Date), not random guesswork.
df_num["Sales"].info()
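The same hypothetical contextual_mean helper covers the Sales fills, e.g. the reference window used for index 16 (note that row 16 is already filled here, so the result can differ slightly from the original imputation):
contextual_mean(df_num, "Furniture", "Central", "2024-01-01", "2024-02-28", "Sales")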
df_num["Profit"].info()
df_num["Profit"].count()
df_num["Profit"].isnull().sum()
df_num["Profit"].max()
df_num["Profit"].min()
df_num["Profit"].sum()
df_num["Profit"].mean()
df_num[df_num["Profit"].isnull()]
df_num["Profit"].fillna(0.0, inplace= True)
df_num.loc[[0, 1, 10, 34, 43, 53, 65, 71, 72, 76, 87]]  # label-based lookup for the rows whose Profit was null
df["Profit"].fillna(0.0, inplace=True)df_num[["Quantity", "Sales","Profit"]].info()
# Build the "before" chart inputs from the untouched raw df:
# non-null and null counts per column.
df_column = df.columns
df_count = df.count().values  # non-null values per column
isnull_count = df.isnull().sum().values  # null values per column
df_count
isnull_count

fig, ax = plt.subplots(figsize= (12,7))
ax.bar(df_column, df_count, color= "green")
ax.bar(df_column, isnull_count, bottom= df_count, color= "red")
ax.set_xlabel("Columns")
ax.set_ylabel("Values_Count")
plt.title("Before Null Value Handling: Green= Filled_value vs Red= Null_Value")
plt.xticks(rotation= 90)
plt.tight_layout()
plt.show()
df.info()
df_handled_missing_values = df_num.copy()  # final cleaned frame
df_handled_missing_values.head()
df_handled_missing_values.reset_index(drop= True, inplace= True)
non_null_counts = df_handled_missing_values.notnull().sum().reset_index()
non_null_counts.columns = ['Column', 'Non-Null Count']
plt.figure(figsize=(12, 6))
sns.barplot(data=non_null_counts, x='Column', y='Non-Null Count')
plt.title('After Handling Null Values Columns')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

df_handled_missing_values.info()
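A final guard worth adding (a sketch): confirm programmatically that no nulls remain anywhere in the cleaned frame.
assert df_handled_missing_values.isnull().sum().sum() == 0, "unexpected nulls remain"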
"Unknown" since all values are unique identifiers."Unknown" due to uniqueness of values."Unknown" since all names are unique and avoid misleading results."Unknown" β cities are all unique.0.0 to avoid misleading results.
