This project analyzes data from an e-commerce platform.

________________________________________

1. Objective

The objective of this project is to:

- Perform data analysis to uncover insights, including handling missing values, outliers, exploratory data analysis (EDA), regression modeling, correlation analysis, and statistical tests.
- Visualize the data using various charts and graphs.
- Use R as the primary tool for data analysis and visualization.

________________________________________

2. Data

The dataset includes the following variables:

Customer Details

- CustomerID: Unique identifier for each customer.
- Age: Age of the customer.
- Gender: Gender of the customer (Male/Female/Other).
- Location: City or region of the customer.
- MembershipStatus: Membership level (Basic, Premium, Gold).

Product Information

- ProductID: Unique identifier for each product.
- ProductCategory: Category of the product (e.g., Electronics, Clothing, Home Appliances).
- Price: Price of the product.
- Rating: Average customer rating of the product (1-5).

Transaction Information

- TransactionID: Unique identifier for each transaction.
- CustomerID: Links to the customer who made the transaction.
- ProductID: Links to the product purchased.
- Quantity: Number of units purchased.
- TotalAmount: Total amount spent in the transaction.
- PaymentMethod: Payment method used (Credit Card, Debit Card, UPI, Cash on Delivery).
- TransactionDate: Date of the transaction.

\newpage

##W/D

#Import ecommerce Data set

# Load the raw e-commerce transactions from the CSV in the working directory.
ecommerce_data <- read.csv("E-commerce_data.csv")

# Open the spreadsheet viewer only in interactive sessions; View() needs a
# display and can fail or block when the script is knitted or run via Rscript.
if (interactive()) {
  View(ecommerce_data)
}

# Print each column's type together with a preview of its first values.
str(ecommerce_data)
'data.frame':   300 obs. of  14 variables:
 $ TransactionID   : int  1 2 3 4 5 6 7 8 9 10 ...
 $ CustomerID      : int  207 253 110 256 274 52 191 165 18 169 ...
 $ ProductID       : int  14 15 19 1 2 15 10 5 5 17 ...
 $ Quantity        : int  5 1 5 3 5 2 1 5 4 2 ...
 $ PaymentMethod   : chr  "UPI" "Cash on Delivery" "Cash on Delivery" "Debit Card" ...
 $ TransactionDate : chr  "12/28/2023" "4/17/2023" "1/17/2023" "10/23/2023" ...
 $ ProductCategory : chr  "Clothing" "Electronics" "Clothing" "Home Appliances" ...
 $ Price           : num  33.4 389 145.9 215 325.5 ...
 $ Rating          : num  4.2 2.2 1.7 2.7 3.4 2.2 3.7 1.4 1.4 1.9 ...
 $ TotalAmount     : num  167 389 729 645 1627 ...
 $ Age             : int  50 19 37 50 24 50 40 37 25 53 ...
 $ Gender          : chr  "Male" "Female" "Female" "Male" ...
 $ Location        : chr  "Houston" "Chicago" "Houston" "Los Angeles" ...
 $ MembershipStatus: chr  "Basic" "Premium" "Basic" "Premium" ...

#DATA CLEANING AND PREPROCESSING #Handling Missing Values # Check for missing values

# Tally the NA entries column by column; yields one named count per variable.
vapply(ecommerce_data, function(column) sum(is.na(column)), numeric(1))
   TransactionID       CustomerID        ProductID         Quantity 
               0                0                0                0 
   PaymentMethod  TransactionDate  ProductCategory            Price 
               0                0                0                0 
          Rating      TotalAmount              Age           Gender 
               0                0                8                0 
        Location MembershipStatus 
               0                0 

Impute missing values for Age (using mean) and Gender (using mode)

library(dplyr)
library(tidyr)

# Impute missing customer attributes.
#   Age    : NA -> mean of the observed ages.
#   Gender : NA or blank string -> most frequent observed gender (the mode).
# read.csv() leaves empty character fields as "" rather than NA, so blanks are
# converted to NA first; otherwise the mode imputation never fires for them.
# NOTE(review): results recorded further down this report were produced before
# blanks were imputed and should be regenerated.
ecommerce_data <- ecommerce_data %>%
  mutate(
    Age = ifelse(is.na(Age), mean(Age, na.rm = TRUE), Age),
    Gender = replace(Gender, !is.na(Gender) & trimws(Gender) == "", NA),
    Gender = ifelse(
      is.na(Gender),
      # table() ignores NA, so the first name after sorting is the mode.
      names(sort(table(Gender), decreasing = TRUE))[1],
      Gender
    )
  )

#Handle Outliers # Detect outliers using the IQR method

#' Flag outliers in a numeric vector using IQR fences
#'
#' A value is flagged when it lies below `Q1 - k * IQR` or above
#' `Q3 + k * IQR`, where Q1/Q3 are the 25th/75th percentiles of `x`.
#'
#' @param x Numeric vector to screen.
#' @param k Fence multiplier applied to the interquartile range (default 1.5).
#' @param na.rm If `TRUE` (default), missing values are ignored when the
#'   quartiles are computed instead of raising an error.
#' @return An unnamed logical vector the same length as `x`: `TRUE` for values
#'   outside the fences, `FALSE` otherwise, and `NA` where `x` is `NA`.
outlier_detection <- function(x, k = 1.5, na.rm = TRUE) {
  # names = FALSE keeps the "25%"/"75%" labels from leaking into the result.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  # Named `spread` rather than `IQR` to avoid shadowing stats::IQR().
  spread <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - k * spread
  upper_bound <- quartiles[2] + k * spread
  x < lower_bound | x > upper_bound
}

Apply outlier detection to TotalAmount

# Store a logical flag per row marking TotalAmount values that fall outside
# the IQR fences computed by outlier_detection().
ecommerce_data[["Outlier"]] <- outlier_detection(ecommerce_data[["TotalAmount"]])

Remove outliers

# Keep only the rows that were not flagged as outliers.
ecommerce_data <- filter(ecommerce_data, !Outlier)

#EXPLORATORY DATA ANALYSIS (EDA) #Univariate Analysis # Box plot and histogram for Age

library(ggplot2)
# Horizontal box plot of Age (median, quartiles, and points beyond the
# whiskers). `binwidth` is a histogram argument that geom_boxplot() does not
# accept, and a box plot has no count axis, so the y label is dropped.
ggplot(ecommerce_data, aes(x = Age)) +
  geom_boxplot(fill = "blue", color = "black") +
  labs(title = "Distribution of Age", x = "Age", y = NULL)

library(ggplot2)
# Histogram of Age using bins of width 5.
ggplot(data = ecommerce_data, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "blue") +
  ggtitle("Distribution of Age") +
  xlab("Age") +
  ylab("Count")

Pie chart for Gender

# Tabulate how many rows carry each Gender value, then draw the shares as a
# base-graphics pie chart labelled with the category names.
gender_counts <- table(ecommerce_data[["Gender"]])
pie(
  x = gender_counts,
  labels = names(gender_counts),
  main = "Proportion of Customers by Gender"
)

#Bivariate Analysis # Scatter plot for Price vs. TotalAmount

# Scatter plot of Price (x) against TotalAmount (y); points are drawn
# semi-transparent so overlapping observations remain visible.
ggplot(data = ecommerce_data, mapping = aes(x = Price, y = TotalAmount)) +
  geom_point(colour = "red", alpha = 0.5) +
  ggtitle("Price vs. TotalAmount") +
  xlab("Price") +
  ylab("TotalAmount")

Bar chart for average TotalAmount by ProductCategory

# Mean TotalAmount per ProductCategory (one row per category).
avg_amount_by_category <- summarise(
  group_by(ecommerce_data, ProductCategory),
  Avg_TotalAmount = mean(TotalAmount)
)

# Bar chart of those means; geom_col() plots the supplied y values directly.
ggplot(
  data = avg_amount_by_category,
  mapping = aes(x = ProductCategory, y = Avg_TotalAmount)
) +
  geom_col(fill = "orange") +
  ggtitle("Average TotalAmount by Product Category") +
  xlab("Product Category") +
  ylab("Average TotalAmount")

#REGRESSION MODEL # Build a linear regression model

# Fit a linear model of TotalAmount on Age, Price, Quantity and Rating.
# NOTE(review): the str() preview suggests TotalAmount = Price * Quantity, so
# an additive model can only approximate that relationship — confirm intent.
model <- lm(
  formula = TotalAmount ~ Age + Price + Quantity + Rating,
  data = ecommerce_data
)

Summarize the model

# Show coefficients, standard errors, R-squared and the overall F-test.
print(summary(model))

Call:
lm(formula = TotalAmount ~ Age + Price + Quantity + Rating, data = ecommerce_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-493.41  -98.07    8.41   99.72  447.72 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -725.22455   52.37345 -13.847   <2e-16 ***
Age            0.08356    0.74111   0.113    0.910    
Price          3.14352    0.08184  38.412   <2e-16 ***
Quantity     238.95753    6.71918  35.564   <2e-16 ***
Rating        -8.46482    9.18344  -0.922    0.357    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 166.5 on 291 degrees of freedom
Multiple R-squared:  0.9076,    Adjusted R-squared:  0.9063 
F-statistic: 714.4 on 4 and 291 DF,  p-value: < 2.2e-16

#CORRELATION ANALYSIS # Correlation matrix

# Pairwise correlations (cor() defaults to Pearson) among the five numeric
# variables; the result is a symmetric 5 x 5 matrix.
correlation_matrix <- cor(
  ecommerce_data[c("Age", "Price", "Quantity", "Rating", "TotalAmount")]
)
print(correlation_matrix)
                    Age       Price    Quantity      Rating TotalAmount
Age          1.00000000 -0.04408382  0.02224395  0.03805062 -0.01471413
Price       -0.04408382  1.00000000  0.02973474 -0.06313723  0.70665706
Quantity     0.02224395  0.02973474  1.00000000 -0.09877651  0.65943058
Rating       0.03805062 -0.06313723 -0.09877651  1.00000000 -0.12278622
TotalAmount -0.01471413  0.70665706  0.65943058 -0.12278622  1.00000000

Visualize correlation matrix

library(corrplot)
# Render the correlation matrix graphically using circle glyphs.
corrplot(corr = correlation_matrix, method = "circle")

#STATISTICAL TESTS (Compare the average TotalAmount spent by male and female customers.) #T-test # Load necessary library

library(dplyr)

Check column names

# List the data frame's column names (now including the Outlier flag).
names(ecommerce_data)
 [1] "TransactionID"    "CustomerID"       "ProductID"        "Quantity"        
 [5] "PaymentMethod"    "TransactionDate"  "ProductCategory"  "Price"           
 [9] "Rating"           "TotalAmount"      "Age"              "Gender"          
[13] "Location"         "MembershipStatus" "Outlier"         

Convert TotalAmount to numeric

# Coerce TotalAmount to numeric; str() above already reports it as num, so
# this acts purely as a safeguard before testing.
ecommerce_data[["TotalAmount"]] <- as.numeric(ecommerce_data[["TotalAmount"]])

Convert Gender to factor

# Turn Gender into a factor so it can serve as the grouping variable.
ecommerce_data <- mutate(ecommerce_data, Gender = as.factor(Gender))

Check for missing values

# Number of missing TotalAmount values.
sum(is.na(ecommerce_data[["TotalAmount"]]))
[1] 0
# Number of missing Gender values (blank strings are not NA and are not counted).
sum(is.na(ecommerce_data[["Gender"]]))
[1] 0

Remove rows with missing values

# Retain rows where both TotalAmount and Gender are present; filter() combines
# comma-separated conditions with a logical AND.
ecommerce_data <- filter(
  ecommerce_data,
  !is.na(TotalAmount),
  !is.na(Gender)
)

Check levels of Gender

# Inspect the distinct categories of the Gender factor.
levels(ecommerce_data[["Gender"]])
[1] ""       "Female" "Male"  

Filter data to include only Male and Female

# Keep only rows whose Gender is Male or Female, then drop factor levels that
# no longer occur (such as the blank "" level) so they do not appear as empty
# groups in later tables, plots or tests.
ecommerce_data <- ecommerce_data %>%
  filter(Gender %in% c("Male", "Female")) %>%
  droplevels()

Perform the t-test

# Two-sample t-test comparing mean TotalAmount between the two Gender groups
# (t.test() defaults to the Welch unequal-variance form).
t_test_result <- t.test(TotalAmount ~ Gender, data = ecommerce_data)
# Display the outcome; without this the test result never appears in the
# report, unlike the chi-square and ANOVA results.
print(t_test_result)

Perform chi-square test

# Build the contingency table that chisq.test() consumes; it was referenced
# here without ever being created earlier in the script.
# NOTE(review): the intended variables are not recorded. MembershipStatus
# (3 levels) x PaymentMethod (4 levels) is consistent with the reported
# 6 degrees of freedom — confirm against the original analysis.
contingency_table <- table(
  ecommerce_data$MembershipStatus,
  ecommerce_data$PaymentMethod
)

# Pearson chi-squared test of independence between the two variables.
chi_square_result <- chisq.test(contingency_table)
print(chi_square_result)

    Pearson's Chi-squared test

data:  contingency_table
X-squared = 2.3691, df = 6, p-value = 0.8828

#ANOVA (Compare the average Total Amount across different Membership Status levels.) # Perform ANOVA

# One-way ANOVA: does mean TotalAmount differ across MembershipStatus groups?
anova_result <- aov(
  formula = TotalAmount ~ MembershipStatus,
  data = ecommerce_data
)
# Print the ANOVA table (Df, sums of squares, F value and p-value).
print(summary(anova_result))
                  Df   Sum Sq Mean Sq F value Pr(>F)
MembershipStatus   2   272752  136376   0.455  0.635
Residuals        280 83980962  299932               

#DATA VISUALIZATION (Create visualizations to summarize findings.) # Box plot for Total Amount by Membership Status

# Box plots of TotalAmount, one box per MembershipStatus group.
ggplot(
  data = ecommerce_data,
  mapping = aes(x = MembershipStatus, y = TotalAmount)
) +
  geom_boxplot(fill = "lightgreen") +
  ggtitle("TotalAmount by Membership Status") +
  xlab("Membership Status") +
  ylab("TotalAmount")

Donut chart for PaymentMethod

library(ggpubr)
library(ggplot2)

Create a summary table for PaymentMethod

# Count the rows recorded under each PaymentMethod and return the result
# without any grouping attached.
payment_counts <- ungroup(
  summarise(group_by(ecommerce_data, PaymentMethod), Count = n())
)

Calculate percentages

# Express each payment method's count as a percentage of the overall total.
payment_counts <- mutate(
  payment_counts,
  Percentage = Count / sum(Count) * 100
)

Create the donut chart using ggplot2

# Donut chart of payment-method shares: a single stacked bar at x = 2 is bent
# into a ring with polar coordinates, and the x-range starting at 1 leaves the
# empty hole in the middle. Each slice is labelled with its rounded percentage.
ggplot(
  data = payment_counts,
  mapping = aes(x = 2, y = Percentage, fill = PaymentMethod)
) +
  geom_col(width = 1) +
  geom_text(
    mapping = aes(label = paste0(round(Percentage, 1), "%")),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y") +
  xlim(1, 2.5) +
  theme_void() +
  labs(fill = "Payment Method")