This project involves data from an e-commerce platform.

---

## 1. Objective

The objective of this project is to:

- Perform data analysis to uncover insights, including handling missing values and outliers, exploratory data analysis (EDA), regression modeling, correlation analysis, and statistical tests.
- Visualize the data using various charts and graphs.
- Use R as the primary tool for data analysis and visualization.

---

## 2. Data

The dataset includes the following variables:

### Customer Details

- CustomerID: Unique identifier for each customer.
- Age: Age of the customer.
- Gender: Gender of the customer (Male/Female/Other).
- Location: City or region of the customer.
- MembershipStatus: Membership level (Basic, Premium, Gold).

### Product Information

- ProductID: Unique identifier for each product.
- ProductCategory: Category of the product (e.g., Electronics, Clothing, Home Appliances).
- Price: Price of the product.
- Rating: Average customer rating of the product (1-5).

### Transaction Information

- TransactionID: Unique identifier for each transaction.
- CustomerID: Links to the customer who made the transaction.
- ProductID: Links to the product purchased.
- Quantity: Number of units purchased.
- TotalAmount: Total amount spent in the transaction.
- PaymentMethod: Payment method used (Credit Card, Debit Card, UPI, Cash on Delivery).
- TransactionDate: Date of the transaction.
\newpage
##W/D
# Import the e-commerce data set.
# The CSV path is relative to the current working directory.
ecommerce_data <- read.csv("E-commerce_data.csv")

# View() opens a spreadsheet-style data viewer, which only makes sense in an
# interactive session; skip it when the script is run or rendered in batch.
if (interactive()) {
  View(ecommerce_data)
}

# Print each column's name, type, and first few values.
str(ecommerce_data)
'data.frame': 300 obs. of 14 variables:
$ TransactionID : int 1 2 3 4 5 6 7 8 9 10 ...
$ CustomerID : int 207 253 110 256 274 52 191 165 18 169 ...
$ ProductID : int 14 15 19 1 2 15 10 5 5 17 ...
$ Quantity : int 5 1 5 3 5 2 1 5 4 2 ...
$ PaymentMethod : chr "UPI" "Cash on Delivery" "Cash on Delivery" "Debit Card" ...
$ TransactionDate : chr "12/28/2023" "4/17/2023" "1/17/2023" "10/23/2023" ...
$ ProductCategory : chr "Clothing" "Electronics" "Clothing" "Home Appliances" ...
$ Price : num 33.4 389 145.9 215 325.5 ...
$ Rating : num 4.2 2.2 1.7 2.7 3.4 2.2 3.7 1.4 1.4 1.9 ...
$ TotalAmount : num 167 389 729 645 1627 ...
$ Age : int 50 19 37 50 24 50 40 37 25 53 ...
$ Gender : chr "Male" "Female" "Female" "Male" ...
$ Location : chr "Houston" "Chicago" "Houston" "Los Angeles" ...
$ MembershipStatus: chr "Basic" "Premium" "Basic" "Premium" ...
# DATA CLEANING AND PREPROCESSING
# Handling missing values
# Count the NA entries in every column. Only true NA values are counted here;
# empty strings in character columns are not NA and are not included.
colSums(is.na(ecommerce_data))
TransactionID CustomerID ProductID Quantity
0 0 0 0
PaymentMethod TransactionDate ProductCategory Price
0 0 0 0
Rating TotalAmount Age Gender
0 0 8 0
Location MembershipStatus
0 0
library(dplyr)
library(tidyr)
# Impute missing values in two steps:
#   1. Age    -> replace NA with the mean of the observed ages.
#   2. Gender -> replace NA with the most frequent gender value.
# NOTE(review): the NA count for Gender is 0 and the column contains an empty
# string category, so blank genders are not NA and step 2 currently replaces
# nothing. Confirm whether blanks should be treated as missing.
ecommerce_data <- ecommerce_data %>%
  mutate(
    Age = ifelse(is.na(Age), mean(Age, na.rm = TRUE), Age)
  ) %>%
  mutate(
    Gender = ifelse(
      is.na(Gender),
      names(sort(table(Gender), decreasing = TRUE))[1],
      Gender
    )
  )
# Handle outliers
# Detect outliers using the IQR method

#' Flag outliers with the 1.5 * IQR rule
#'
#' A value is flagged when it lies below Q1 - 1.5 * IQR or above
#' Q3 + 1.5 * IQR, where Q1 and Q3 are the first and third quartiles of `x`.
#' Missing values are ignored when computing the quartiles and are returned
#' as `NA` in the result.
#'
#' @param x A numeric vector.
#' @return An unnamed logical vector the same length as `x`; `TRUE` marks an
#'   outlier, `NA` marks a missing input value.
outlier_detection <- function(x) {
  # na.rm = TRUE: quantile() stops with an error on NA input otherwise.
  # names = FALSE: keeps the "25%"/"75%" labels out of the result.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)

  # Named `iqr_width` rather than `IQR` to avoid shadowing stats::IQR().
  iqr_width <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width

  x < lower_bound | x > upper_bound
}
# Flag each transaction whose TotalAmount is an IQR outlier, then keep only
# the rows that were not flagged. The Outlier column stays in the data frame.
ecommerce_data <- ecommerce_data %>%
  mutate(Outlier = outlier_detection(TotalAmount)) %>%
  filter(!Outlier)
# EXPLORATORY DATA ANALYSIS (EDA)
# Univariate analysis
# Box plot for Age
library(ggplot2)
# geom_boxplot() has no `binwidth` parameter (that belongs to the histogram),
# so it is not passed here. With only `x` mapped the box plot is horizontal
# and the y axis carries no information, so its label, text, and ticks are
# removed instead of being labelled "Count".
ggplot(ecommerce_data, aes(x = Age)) +
  geom_boxplot(fill = "blue", color = "black") +
  labs(title = "Distribution of Age", x = "Age", y = NULL) +
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
library(ggplot2)
# Histogram for Age: counts of customers in 5-year-wide bins.
ggplot(data = ecommerce_data, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "blue") +
  ggtitle("Distribution of Age") +
  xlab("Age") +
  ylab("Count")
# Pie chart of the share of rows per gender.
gender_counts <- table(ecommerce_data$Gender)

# The Gender column contains an empty-string category; give it a readable
# label so the pie chart does not show an unlabelled slice.
gender_labels <- names(gender_counts)
gender_labels[!nzchar(gender_labels)] <- "Unspecified"

pie(
  gender_counts,
  labels = gender_labels,
  main = "Proportion of Customers by Gender"
)
# Bivariate analysis
# Scatter plot for Price vs. TotalAmount (semi-transparent red points so that
# overlapping transactions remain visible).
ggplot(data = ecommerce_data, mapping = aes(x = Price, y = TotalAmount)) +
  geom_point(colour = "red", alpha = 0.5) +
  ggtitle("Price vs. TotalAmount") +
  xlab("Price") +
  ylab("TotalAmount")
# Mean TotalAmount per product category, one row per category.
avg_amount_by_category <- summarise(
  group_by(ecommerce_data, ProductCategory),
  Avg_TotalAmount = mean(TotalAmount)
)

# Bar chart of those means; geom_col() draws bars at the supplied y values.
ggplot(
  data = avg_amount_by_category,
  mapping = aes(x = ProductCategory, y = Avg_TotalAmount)
) +
  geom_col(fill = "orange") +
  ggtitle("Average TotalAmount by Product Category") +
  xlab("Product Category") +
  ylab("Average TotalAmount")
# REGRESSION MODEL
# Build a linear regression model: TotalAmount is the response; Age, Price,
# Quantity, and Rating enter as additive predictors.
model <- lm(TotalAmount ~ Age + Price + Quantity + Rating, data = ecommerce_data)
# Print the residual summary, coefficient table, R-squared, and F-statistic.
summary(model)
Call:
lm(formula = TotalAmount ~ Age + Price + Quantity + Rating, data = ecommerce_data)
Residuals:
Min 1Q Median 3Q Max
-493.41 -98.07 8.41 99.72 447.72
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -725.22455 52.37345 -13.847 <2e-16 ***
Age 0.08356 0.74111 0.113 0.910
Price 3.14352 0.08184 38.412 <2e-16 ***
Quantity 238.95753 6.71918 35.564 <2e-16 ***
Rating -8.46482 9.18344 -0.922 0.357
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 166.5 on 291 degrees of freedom
Multiple R-squared: 0.9076, Adjusted R-squared: 0.9063
F-statistic: 714.4 on 4 and 291 DF, p-value: < 2.2e-16
# CORRELATION ANALYSIS
# Correlation matrix of the numeric variables (cor() defaults to Pearson).
correlation_matrix <- ecommerce_data %>%
  select(Age, Price, Quantity, Rating, TotalAmount) %>%
  cor()
print(correlation_matrix)
Age Price Quantity Rating TotalAmount
Age 1.00000000 -0.04408382 0.02224395 0.03805062 -0.01471413
Price -0.04408382 1.00000000 0.02973474 -0.06313723 0.70665706
Quantity 0.02224395 0.02973474 1.00000000 -0.09877651 0.65943058
Rating 0.03805062 -0.06313723 -0.09877651 1.00000000 -0.12278622
TotalAmount -0.01471413 0.70665706 0.65943058 -0.12278622 1.00000000
library(corrplot)
# Visualize the correlation matrix, drawing one circle per pair of variables.
corrplot(corr = correlation_matrix, method = "circle")
# STATISTICAL TESTS
# T-test: compare the average TotalAmount spent by male and female customers.
# Load necessary library
library(dplyr)
# List the columns available at this point in the analysis.
names(ecommerce_data)
[1] "TransactionID" "CustomerID" "ProductID" "Quantity"
[5] "PaymentMethod" "TransactionDate" "ProductCategory" "Price"
[9] "Rating" "TotalAmount" "Age" "Gender"
[13] "Location" "MembershipStatus" "Outlier"
# Make sure the response is numeric and the grouping variable is a factor.
ecommerce_data <- ecommerce_data %>%
  mutate(
    TotalAmount = as.numeric(TotalAmount),
    Gender = as.factor(Gender)
  )
# Number of missing TotalAmount values.
sum(is.na(ecommerce_data[["TotalAmount"]]))
[1] 0
# Number of missing Gender values (empty strings are not counted as NA).
sum(is.na(ecommerce_data[["Gender"]]))
[1] 0
# Drop rows where either the response or the grouping variable is NA.
ecommerce_data <- ecommerce_data %>%
  filter(!is.na(TotalAmount), !is.na(Gender))
# Inspect the gender categories that remain.
levels(ecommerce_data[["Gender"]])
[1] "" "Female" "Male"
# Keep only the two groups being compared; this removes the rows whose
# Gender is the empty-string level.
ecommerce_data <- ecommerce_data %>%
  filter(Gender == "Male" | Gender == "Female")

# Two-sample t-test of TotalAmount by Gender (t.test() defaults to the Welch
# test, which does not assume equal variances).
t_test_result <- t.test(TotalAmount ~ Gender, data = ecommerce_data)
print(t_test_result)
Welch Two Sample t-test
data: TotalAmount by Gender
t = -2.4726, df = 274.83, p-value = 0.01402
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
-285.1822 -32.3603
sample estimates:
mean in group Female mean in group Male
696.0371 854.8084
# Chi-square test: check the association between ProductCategory and
# PaymentMethod.
# Create a contingency table (rows: product category, columns: payment method).
contingency_table <- table(
  ecommerce_data[["ProductCategory"]],
  ecommerce_data[["PaymentMethod"]]
)
# Pearson's chi-squared test of independence on that table.
chi_square_result <- chisq.test(contingency_table)
print(chi_square_result)
Pearson's Chi-squared test
data: contingency_table
X-squared = 2.3691, df = 6, p-value = 0.8828
# ANOVA: compare the average TotalAmount across the MembershipStatus levels.
# Perform a one-way ANOVA with MembershipStatus as the only factor.
anova_result <- aov(TotalAmount ~ MembershipStatus, data = ecommerce_data)
# Print the ANOVA table (degrees of freedom, sums of squares, F value, p-value).
summary(anova_result)
Df Sum Sq Mean Sq F value Pr(>F)
MembershipStatus 2 272752 136376 0.455 0.635
Residuals 280 83980962 299932
# DATA VISUALIZATION
# Create visualizations to summarize findings.
# Box plot for TotalAmount by Membership Status
ggplot(
  data = ecommerce_data,
  mapping = aes(x = MembershipStatus, y = TotalAmount)
) +
  geom_boxplot(fill = "lightgreen") +
  ggtitle("TotalAmount by Membership Status") +
  xlab("Membership Status") +
  ylab("TotalAmount")
library(ggpubr)
library(ggplot2)
# Count the transactions per payment method and express each count as a
# percentage of all transactions.
payment_counts <- ecommerce_data %>%
  group_by(PaymentMethod) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = Count / sum(Count) * 100)

# Donut chart: one stacked bar at x = 2 wrapped into polar coordinates.
# The x limits start at 1, below the bar, which leaves the hole in the middle.
ggplot(
  data = payment_counts,
  mapping = aes(x = 2, y = Percentage, fill = PaymentMethod)
) +
  geom_col(width = 1) +
  geom_text(
    mapping = aes(label = paste0(round(Percentage, 1), "%")),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y") +
  xlim(1, 2.5) +
  theme_void() +
  labs(fill = "Payment Method")