setwd("~/R training")
library(readxl)
library(tidyverse)
library(ggplot2)
library(psych)
library(MASS)
library(dplyr)
library(graphics)
library(sjmisc)
library(summarytools)
library(ggthemes)
library(car)
library(rstatix)
library(stargazer)
library(corrplot)
library(forecast)
library(lmtest)
library(scales)
library(stats)
library(mice)
data<-read.csv("E-commerce_data.csv")
head(data)
NA
data <- read.csv("E-commerce_data.csv", na.strings = c("", "NA", "null"))
head(data)
data<-read.csv("E-commerce_data.csv")
missing_value <- colSums(is.na(data))
data.frame(missing_value)
colSums(is.na(data))
TransactionID CustomerID ProductID Quantity PaymentMethod
0 0 0 0 0
TransactionDate ProductCategory Price Rating TotalAmount
0 0 0 0 0
Age Gender Location MembershipStatus
8 0 0 0
## handling missing value for Age
#data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)
head(data)
colSums(is.na(data))
TransactionID CustomerID ProductID Quantity PaymentMethod
0 0 0 0 0
TransactionDate ProductCategory Price Rating TotalAmount
0 0 0 0 0
Age Gender Location MembershipStatus
8 0 0 0
## handling missing value for gender
get_mode <- function(x, na.rm = FALSE) {
if (na.rm) x <- na.omit(x)
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
# Use it
get_mode(data$Gender, na.rm = TRUE)
[1] "Male"
data$Gender[is.na(data$Gender)] <- get_mode(data$Gender, na.rm = TRUE)
• Detect and handle outliers in numerical variables (e.g., Age, Price, TotalAmount) using methods like the IQR rule or Z-score.
find_outliers_iqr <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
return(which(x < lower_bound | x > upper_bound))
}
numeric_columns <- sapply(data, is.numeric)
outlier_indices_list <- lapply(data[, numeric_columns], find_outliers_iqr)
# Print summary of outliers
for (col in names(outlier_indices_list)) {
cat("Variable:", col, " - Outliers found:", length(outlier_indices_list[[col]]), "\n")
}
Variable: TransactionID - Outliers found: 0
Variable: CustomerID - Outliers found: 0
Variable: ProductID - Outliers found: 0
Variable: Quantity - Outliers found: 0
Variable: Price - Outliers found: 0
Variable: Rating - Outliers found: 0
Variable: TotalAmount - Outliers found: 4
Variable: Age - Outliers found: 0
Q1 <- quantile(data$TotalAmount, 0.25, na.rm = TRUE)
Q3 <- quantile(data$TotalAmount, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
data_clean <- data[data$TotalAmount >= lower_bound & data$TotalAmount <= upper_bound, ]
Q1 <- quantile(data_clean$TotalAmount, 0.25, na.rm = TRUE)
Q3 <- quantile(data_clean$TotalAmount, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
sum(data_clean$TotalAmount < lower | data_clean$TotalAmount > upper, na.rm = TRUE)
[1] 3
##cleaning the remaining outliers
Q1 <- quantile(data_clean$TotalAmount, 0.25, na.rm = TRUE)
Q3 <- quantile(data_clean$TotalAmount, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
#data_clean <- data[data_clean$TotalAmount >= lower_bound & data$TotalAmount <= upper_bound, ]
#data_clean
##double checking if the outliers have been cleaned
Q1 <- quantile(data_clean$TotalAmount, 0.25, na.rm = TRUE)
Q3 <- quantile(data_clean$TotalAmount, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
sum(data_clean$TotalAmount < lower | data_clean$TotalAmount > upper, na.rm = TRUE)
[1] 3
• Perform univariate and bivariate analysis to understand the distribution of variables and relationships between them
## Summary Statistics
summary(data_clean)
TransactionID CustomerID ProductID Quantity PaymentMethod
Min. : 1.00 Min. : 4.00 Min. : 1.00 Min. :1.000 Length:296
1st Qu.: 74.75 1st Qu.: 76.75 1st Qu.: 6.00 1st Qu.:2.000 Class :character
Median :149.50 Median :156.50 Median :11.00 Median :3.000 Mode :character
Mean :149.80 Mean :154.93 Mean :10.64 Mean :3.101
3rd Qu.:224.25 3rd Qu.:230.25 3rd Qu.:15.25 3rd Qu.:4.000
Max. :300.00 Max. :299.00 Max. :20.00 Max. :5.000
TransactionDate ProductCategory Price Rating
Length:296 Length:296 Min. : 33.36 Min. :1.400
Class :character Class :character 1st Qu.:145.89 1st Qu.:2.100
Mode :character Mode :character Median :215.02 Median :2.600
Mean :250.65 Mean :2.932
3rd Qu.:341.40 3rd Qu.:3.700
Max. :466.49 Max. :5.000
TotalAmount Age Gender Location
Min. : 33.36 Min. :18.00 Length:296 Length:296
1st Qu.: 348.95 1st Qu.:30.00 Class :character Class :character
Median : 650.96 Median :42.00 Mode :character Mode :character
Mean : 782.45 Mean :41.80
3rd Qu.:1087.23 3rd Qu.:52.25
Max. :2252.85 Max. :65.00
NA's :8
MembershipStatus
Length:296
Class :character
Mode :character
## frequency of categorical variables
#freq(data_clean)
boxplot(data_clean$Age, main='Age')
hist(data_clean$Price,main="Histogram of Price",xlab="Price",col='blue')
table(data_clean$Gender)
Female Male
13 125 158
table(data_clean$PaymentMethod)
Cash on Delivery Credit Card Debit Card UPI
79 63 65 89
table(data_clean$MembershipStatus)
Basic Gold Premium
103 93 100
##scatterplot
ggplot(data_clean,aes(x=Price,y=TotalAmount))+
geom_point(alpha=0.6,color="blue")+
labs(title="Scatterplot of Price vs TotalAmount")
library(ggplot2)
ggplot(data_clean,aes(x=ProductCategory, y=Price,fill=ProductCategory))+
geom_boxplot()+
labs(title="ProductCategory vs Price")+
geom_boxplot()
ggplot(data=data_clean, aes(x=ProductCategory, y=Price))+
geom_line()
contingency_table<-table(data_clean$PaymentMethod,data_clean$MembershipStatus)
print(contingency_table)
Basic Gold Premium
Cash on Delivery 24 34 21
Credit Card 22 16 25
Debit Card 22 20 23
UPI 35 23 31
• Build a regression model to predict TotalAmount based on variables like Age, Price, Quantity, and Rating.
model<-lm(TotalAmount~Age+Price+Rating+Quantity,data=data_clean)
summary(model)
Call:
lm(formula = TotalAmount ~ Age + Price + Rating + Quantity, data = data_clean)
Residuals:
Min 1Q Median 3Q Max
-493.48 -97.40 12.02 104.31 455.06
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -727.22946 53.14078 -13.685 <2e-16 ***
Age 0.09187 0.74622 0.123 0.902
Price 3.15513 0.08333 37.862 <2e-16 ***
Rating -10.00590 9.44359 -1.060 0.290
Quantity 239.44714 6.83102 35.053 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 167.6 on 283 degrees of freedom
(8 observations deleted due to missingness)
Multiple R-squared: 0.9065, Adjusted R-squared: 0.9052
F-statistic: 685.8 on 4 and 283 DF, p-value: < 2.2e-16
library(car)
vif_values=vif(model)
print(vif_values)
Age Price Rating Quantity
1.004085 1.004369 1.012183 1.009450
library(tidyverse)
library(corrplot)
cor_matrix<-cor(select(data_clean, where(is.numeric)),use="complete.obs")
corrplot(cor_matrix,
method="color",
type="full",
tl.col="black",
tl.srt=45,
addCoef.col="white",
number.cex=0.7,
diag=TRUE)
library(forecast)
checkresiduals(model)
Breusch-Godfrey test for serial correlation of order up to 10
data: Residuals
LM test = 3.9246, df = 10, p-value = 0.9507
##3.5 Correlation Analysis • Analyze the correlation between numerical variables (e.g., Price vs. TotalAmount, Age vs. TotalAmount).
Hypothesis:H0:There is significant relationship between the variables H1:There is no significant Relationship in atleast one of the variables
## correlation matrix
library(ggplot2)
library(corrplot)
num_data_clean<-data_clean[,c("Age","Price","Quantity","Rating","TotalAmount")]
cor_matrix<-cor(num_data_clean,use="complete.obs")
cor_matrix
Age Price Quantity Rating TotalAmount
Age 1.00000000 -0.04463762 0.02247050 0.03892636 -0.01491569
Price -0.04463762 1.00000000 0.01927763 -0.04750919 0.70292125
Quantity 0.02247050 0.01927763 1.00000000 -0.09178436 0.65531595
Rating 0.03892636 -0.04750919 -0.09178436 1.00000000 -0.11081925
TotalAmount -0.01491569 0.70292125 0.65531595 -0.11081925 1.00000000
##pearson Rank
##pearson Rank
library(corrplot)
num_data_clean<-data_clean[,c("Age","Price","Quantity","Rating","TotalAmount")]
pearson_matrix<-cor(num_data_clean,method="pearson")
pearson_matrix
Age Price Quantity Rating TotalAmount
Age 1 NA NA NA NA
Price NA 1.00000000 0.02973474 -0.06313723 0.7066571
Quantity NA 0.02973474 1.00000000 -0.09877651 0.6594306
Rating NA -0.06313723 -0.09877651 1.00000000 -0.1227862
TotalAmount NA 0.70665706 0.65943058 -0.12278622 1.0000000
## spearman
library(corrplot)
num_data_clean<-data_clean[,c("Age","Price","Quantity","Rating","TotalAmount")]
spearman_matrix<-cor(num_data_clean,method="spearman")
spearman_matrix
Age Price Quantity Rating TotalAmount
Age 1 NA NA NA NA
Price NA 1.000000000 0.01763169 -0.001029652 0.68529096
Quantity NA 0.017631694 1.00000000 -0.066746308 0.68980207
Rating NA -0.001029652 -0.06674631 1.000000000 -0.05791432
TotalAmount NA 0.685290962 0.68980207 -0.057914324 1.00000000
• Perform a t-test to compare the average TotalAmount spent by male and female customers. • Perform a chi-square test to check the association between ProductCategory and PaymentMethod. • Perform ANOVA to compare the average TotalAmount across different MembershipStatus levels. ________________________________________
## Hypothesis: H_0:There is a statistically significant difference between male and female spending.")
## H_1:There is no statistically significant difference between male and female spending.")
library(dplyr)
#Checking for missing values in Gender and TotalAmount
sum(is.na(data_clean$Gender))
[1] 0
sum(is.na(data_clean$TotalAmount))
[1] 0
# 5. Separate TotalAmount by Gender
male_spending <- data_clean %>% filter(Gender == "Male") %>% pull(TotalAmount)
female_spending <- data_clean %>% filter(Gender == "Female") %>% pull(TotalAmount)
# 6. Perform Independent T-Test (Welch's T-Test - assumes unequal variances)
t_test_result <- t.test(male_spending, female_spending, var.equal = FALSE)
# 7. Print the result
print(t_test_result)
Welch Two Sample t-test
data: male_spending and female_spending
t = 2.4726, df = 274.83, p-value = 0.01402
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
32.3603 285.1822
sample estimates:
mean of x mean of y
854.8084 696.0371
Output interpretation for t_test:p_value<0.05;Reject H0 conclusion: there is no significant difference between Female and male spending
##• Perform a chi-square test to check the association between ProductCategory and PaymentMethod. ## chi square Test
## Hypothesis:Hâ‚€: There is an association between Product Category and Payment Method
## H1:There is no association between Product Category and Payment Method
contingency_table <- table(data_clean$ProductCategory, data_clean$PaymentMethod)
chi_square_result <- chisq.test(contingency_table)
print(chi_square_result)
Pearson's Chi-squared test
data: contingency_table
X-squared = 2.2394, df = 6, p-value = 0.8964
conclusion: P-value >0.05: Do not Reject the nullhypothesis; There is an association between payment method and product category.
##• Perform ANOVA to compare the average TotalAmount across different MembershipStatus levels.
##Hypothesis;H0:there is significant difference between average Total amount accross different membership levels
## H1:there is no significant difference between average Total amount accross different membership levels
# Perform one-way ANOVA
anova_result <- aov(TotalAmount ~ MembershipStatus, data = data_clean)
summary(anova_result)
Df Sum Sq Mean Sq F value Pr(>F)
MembershipStatus 2 451973 225986 0.762 0.467
Residuals 293 86840140 296383
Output interpretation:P_value<0.05,Do not reject null hypothesis Conclusion:there is significant difference between average Total Amount across different membership levels ## 4. Data Visualization 4.1 Histogram • Visualize the distribution of numerical variables like Age, Price, and TotalAmount.
4.4 Donut Chart • Display the distribution of PaymentMethod used by customers. 4.5 Scatter Plot • Visualize the relationship between Price and TotalAmount. 4.6 Box Plot • Identify outliers in TotalAmount or Price. ________________________________________
par(mfrow = c(1, 3))
# Histograms
hist(data_clean$Age, col = "lightblue", main = "Distribution of Age", xlab = "Age")
hist(data_clean$Price, col = "lightgreen", main = "Distribution of Price", xlab = "Price")
hist(data_clean$TotalAmount, col = "salmon", main = "Distribution of TotalAmount", xlab = "Total Amount")
hist(data_clean$Rating, col = "red", main = "Distribution of Rating", xlab = "Rating")
##4.2 Pie Chart • Show the proportion of customers by Gender or MembershipStatus.
# Pie chart for Membership Status
membership_counts <- table(data_clean$MembershipStatus)
pie(membership_counts, main = "Membership Status Distribution", col = rainbow(length(membership_counts)))
# Pie chart for Gender
gender_counts <- table(data_clean$Gender)
pie(gender_counts, main = "Customer Gender Distribution", col = c("lightblue", "pink"))
• Compare the average TotalAmount spent by customers in different Location or ProductCategory.
## TotalAmount by ProductCategory
library(ggplot2)
library(dplyr)
# Calculate average total amount by product category
avg_by_category <- data %>%
group_by(ProductCategory) %>%
summarise(AvgTotalAmount = mean(TotalAmount, na.rm = TRUE))
ggplot(avg_by_category, aes(x = reorder(ProductCategory, AvgTotalAmount),
y = AvgTotalAmount,
fill = ProductCategory)) +
geom_bar(stat = "identity") +
labs(title = "Average Total Amount Spent by Product Category",
x = "Product Category",
y = "Average Total Amount") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = FALSE)
## TotalAmount by Location bar chart
library(ggplot2)
library(dplyr)
# Calculate average total amount by Location
avg_by_Location<- data %>%
group_by(Location) %>%
summarise(AvgTotalAmount = mean(TotalAmount, na.rm = TRUE))
ggplot(avg_by_Location, aes(x = reorder(Location, AvgTotalAmount),
y = AvgTotalAmount,
fill = Location)) +
geom_bar(stat = "identity") +
labs(title = "Average Total Amount Spent by Location",
x = "Location",
y = "Average Total Amount") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = FALSE)
# Load libraries
library(ggplot2)
library(ggforce)
library(dplyr)
# Count payment method frequencies
payment_counts <- data %>%
count(PaymentMethod, name = "Count") %>%
mutate(
Percent = round(Count / sum(Count) * 100, 1),
Cumulative = cumsum(Count),
start = lag(Cumulative, default = 0),
end = Cumulative,
label_position = (start + end) / 2
)
# Convert to radians
payment_counts$radians_start <- payment_counts$start * 2 * pi / sum(payment_counts$Count)
payment_counts$radians_end <- payment_counts$end * 2 * pi / sum(payment_counts$Count)# Plot
ggplot(payment_counts) +
geom_arc_bar(
aes(
x0 = 0, y0 = 0, r0 = 0.5,
r = 1,
start = radians_start,
end = radians_end,
fill = PaymentMethod
)
) +
coord_polar(theta = "y") +
xlim(c(-1.1, 1.1)) +
ylim(c(0, 1.1)) +
scale_fill_brewer(palette = "Set3") +
theme_void() +
labs(title = "Distribution of Payment Methods Used by Customers", fill = "Payment Method") +
annotate("text",
x = 0,
y = seq(0.6, 1.05, length.out = nrow(payment_counts)),
label = paste0(payment_counts$Percent, "%"),
size = 4,
color = "black")
ggplot(data = data, aes(x = Price, y = TotalAmount, color = as.factor(Quantity))) +
geom_point(alpha = 0.7) +
labs(
title = "Price vs TotalAmount (Colored by Quantity)",
x = "Price per Unit",
y = "Total Amount Spent",
color = "Quantity"
) +
theme_minimal()
## TotalAmount
ggplot(data, aes(y = TotalAmount)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Box Plot of TotalAmount", y = "Total Amount Spent") +
theme_minimal()
##Price
ggplot(data, aes(y = Price)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Box Plot of Price", y = "Price") +
theme_minimal()