# Load libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot) # For correlation heatmap
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
# Load the dataset
data <- read.csv("C:/Users/ASUS/OneDrive/Desktop/Research work/archive (3)/Train.csv")
# Basic inspection
str(data) # View data types
## 'data.frame': 10999 obs. of 12 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Warehouse_block : chr "D" "F" "A" "B" ...
## $ Mode_of_Shipment : chr "Flight" "Flight" "Flight" "Flight" ...
## $ Customer_care_calls: int 4 4 2 3 2 3 3 4 3 3 ...
## $ Customer_rating : int 2 5 2 3 2 1 4 1 4 2 ...
## $ Cost_of_the_Product: int 177 216 183 176 184 162 250 233 150 164 ...
## $ Prior_purchases : int 3 2 4 4 3 3 3 2 3 3 ...
## $ Product_importance : chr "low" "low" "low" "medium" ...
## $ Gender : chr "F" "M" "M" "M" ...
## $ Discount_offered : int 44 59 48 10 46 12 3 48 11 29 ...
## $ Weight_in_gms : int 1233 3088 3374 1177 2484 1417 2371 2804 1861 1187 ...
## $ Reached.on.Time_Y.N: int 1 1 1 1 1 1 1 1 1 1 ...
summary(data) # Statistical summary
## ID Warehouse_block Mode_of_Shipment Customer_care_calls
## Min. : 1 Length:10999 Length:10999 Min. :2.000
## 1st Qu.: 2750 Class :character Class :character 1st Qu.:3.000
## Median : 5500 Mode :character Mode :character Median :4.000
## Mean : 5500 Mean :4.054
## 3rd Qu.: 8250 3rd Qu.:5.000
## Max. :10999 Max. :7.000
## Customer_rating Cost_of_the_Product Prior_purchases Product_importance
## Min. :1.000 Min. : 96.0 Min. : 2.000 Length:10999
## 1st Qu.:2.000 1st Qu.:169.0 1st Qu.: 3.000 Class :character
## Median :3.000 Median :214.0 Median : 3.000 Mode :character
## Mean :2.991 Mean :210.2 Mean : 3.568
## 3rd Qu.:4.000 3rd Qu.:251.0 3rd Qu.: 4.000
## Max. :5.000 Max. :310.0 Max. :10.000
## Gender Discount_offered Weight_in_gms Reached.on.Time_Y.N
## Length:10999 Min. : 1.00 Min. :1001 Min. :0.0000
## Class :character 1st Qu.: 4.00 1st Qu.:1840 1st Qu.:0.0000
## Mode :character Median : 7.00 Median :4149 Median :1.0000
## Mean :13.37 Mean :3634 Mean :0.5967
## 3rd Qu.:10.00 3rd Qu.:5050 3rd Qu.:1.0000
## Max. :65.00 Max. :7846 Max. :1.0000
head(data) # View first few rows
## ID Warehouse_block Mode_of_Shipment Customer_care_calls Customer_rating
## 1 1 D Flight 4 2
## 2 2 F Flight 4 5
## 3 3 A Flight 2 2
## 4 4 B Flight 3 3
## 5 5 C Flight 2 2
## 6 6 F Flight 3 1
## Cost_of_the_Product Prior_purchases Product_importance Gender
## 1 177 3 low F
## 2 216 2 low M
## 3 183 4 low M
## 4 176 4 medium M
## 5 184 3 medium F
## 6 162 3 medium F
## Discount_offered Weight_in_gms Reached.on.Time_Y.N
## 1 44 1233 1
## 2 59 3088 1
## 3 48 3374 1
## 4 10 1177 1
## 5 46 2484 1
## 6 12 1417 1
# Histogram for Product Cost
ggplot(data, aes(x = Cost_of_the_Product)) +
geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
labs(title = "Distribution of Product Cost", x = "Cost", y = "Frequency") +
theme_minimal()

# Bar chart for Warehouse Block
ggplot(data, aes(x = Warehouse_block, fill = Warehouse_block)) +
geom_bar() +
labs(title = "Orders per Warehouse Block", x = "Warehouse", y = "Count") +
theme_minimal()

# Box plot: Cost vs. Reached on Time
# Note: Convert Reached.on.Time_Y.N to a factor for better visualization
data$Reached.on.Time_Y.N <- as.factor(data$Reached.on.Time_Y.N)
ggplot(data, aes(x = Reached.on.Time_Y.N, y = Weight_in_gms, fill = Reached.on.Time_Y.N)) +
geom_boxplot() +
labs(title = "Weight vs. Delivery Status", x = "Reached on Time (1=Yes, 0=No)", y = "Weight (gms)") +
theme_minimal()

# Scatter plot: Discount vs. Weight
ggplot(data, aes(x = Discount_offered, y = Weight_in_gms, color = Reached.on.Time_Y.N)) +
geom_point(alpha = 0.5) +
labs(title = "Discount Offered vs. Weight", x = "Discount", y = "Weight") +
theme_minimal()

# Select only numeric columns
numeric_cols <- data %>% select(Customer_care_calls, Customer_rating, Cost_of_the_Product,
Prior_purchases, Discount_offered, Weight_in_gms)
# Calculate correlation
cor_matrix <- cor(numeric_cols, use = "complete.obs")
# Plot heatmap
corrplot(cor_matrix, method = "color", type = "upper",
addCoef.col = "black", tl.col = "black")

# Check for missing values in each column
colSums(is.na(data))
## ID Warehouse_block Mode_of_Shipment Customer_care_calls
## 0 0 0 0
## Customer_rating Cost_of_the_Product Prior_purchases Product_importance
## 0 0 0 0
## Gender Discount_offered Weight_in_gms Reached.on.Time_Y.N
## 0 0 0 0
# A systematic check for missing values was conducted using the is.na() function in R. No missing values were detected across the 10,999 observations. Consequently, the dataset was deemed complete, and no imputation or listwise deletion was necessary, preserving the full statistical power of the sample
library(ggplot2)
library(gridExtra) # To show plots side-by-side
## Warning: package 'gridExtra' was built under R version 4.5.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Create plots for the three continuous variables
p1 <- ggplot(data, aes(y = Cost_of_the_Product)) + geom_boxplot(fill="orange") + labs(title="Cost")
p2 <- ggplot(data, aes(y = Discount_offered)) + geom_boxplot(fill="lightgreen") + labs(title="Discount")
p3 <- ggplot(data, aes(y = Weight_in_gms)) + geom_boxplot(fill="lightblue") + labs(title="Weight")
grid.arrange(p1, p2, p3, ncol=3)

# Standardizing numerical data is crucial for PCA and FA
# 1. Load the dataset
Train <- read.csv("C:/Users/ASUS/OneDrive/Desktop/Research work/archive (3)/Train.csv")
colnames(Train)
## [1] "ID" "Warehouse_block" "Mode_of_Shipment"
## [4] "Customer_care_calls" "Customer_rating" "Cost_of_the_Product"
## [7] "Prior_purchases" "Product_importance" "Gender"
## [10] "Discount_offered" "Weight_in_gms" "Reached.on.Time_Y.N"
df_numeric <- scale(Train[, c("Customer_care_calls", "Cost_of_the_Product",
"Prior_purchases", "Discount_offered", "Weight_in_gms")])
# Convert it to a data frame
df_numeric <- as.data.frame(df_numeric)
# Check the result
head(df_numeric)
## Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 1 -0.04770915 -0.6906903 -0.3727178 1.88989745
## 2 -0.04770915 0.1207401 -1.0293770 2.81550758
## 3 -1.79980563 -0.5658549 0.2839414 2.13672681
## 4 -0.92375739 -0.7114962 0.2839414 -0.20815218
## 5 -1.79980563 -0.5450490 -0.3727178 2.01331213
## 6 -0.92375739 -1.0027789 -0.3727178 -0.08473749
## Weight_in_gms
## 1 -1.4681730
## 2 -0.3338781
## 3 -0.1589950
## 4 -1.5024159
## 5 -0.7032119
## 6 -1.3556607
library(psych)
## Warning: package 'psych' was built under R version 4.5.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Use 'Principal Axis Factoring' and 'Varimax' rotation
fa_model <- fa(df_numeric, nfactors = 2, rotate = "varimax", fm = "pa")
print(fa_model$loadings, cutoff = 0.3)
##
## Loadings:
## PA1 PA2
## Customer_care_calls 0.659
## Cost_of_the_Product 0.466
## Prior_purchases 0.301
## Discount_offered -0.673
## Weight_in_gms -0.394 0.697
##
## PA1 PA2
## SS loadings 0.954 0.944
## Proportion Var 0.191 0.189
## Cumulative Var 0.191 0.380
# 1. Load the library
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
Train <- read.csv("C:/Users/ASUS/OneDrive/Desktop/Research work/archive (3)/Train.csv")
Train$Reached.on.Time_Y.N <- as.factor(Train$Reached.on.Time_Y.N)
# Create the model
lda_model <- lda(Reached.on.Time_Y.N ~ Customer_care_calls + Cost_of_the_Product +
Prior_purchases + Discount_offered + Weight_in_gms, data = Train)
print(lda_model)
## Call:
## lda(Reached.on.Time_Y.N ~ Customer_care_calls + Cost_of_the_Product +
## Prior_purchases + Discount_offered + Weight_in_gms, data = Train)
##
## Prior probabilities of groups:
## 0 1
## 0.4033094 0.5966906
##
## Group means:
## Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 0 4.147656 214.4986 3.670424 5.545987
## 1 3.991467 207.2892 3.498095 18.663721
## Weight_in_gms
## 0 4168.668
## 1 3272.640
##
## Coefficients of linear discriminants:
## LD1
## Customer_care_calls -0.130524749
## Cost_of_the_Product -0.001589589
## Prior_purchases -0.077249330
## Discount_offered 0.050429723
## Weight_in_gms -0.000282091
plot(lda_model)

df_numeric <- scale(Train[, c("Customer_care_calls", "Cost_of_the_Product",
"Prior_purchases", "Discount_offered", "Weight_in_gms")])
df_numeric <- as.data.frame(df_numeric)
class(df_numeric)
## [1] "data.frame"
head(df_numeric) # show the first 6 rows of numbers
## Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 1 -0.04770915 -0.6906903 -0.3727178 1.88989745
## 2 -0.04770915 0.1207401 -1.0293770 2.81550758
## 3 -1.79980563 -0.5658549 0.2839414 2.13672681
## 4 -0.92375739 -0.7114962 0.2839414 -0.20815218
## 5 -1.79980563 -0.5450490 -0.3727178 2.01331213
## 6 -0.92375739 -1.0027789 -0.3727178 -0.08473749
## Weight_in_gms
## 1 -1.4681730
## 2 -0.3338781
## 3 -0.1589950
## 4 -1.5024159
## 5 -0.7032119
## 6 -1.3556607
colMeans(df_numeric) # show numbers close to 0
## Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 2.092292e-16 -1.617635e-16 -1.369231e-17 1.307154e-17
## Weight_in_gms
## -1.368121e-16
summary(df_numeric) # show the statistical summary
## Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## Min. :-1.79981 Min. :-2.37597 Min. :-1.0294 Min. :-0.7635
## 1st Qu.:-0.92376 1st Qu.:-0.85714 1st Qu.:-0.3727 1st Qu.:-0.5784
## Median :-0.04771 Median : 0.07913 Median :-0.3727 Median :-0.3933
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.82834 3rd Qu.: 0.84895 3rd Qu.: 0.2839 3rd Qu.:-0.2082
## Max. : 2.58044 Max. : 2.07650 Max. : 4.2239 Max. : 3.1858
## Weight_in_gms
## Min. :-1.6100
## 1st Qu.:-1.0973
## Median : 0.3149
## Mean : 0.0000
## 3rd Qu.: 0.8658
## Max. : 2.5755
library(psych)
# Bartlett's Test
cortest.bartlett(cor(df_numeric), n = nrow(df_numeric))
## $chisq
## [1] 5535.419
##
## $p.value
## [1] 0
##
## $df
## [1] 10
# KMO Test
KMO(df_numeric)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = df_numeric)
## Overall MSA = 0.49
## MSA for each item =
## Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 0.53 0.63 0.62 0.40
## Weight_in_gms
## 0.45
# PCA
pca_result <- prcomp(df_numeric, center = TRUE, scale. = TRUE)
# View the Importance of components
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.2718 1.1634 0.9469 0.8238 0.67362
## Proportion of Variance 0.3235 0.2707 0.1793 0.1357 0.09075
## Cumulative Proportion 0.3235 0.5942 0.7735 0.9093 1.00000
# Create the Scree Plot (to see the "Elbow")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
fviz_eig(pca_result, addlabels = TRUE, ylim = c(0, 40)) +
labs(title = "Scree Plot: Variance Explained")

# Create the Biplot (to see variable relationships)
fviz_pca_var(pca_result, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)

library(psych)
# 1. Parallel Analysis to see how many factors to extract
fa.parallel(df_numeric, fm="pa", fa="fa")

## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
fa_result <- fa(df_numeric, nfactors = 2, rotate = "varimax", fm = "pa")
print(fa_result$loadings, cutoff = 0.3)
##
## Loadings:
## PA1 PA2
## Customer_care_calls 0.659
## Cost_of_the_Product 0.466
## Prior_purchases 0.301
## Discount_offered -0.673
## Weight_in_gms -0.394 0.697
##
## PA1 PA2
## SS loadings 0.954 0.944
## Proportion Var 0.191 0.189
## Cumulative Var 0.191 0.380
library(psych)
# Run FA with 2 factors and Varimax rotation
fa_result <- fa(df_numeric, nfactors = 2, rotate = "varimax", fm = "pa")
# The 'cutoff = 0.3' hides small, unimportant relationships
print(fa_result$loadings, sort = TRUE, cutoff = 0.3)
##
## Loadings:
## PA1 PA2
## Customer_care_calls 0.659
## Discount_offered -0.673
## Weight_in_gms -0.394 0.697
## Cost_of_the_Product 0.466
## Prior_purchases 0.301
##
## PA1 PA2
## SS loadings 0.954 0.944
## Proportion Var 0.191 0.189
## Cumulative Var 0.191 0.380
library(MASS)
library(klaR) # For the partition plot
## Warning: package 'klaR' was built under R version 4.5.3
Train <- read.csv("C:/Users/ASUS/OneDrive/Desktop/Research work/archive (3)/Train.csv")
Train$Reached.on.Time_Y.N <- as.factor(Train$Reached.on.Time_Y.N)
# Fit the Model (The core analysis)
lda_model <- lda(Reached.on.Time_Y.N ~ Customer_care_calls + Cost_of_the_Product +
Prior_purchases + Discount_offered + Weight_in_gms, data = Train)
# 3. View Numerical Results
print(lda_model)
## Call:
## lda(Reached.on.Time_Y.N ~ Customer_care_calls + Cost_of_the_Product +
## Prior_purchases + Discount_offered + Weight_in_gms, data = Train)
##
## Prior probabilities of groups:
## 0 1
## 0.4033094 0.5966906
##
## Group means:
## Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 0 4.147656 214.4986 3.670424 5.545987
## 1 3.991467 207.2892 3.498095 18.663721
## Weight_in_gms
## 0 4168.668
## 1 3272.640
##
## Coefficients of linear discriminants:
## LD1
## Customer_care_calls -0.130524749
## Cost_of_the_Product -0.001589589
## Prior_purchases -0.077249330
## Discount_offered 0.050429723
## Weight_in_gms -0.000282091
# Histogram of discriminant scores
plot(lda_model, col = "skyblue")

# Partition plot showing how Weight and Discount separate the groups
partimat(Reached.on.Time_Y.N ~ Discount_offered + Weight_in_gms, data = Train, method = "lda")
