STA4053_Mini Project

# Load libraries
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.3

## Warning: package 'ggplot2' was built under R version 4.5.3

## Warning: package 'dplyr' was built under R version 4.5.3

## Warning: package 'lubridate' was built under R version 4.5.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(corrplot) # For correlation heatmap

## Warning: package 'corrplot' was built under R version 4.5.3

## corrplot 0.95 loaded

# Load the dataset
data <- read.csv("C:/Users/ASUS/OneDrive/Desktop/Research work/archive (3)/Train.csv")

# Basic inspection
str(data)      # View data types

## 'data.frame':    10999 obs. of  12 variables:
##  $ ID                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Warehouse_block    : chr  "D" "F" "A" "B" ...
##  $ Mode_of_Shipment   : chr  "Flight" "Flight" "Flight" "Flight" ...
##  $ Customer_care_calls: int  4 4 2 3 2 3 3 4 3 3 ...
##  $ Customer_rating    : int  2 5 2 3 2 1 4 1 4 2 ...
##  $ Cost_of_the_Product: int  177 216 183 176 184 162 250 233 150 164 ...
##  $ Prior_purchases    : int  3 2 4 4 3 3 3 2 3 3 ...
##  $ Product_importance : chr  "low" "low" "low" "medium" ...
##  $ Gender             : chr  "F" "M" "M" "M" ...
##  $ Discount_offered   : int  44 59 48 10 46 12 3 48 11 29 ...
##  $ Weight_in_gms      : int  1233 3088 3374 1177 2484 1417 2371 2804 1861 1187 ...
##  $ Reached.on.Time_Y.N: int  1 1 1 1 1 1 1 1 1 1 ...

summary(data)  # Statistical summary

##        ID        Warehouse_block    Mode_of_Shipment   Customer_care_calls
##  Min.   :    1   Length:10999       Length:10999       Min.   :2.000      
##  1st Qu.: 2750   Class :character   Class :character   1st Qu.:3.000      
##  Median : 5500   Mode  :character   Mode  :character   Median :4.000      
##  Mean   : 5500                                         Mean   :4.054      
##  3rd Qu.: 8250                                         3rd Qu.:5.000      
##  Max.   :10999                                         Max.   :7.000      
##  Customer_rating Cost_of_the_Product Prior_purchases  Product_importance
##  Min.   :1.000   Min.   : 96.0       Min.   : 2.000   Length:10999      
##  1st Qu.:2.000   1st Qu.:169.0       1st Qu.: 3.000   Class :character  
##  Median :3.000   Median :214.0       Median : 3.000   Mode  :character  
##  Mean   :2.991   Mean   :210.2       Mean   : 3.568                     
##  3rd Qu.:4.000   3rd Qu.:251.0       3rd Qu.: 4.000                     
##  Max.   :5.000   Max.   :310.0       Max.   :10.000                     
##     Gender          Discount_offered Weight_in_gms  Reached.on.Time_Y.N
##  Length:10999       Min.   : 1.00    Min.   :1001   Min.   :0.0000     
##  Class :character   1st Qu.: 4.00    1st Qu.:1840   1st Qu.:0.0000     
##  Mode  :character   Median : 7.00    Median :4149   Median :1.0000     
##                     Mean   :13.37    Mean   :3634   Mean   :0.5967     
##                     3rd Qu.:10.00    3rd Qu.:5050   3rd Qu.:1.0000     
##                     Max.   :65.00    Max.   :7846   Max.   :1.0000

head(data)     # View first few rows

##   ID Warehouse_block Mode_of_Shipment Customer_care_calls Customer_rating
## 1  1               D           Flight                   4               2
## 2  2               F           Flight                   4               5
## 3  3               A           Flight                   2               2
## 4  4               B           Flight                   3               3
## 5  5               C           Flight                   2               2
## 6  6               F           Flight                   3               1
##   Cost_of_the_Product Prior_purchases Product_importance Gender
## 1                 177               3                low      F
## 2                 216               2                low      M
## 3                 183               4                low      M
## 4                 176               4             medium      M
## 5                 184               3             medium      F
## 6                 162               3             medium      F
##   Discount_offered Weight_in_gms Reached.on.Time_Y.N
## 1               44          1233                   1
## 2               59          3088                   1
## 3               48          3374                   1
## 4               10          1177                   1
## 5               46          2484                   1
## 6               12          1417                   1

# Histogram for Product Cost
ggplot(data, aes(x = Cost_of_the_Product)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Product Cost", x = "Cost", y = "Frequency") +
  theme_minimal()

# Bar chart for Warehouse Block
ggplot(data, aes(x = Warehouse_block, fill = Warehouse_block)) +
  geom_bar() +
  labs(title = "Orders per Warehouse Block", x = "Warehouse", y = "Count") +
  theme_minimal()

# Box plot: Cost vs. Reached on Time
# Note: Convert Reached.on.Time_Y.N to a factor for better visualization
data$Reached.on.Time_Y.N <- as.factor(data$Reached.on.Time_Y.N)

ggplot(data, aes(x = Reached.on.Time_Y.N, y = Weight_in_gms, fill = Reached.on.Time_Y.N)) +
  geom_boxplot() +
  labs(title = "Weight vs. Delivery Status", x = "Reached on Time (1=Yes, 0=No)", y = "Weight (gms)") +
  theme_minimal()

# Scatter plot: Discount vs. Weight
ggplot(data, aes(x = Discount_offered, y = Weight_in_gms, color = Reached.on.Time_Y.N)) +
  geom_point(alpha = 0.5) +
  labs(title = "Discount Offered vs. Weight", x = "Discount", y = "Weight") +
  theme_minimal()

# Select only numeric columns
numeric_cols <- data %>% select(Customer_care_calls, Customer_rating, Cost_of_the_Product, 
                                Prior_purchases, Discount_offered, Weight_in_gms)

# Calculate correlation
cor_matrix <- cor(numeric_cols, use = "complete.obs")

# Plot heatmap
corrplot(cor_matrix, method = "color", type = "upper", 
         addCoef.col = "black", tl.col = "black")

# Check for missing values in each column
colSums(is.na(data))

##                  ID     Warehouse_block    Mode_of_Shipment Customer_care_calls 
##                   0                   0                   0                   0 
##     Customer_rating Cost_of_the_Product     Prior_purchases  Product_importance 
##                   0                   0                   0                   0 
##              Gender    Discount_offered       Weight_in_gms Reached.on.Time_Y.N 
##                   0                   0                   0                   0

# A systematic check for missing values was conducted using the is.na() function in R. No missing values were detected across the 10,999 observations. Consequently, the dataset was deemed complete, and no imputation or listwise deletion was necessary, preserving the full statistical power of the sample

library(ggplot2)
library(gridExtra) # To show plots side-by-side

## Warning: package 'gridExtra' was built under R version 4.5.3

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# Create plots for the three continuous variables
p1 <- ggplot(data, aes(y = Cost_of_the_Product)) + geom_boxplot(fill="orange") + labs(title="Cost")
p2 <- ggplot(data, aes(y = Discount_offered)) + geom_boxplot(fill="lightgreen") + labs(title="Discount")
p3 <- ggplot(data, aes(y = Weight_in_gms)) + geom_boxplot(fill="lightblue") + labs(title="Weight")

grid.arrange(p1, p2, p3, ncol=3)

# Standardizing numerical data is crucial for PCA and FA
# 1. Load the dataset
Train <- read.csv("C:/Users/ASUS/OneDrive/Desktop/Research work/archive (3)/Train.csv")

colnames(Train)

##  [1] "ID"                  "Warehouse_block"     "Mode_of_Shipment"   
##  [4] "Customer_care_calls" "Customer_rating"     "Cost_of_the_Product"
##  [7] "Prior_purchases"     "Product_importance"  "Gender"             
## [10] "Discount_offered"    "Weight_in_gms"       "Reached.on.Time_Y.N"

df_numeric <- scale(Train[, c("Customer_care_calls", "Cost_of_the_Product", 
                              "Prior_purchases", "Discount_offered", "Weight_in_gms")])

#  Convert it to a data frame 
df_numeric <- as.data.frame(df_numeric)

#  Check the result
head(df_numeric)

##   Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 1         -0.04770915          -0.6906903      -0.3727178       1.88989745
## 2         -0.04770915           0.1207401      -1.0293770       2.81550758
## 3         -1.79980563          -0.5658549       0.2839414       2.13672681
## 4         -0.92375739          -0.7114962       0.2839414      -0.20815218
## 5         -1.79980563          -0.5450490      -0.3727178       2.01331213
## 6         -0.92375739          -1.0027789      -0.3727178      -0.08473749
##   Weight_in_gms
## 1    -1.4681730
## 2    -0.3338781
## 3    -0.1589950
## 4    -1.5024159
## 5    -0.7032119
## 6    -1.3556607

library(psych)

## Warning: package 'psych' was built under R version 4.5.3

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

# Use 'Principal Axis Factoring' and 'Varimax' rotation
fa_model <- fa(df_numeric, nfactors = 2, rotate = "varimax", fm = "pa")
print(fa_model$loadings, cutoff = 0.3)

## 
## Loadings:
##                     PA1    PA2   
## Customer_care_calls  0.659       
## Cost_of_the_Product  0.466       
## Prior_purchases      0.301       
## Discount_offered           -0.673
## Weight_in_gms       -0.394  0.697
## 
##                  PA1   PA2
## SS loadings    0.954 0.944
## Proportion Var 0.191 0.189
## Cumulative Var 0.191 0.380

# 1. Load the library
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

Train <- read.csv("C:/Users/ASUS/OneDrive/Desktop/Research work/archive (3)/Train.csv")

Train$Reached.on.Time_Y.N <- as.factor(Train$Reached.on.Time_Y.N)

#  Create the model
lda_model <- lda(Reached.on.Time_Y.N ~ Customer_care_calls + Cost_of_the_Product + 
                 Prior_purchases + Discount_offered + Weight_in_gms, data = Train)

print(lda_model)

## Call:
## lda(Reached.on.Time_Y.N ~ Customer_care_calls + Cost_of_the_Product + 
##     Prior_purchases + Discount_offered + Weight_in_gms, data = Train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4033094 0.5966906 
## 
## Group means:
##   Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 0            4.147656            214.4986        3.670424         5.545987
## 1            3.991467            207.2892        3.498095        18.663721
##   Weight_in_gms
## 0      4168.668
## 1      3272.640
## 
## Coefficients of linear discriminants:
##                              LD1
## Customer_care_calls -0.130524749
## Cost_of_the_Product -0.001589589
## Prior_purchases     -0.077249330
## Discount_offered     0.050429723
## Weight_in_gms       -0.000282091

plot(lda_model)

df_numeric <- scale(Train[, c("Customer_care_calls", "Cost_of_the_Product", 
                              "Prior_purchases", "Discount_offered", "Weight_in_gms")])

df_numeric <- as.data.frame(df_numeric)
class(df_numeric)

## [1] "data.frame"

head(df_numeric)    # show the first 6 rows of numbers

##   Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 1         -0.04770915          -0.6906903      -0.3727178       1.88989745
## 2         -0.04770915           0.1207401      -1.0293770       2.81550758
## 3         -1.79980563          -0.5658549       0.2839414       2.13672681
## 4         -0.92375739          -0.7114962       0.2839414      -0.20815218
## 5         -1.79980563          -0.5450490      -0.3727178       2.01331213
## 6         -0.92375739          -1.0027789      -0.3727178      -0.08473749
##   Weight_in_gms
## 1    -1.4681730
## 2    -0.3338781
## 3    -0.1589950
## 4    -1.5024159
## 5    -0.7032119
## 6    -1.3556607

colMeans(df_numeric) # show numbers close to 0

## Customer_care_calls Cost_of_the_Product     Prior_purchases    Discount_offered 
##        2.092292e-16       -1.617635e-16       -1.369231e-17        1.307154e-17 
##       Weight_in_gms 
##       -1.368121e-16

summary(df_numeric) #  show the statistical summary

##  Customer_care_calls Cost_of_the_Product Prior_purchases   Discount_offered 
##  Min.   :-1.79981    Min.   :-2.37597    Min.   :-1.0294   Min.   :-0.7635  
##  1st Qu.:-0.92376    1st Qu.:-0.85714    1st Qu.:-0.3727   1st Qu.:-0.5784  
##  Median :-0.04771    Median : 0.07913    Median :-0.3727   Median :-0.3933  
##  Mean   : 0.00000    Mean   : 0.00000    Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.82834    3rd Qu.: 0.84895    3rd Qu.: 0.2839   3rd Qu.:-0.2082  
##  Max.   : 2.58044    Max.   : 2.07650    Max.   : 4.2239   Max.   : 3.1858  
##  Weight_in_gms    
##  Min.   :-1.6100  
##  1st Qu.:-1.0973  
##  Median : 0.3149  
##  Mean   : 0.0000  
##  3rd Qu.: 0.8658  
##  Max.   : 2.5755

library(psych)
# Bartlett's Test
cortest.bartlett(cor(df_numeric), n = nrow(df_numeric))

## $chisq
## [1] 5535.419
## 
## $p.value
## [1] 0
## 
## $df
## [1] 10

# KMO Test
KMO(df_numeric)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = df_numeric)
## Overall MSA =  0.49
## MSA for each item = 
## Customer_care_calls Cost_of_the_Product     Prior_purchases    Discount_offered 
##                0.53                0.63                0.62                0.40 
##       Weight_in_gms 
##                0.45

# PCA 
pca_result <- prcomp(df_numeric, center = TRUE, scale. = TRUE)

# View the Importance of components
summary(pca_result)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.2718 1.1634 0.9469 0.8238 0.67362
## Proportion of Variance 0.3235 0.2707 0.1793 0.1357 0.09075
## Cumulative Proportion  0.3235 0.5942 0.7735 0.9093 1.00000

# Create the Scree Plot (to see the "Elbow")
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.5.3

## Welcome to factoextra!

## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/

fviz_eig(pca_result, addlabels = TRUE, ylim = c(0, 40)) +
  labs(title = "Scree Plot: Variance Explained")

# Create the Biplot (to see variable relationships)
fviz_pca_var(pca_result, col.var = "contrib", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE)

library(psych)

# 1. Parallel Analysis to see how many factors to extract

fa.parallel(df_numeric, fm="pa", fa="fa")

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA

fa_result <- fa(df_numeric, nfactors = 2, rotate = "varimax", fm = "pa")

print(fa_result$loadings, cutoff = 0.3)

## 
## Loadings:
##                     PA1    PA2   
## Customer_care_calls  0.659       
## Cost_of_the_Product  0.466       
## Prior_purchases      0.301       
## Discount_offered           -0.673
## Weight_in_gms       -0.394  0.697
## 
##                  PA1   PA2
## SS loadings    0.954 0.944
## Proportion Var 0.191 0.189
## Cumulative Var 0.191 0.380

library(psych)

# Run FA with 2 factors and Varimax rotation
fa_result <- fa(df_numeric, nfactors = 2, rotate = "varimax", fm = "pa")

# The 'cutoff = 0.3' hides small, unimportant relationships
print(fa_result$loadings, sort = TRUE, cutoff = 0.3)

## 
## Loadings:
##                     PA1    PA2   
## Customer_care_calls  0.659       
## Discount_offered           -0.673
## Weight_in_gms       -0.394  0.697
## Cost_of_the_Product  0.466       
## Prior_purchases      0.301       
## 
##                  PA1   PA2
## SS loadings    0.954 0.944
## Proportion Var 0.191 0.189
## Cumulative Var 0.191 0.380

library(MASS)
library(klaR) # For the partition plot

## Warning: package 'klaR' was built under R version 4.5.3

Train <- read.csv("C:/Users/ASUS/OneDrive/Desktop/Research work/archive (3)/Train.csv")
Train$Reached.on.Time_Y.N <- as.factor(Train$Reached.on.Time_Y.N)

# Fit the Model (The core analysis)
lda_model <- lda(Reached.on.Time_Y.N ~ Customer_care_calls + Cost_of_the_Product + 
                 Prior_purchases + Discount_offered + Weight_in_gms, data = Train)

# 3. View Numerical Results
print(lda_model)

## Call:
## lda(Reached.on.Time_Y.N ~ Customer_care_calls + Cost_of_the_Product + 
##     Prior_purchases + Discount_offered + Weight_in_gms, data = Train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4033094 0.5966906 
## 
## Group means:
##   Customer_care_calls Cost_of_the_Product Prior_purchases Discount_offered
## 0            4.147656            214.4986        3.670424         5.545987
## 1            3.991467            207.2892        3.498095        18.663721
##   Weight_in_gms
## 0      4168.668
## 1      3272.640
## 
## Coefficients of linear discriminants:
##                              LD1
## Customer_care_calls -0.130524749
## Cost_of_the_Product -0.001589589
## Prior_purchases     -0.077249330
## Discount_offered     0.050429723
## Weight_in_gms       -0.000282091

# Histogram of discriminant scores
plot(lda_model, col = "skyblue")

# Partition plot showing how Weight and Discount separate the groups
partimat(Reached.on.Time_Y.N ~ Discount_offered + Weight_in_gms, data = Train, method = "lda")

STA4053_Mini Project

Lakshan Dulanga Samaraweera

2026-04-10