library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.3.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Load the shopping-mall customer data from CSV and preview the first rows.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact directory layout exists.
data <- read.csv(
  file = "E:/Alliance University/Sem 3/data/Data CSV/Shopping Mall Customer Segmentation Data .csv"
)
head(data, n = 6L)
##                            Customer.ID Age Gender Annual.Income Spending.Score
## 1 d410ea53-6661-42a9-ad3a-f554b05fd2a7  30   Male        151479             89
## 2 1770b26f-493f-46b6-837f-4237fb5a314e  58 Female        185088             95
## 3 e81aa8eb-1767-4b77-87ce-1620dc732c5e  62 Female         70912             76
## 4 9795712a-ad19-47bf-8886-4f997d6046e3  23   Male         55460             57
## 5 64139426-2226-4cd6-bf09-91bce4b4db5e  24   Male        153752             76
## 6 7e211337-e92f-4140-8231-5c9ac7a2aa12  42   Male        158335             40
# Show the column types and a few sample values of the loaded data frame.
str(object = data)
## 'data.frame':    15079 obs. of  5 variables:
##  $ Customer.ID   : chr  "d410ea53-6661-42a9-ad3a-f554b05fd2a7" "1770b26f-493f-46b6-837f-4237fb5a314e" "e81aa8eb-1767-4b77-87ce-1620dc732c5e" "9795712a-ad19-47bf-8886-4f997d6046e3" ...
##  $ Age           : int  30 58 62 23 24 42 27 24 83 62 ...
##  $ Gender        : chr  "Male" "Female" "Female" "Male" ...
##  $ Annual.Income : int  151479 185088 70912 55460 153752 158335 163501 70476 47743 63448 ...
##  $ Spending.Score: int  89 95 76 57 76 40 37 17 34 3 ...
# Per-column summary: quantiles for numeric columns, length/class for text.
summary(object = data)
##  Customer.ID             Age           Gender          Annual.Income   
##  Length:15079       Min.   :18.00   Length:15079       Min.   : 20022  
##  Class :character   1st Qu.:36.00   Class :character   1st Qu.: 64141  
##  Mode  :character   Median :54.00   Mode  :character   Median :109190  
##                     Mean   :54.19                      Mean   :109743  
##                     3rd Qu.:72.00                      3rd Qu.:155008  
##                     Max.   :90.00                      Max.   :199974  
##  Spending.Score  
##  Min.   :  1.00  
##  1st Qu.: 26.00  
##  Median : 51.00  
##  Mean   : 50.59  
##  3rd Qu.: 75.00  
##  Max.   :100.00
# Number of missing (NA) entries in each column.
is.na(data) |> colSums()
##    Customer.ID            Age         Gender  Annual.Income Spending.Score 
##              0              0              0              0              0
# Visual missing-value profile (DataExplorer).
DataExplorer::plot_missing(data = data)

# Histograms
# Named logical mask flagging the numeric columns of `data`. It is reused
# further down for the correlation matrix and the Hopkins statistic.
# vapply() is used instead of sapply() so the result type is guaranteed.
num_vars <- vapply(data, is.numeric, logical(1))
for (var in names(data)[num_vars]) {
  # `.data[[var]]` is the supported way to map a column given by name;
  # it replaces the deprecated aes_string().
  hist_plot <- ggplot(data, aes(x = .data[[var]])) +
    geom_histogram(bins = 30, fill = "skyblue", color = "black") +
    labs(title = paste("Histogram of", var), x = var) +
    theme_minimal()
  # ggplot objects are only auto-printed at top level. Inside a for loop the
  # plot must be printed explicitly, otherwise nothing is drawn.
  print(hist_plot)
}
# Univariate analysis of categorical variables
# Bar plots
# The text columns of `data` are stored as character, not factor (see the
# str() output), so testing is.factor() alone selects nothing and no bar
# plot is ever produced. Accept both character and factor columns.
# Columns with many distinct values are skipped because a bar per value is
# unreadable; presumably this excludes identifier-like columns such as
# Customer.ID -- adjust `max_levels` if a wider categorical column is needed.
max_levels <- 20
cat_vars <- vapply(
  data,
  function(column) {
    (is.character(column) || is.factor(column)) &&
      length(unique(column)) <= max_levels
  },
  logical(1)
)
for (var in names(data)[cat_vars]) {
  # `.data[[var]]` maps a column given by name (replaces aes_string()).
  bar_plot <- ggplot(data, aes(x = .data[[var]])) +
    geom_bar(fill = "salmon", color = "black") +
    labs(title = paste("Bar Plot of", var), x = var) +
    theme_minimal()
  # Explicit print(): ggplot objects are not auto-printed inside a loop.
  print(bar_plot)
}
# Pairwise correlations between the numeric columns.
# Rows containing any NA are dropped before computing ("complete.obs").
cor_matrix <- data[, num_vars] |>
  cor(use = "complete.obs")
print(cor_matrix)
##                         Age Annual.Income Spending.Score
## Age             1.000000000   0.004776345   -0.008251286
## Annual.Income   0.004776345   1.000000000    0.003234119
## Spending.Score -0.008251286   0.003234119    1.000000000
# Plot correlation matrix
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
library(corrplot)
## corrplot 0.92 loaded
# Upper-triangle colour map of the correlation matrix, with the coefficients
# printed in black and the variable labels rotated 45 degrees.
corrplot(
  corr = cor_matrix,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45
)

# Scatter plot of annual income against age (semi-transparent blue points).
ggplot(data = data, mapping = aes(Age, Annual.Income)) +
  geom_point(alpha = 0.5, color = "blue") +
  theme_minimal() +
  labs(title = "Relationship between Age and Annual Income")

# Box plot of annual income for each gender, filled by gender.
ggplot(data = data, mapping = aes(Gender, Annual.Income, fill = Gender)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Annual Income by Gender")

library(clustertend)
## Package `clustertend` is deprecated.  Use package `hopkins` instead.
library(hopkins)
## Warning: package 'hopkins' was built under R version 4.3.3
## 
## Attaching package: 'hopkins'
## The following object is masked from 'package:clustertend':
## 
##     hopkins
# Hopkins statistic (clustering tendency) on the numeric columns.
# The call is namespaced because `hopkins` from package 'hopkins' masks the
# one from the deprecated 'clustertend' package (see the attach messages).
# NOTE(review): the statistic appears to be sampling-based, so the value may
# differ between runs unless set.seed() is called first -- confirm.
# NOTE(review): the columns are on very different scales (see summary());
# consider scale() before computing a distance-based statistic -- confirm.
numeric_features <- data[, num_vars]
hopkins::hopkins(numeric_features)
## [1] 0.5167823
# Write the data frame back to disk without the row-name column.
# NOTE(review): no cleaning or transformation of `data` is visible between
# the read and this write, so the "cleaned" file looks like a copy of the
# input -- confirm this is intended.
write.csv(
  x = data,
  file = "E:/Alliance University/Sem 3/data/Data CSV/shopping mall cleaned_dataset.csv",
  row.names = FALSE
)