library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.3.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load data ----
# Read the shopping-mall customer segmentation CSV from its local path.
data <- read.csv(
  file = "E:/Alliance University/Sem 3/data/Data CSV/Shopping Mall Customer Segmentation Data .csv"
)

# First rows of the table (6 is the default for head()).
head(data, n = 6L)
## Customer.ID Age Gender Annual.Income Spending.Score
## 1 d410ea53-6661-42a9-ad3a-f554b05fd2a7 30 Male 151479 89
## 2 1770b26f-493f-46b6-837f-4237fb5a314e 58 Female 185088 95
## 3 e81aa8eb-1767-4b77-87ce-1620dc732c5e 62 Female 70912 76
## 4 9795712a-ad19-47bf-8886-4f997d6046e3 23 Male 55460 57
## 5 64139426-2226-4cd6-bf09-91bce4b4db5e 24 Male 153752 76
## 6 7e211337-e92f-4140-8231-5c9ac7a2aa12 42 Male 158335 40

# Column types and a compact preview of each column.
str(object = data)
## 'data.frame': 15079 obs. of 5 variables:
## $ Customer.ID : chr "d410ea53-6661-42a9-ad3a-f554b05fd2a7" "1770b26f-493f-46b6-837f-4237fb5a314e" "e81aa8eb-1767-4b77-87ce-1620dc732c5e" "9795712a-ad19-47bf-8886-4f997d6046e3" ...
## $ Age : int 30 58 62 23 24 42 27 24 83 62 ...
## $ Gender : chr "Male" "Female" "Female" "Male" ...
## $ Annual.Income : int 151479 185088 70912 55460 153752 158335 163501 70476 47743 63448 ...
## $ Spending.Score: int 89 95 76 57 76 40 37 17 34 3 ...

# Per-column summary statistics.
summary(object = data)
## Customer.ID Age Gender Annual.Income
## Length:15079 Min. :18.00 Length:15079 Min. : 20022
## Class :character 1st Qu.:36.00 Class :character 1st Qu.: 64141
## Mode :character Median :54.00 Mode :character Median :109190
## Mean :54.19 Mean :109743
## 3rd Qu.:72.00 3rd Qu.:155008
## Max. :90.00 Max. :199974
## Spending.Score
## Min. : 1.00
## 1st Qu.: 26.00
## Median : 51.00
## Mean : 50.59
## 3rd Qu.: 75.00
## Max. :100.00

# Missing-value check: count of NA entries per column, then the
# DataExplorer missing-data plot.
colSums(x = is.na(data))
## Customer.ID Age Gender Annual.Income Spending.Score
## 0 0 0 0 0
plot_missing(data)

# Histograms ----
# Univariate analysis of numerical variables: one 30-bin histogram per
# numeric column of `data`.
#
# `num_vars` is a named logical vector flagging the numeric columns; it is
# reused later in the script (correlation matrix, Hopkins statistic).
# vapply() keeps the result a logical vector regardless of the input shape,
# which sapply() does not guarantee.
num_vars <- vapply(data, is.numeric, logical(1))
for (var in names(data)[num_vars]) {
  # `.data[[var]]` is the supported way to map a column given by name;
  # aes_string() is deprecated. The x label is set explicitly so the axis
  # shows the column name rather than the `.data[[var]]` expression.
  hist_plot <- ggplot(data, aes(x = .data[[var]])) +
    geom_histogram(bins = 30, fill = "skyblue", color = "black") +
    labs(title = paste("Histogram of", var), x = var) +
    theme_minimal()
  # A ggplot object is only drawn when it is printed, and auto-printing does
  # not happen inside a for loop, so print explicitly.
  print(hist_plot)
}
# Univariate analysis of categorical variables
# Bar plots
#
# read.csv() leaves text columns as character here (see the str() output:
# Customer.ID and Gender are `chr`), so testing is.factor() alone selects
# nothing and no bar plot would ever be produced. Treat both character and
# factor columns as categorical, but skip columns with many distinct values
# (e.g. an identifier such as Customer.ID), where one bar per value is
# unreadable.
max_levels <- 20L
cat_vars <- vapply(
  data,
  function(column) {
    (is.character(column) || is.factor(column)) &&
      length(unique(column)) <= max_levels
  },
  logical(1)
)
for (var in names(data)[cat_vars]) {
  # `.data[[var]]` replaces the deprecated aes_string(); the x label is set
  # explicitly so the axis shows the column name.
  bar_plot <- ggplot(data, aes(x = .data[[var]])) +
    geom_bar(fill = "salmon", color = "black") +
    labs(title = paste("Bar Plot of", var), x = var) +
    theme_minimal()
  # ggplot objects are not auto-printed inside a for loop.
  print(bar_plot)
}
# Correlation matrix ----
# Pairwise correlations between the numeric columns, computed on rows with
# no missing values, then shown on the console.
cor_matrix <- cor(
  x = data[, num_vars],
  use = "complete.obs"
)
print(cor_matrix)
## Age Annual.Income Spending.Score
## Age 1.000000000 0.004776345 -0.008251286
## Annual.Income 0.004776345 1.000000000 0.003234119
## Spending.Score -0.008251286 0.003234119 1.000000000
# Plot correlation matrix
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
library(corrplot)
## corrplot 0.92 loaded
# Draw the upper triangle of the correlation matrix as coloured tiles, with
# black labels rotated 45 degrees and the coefficients printed in black.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,
  addCoef.col = "black"
)

# Bivariate plots ----
# Scatter plot of annual income against age, with semi-transparent points.
ggplot(data = data, mapping = aes(Age, Annual.Income)) +
  geom_point(colour = "blue", alpha = 0.5) +
  ggtitle("Relationship between Age and Annual Income") +
  theme_minimal()

# Box plot of annual income for each gender, boxes filled by gender.
ggplot(data = data, mapping = aes(Gender, Annual.Income, fill = Gender)) +
  geom_boxplot() +
  ggtitle("Annual Income by Gender") +
  theme_minimal()

library(clustertend)
## Package `clustertend` is deprecated. Use package `hopkins` instead.
library(hopkins)
## Warning: package 'hopkins' was built under R version 4.3.3
##
## Attaching package: 'hopkins'
## The following object is masked from 'package:clustertend':
##
## hopkins
# Hopkins statistic (cluster tendency) on the numeric columns.
#
# The statistic is computed from random samples, so an unseeded run gives a
# different value each time (the value printed below came from an unseeded
# run). Fix the seed so the result is reproducible.
set.seed(123)
# Both clustertend and hopkins export hopkins(); call the non-deprecated
# package explicitly instead of relying on attach order.
# Per the hopkins package documentation, values around 0.5 indicate randomly
# distributed data and values near 1 indicate clustered data.
# NOTE(review): Age and Annual.Income are on very different scales (see the
# summary() output) -- confirm whether the columns should be standardised
# before computing the statistic.
hopkins::hopkins(data[, num_vars])
## [1] 0.5167823
# Export ----
# Write the data frame to CSV without a row-name column.
# NOTE(review): the file name says "cleaned", but no visible step in this
# script modifies `data` -- confirm that is intended.
write.csv(
  x = data,
  file = "E:/Alliance University/Sem 3/data/Data CSV/shopping mall cleaned_dataset.csv",
  row.names = FALSE
)