# Load the shopping-mall customer data from a comma-separated file.
# The field separator is "," so the decimal mark must NOT also be ",":
# with sep == dec a decimal comma can never be parsed, and any value
# written with a decimal point would be read as character, not numeric.
mydata <- read.table(
  "./Shopping Mall Customer Segmentation Data2 .csv",
  header = TRUE,
  sep = ",",
  dec = "."
)

# Preview the first rows to confirm the columns were parsed as expected.
head(mydata)
## Customer.ID Age Gender Annual.Income Spending.Score
## 1 d410ea53-6661-42a9-ad3a-f554b05fd2a7 30 Male 151479 89
## 2 1770b26f-493f-46b6-837f-4237fb5a314e 58 Female 185088 95
## 3 e81aa8eb-1767-4b77-87ce-1620dc732c5e 62 Female 70912 76
## 4 9795712a-ad19-47bf-8886-4f997d6046e3 23 Male 55460 57
## 5 64139426-2226-4cd6-bf09-91bce4b4db5e 24 Male 153752 76
## 6 7e211337-e92f-4140-8231-5c9ac7a2aa12 42 Male 158335 40
General:
Variables:
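1. ID: unique customer identifier (UUID string).
2. Age: age of the customer.
3. Gender: gender of the customer (Male or Female).
4. Annual Income: annual income of the customer.
5. Spending Score: spending score assigned to the customer (1 to 100).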
Source of the data: kaggle.com (https://www.kaggle.com/datasets/zubairmustafa/shopping-mall-customer-segmentation-data)
# Categorical copy of Gender with a fixed level order (Male first).
# Labels default to the level names, so they are not repeated here.
# Any Gender value other than these two levels becomes NA.
gender_levels <- c("Male", "Female")
mydata[["GenderF"]] <- factor(mydata[["Gender"]], levels = gender_levels)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Rename the income column, then discard incomplete observations.
mydata <- rename(mydata, annual.income = Annual.Income)

# drop_na() with no column arguments removes every row that has an NA in
# any column (including the derived GenderF factor).
mydata <- drop_na(mydata)
# Customers whose annual income is strictly above 150000.
mydataF <- mydata[with(mydata, annual.income > 150000), ]
library(dplyr)
# Male customers only. The dplyr:: prefix makes explicit that this is the
# dplyr row filter and not stats::filter.
mydata2F <- dplyr::filter(mydata, Gender == "Male")
# Customers with a spending score in the closed interval [60, 80];
# between() is inclusive at both ends.
mydata3 <- mydata[between(mydata$Spending.Score, 60, 80), ]
# Descriptive statistics for the numeric variables only: drop columns
# 1, 3 and 6 by position (Customer.ID, Gender and GenderF at this point).
mydata %>%
  select(-c(1, 3, 6)) %>%
  summary()
## Age annual.income Spending.Score
## Min. :18.00 Min. : 22655 Min. : 1.00
## 1st Qu.:32.50 1st Qu.: 69202 1st Qu.: 27.00
## Median :48.00 Median :111526 Median : 45.00
## Mean :49.57 Mean :112493 Mean : 48.38
## 3rd Qu.:65.00 3rd Qu.:157317 3rd Qu.: 73.50
## Max. :90.00 Max. :199879 Max. :100.00
library(ggplot2)
# Distribution of customer age, using bins that are 4 units wide.
ggplot(data = mydata, mapping = aes(x = Age)) +
  geom_histogram(
    binwidth = 4,
    colour = "black",
    fill = "skyblue"
  )
# Scatter plot of spending score against annual income.
ggplot(
  data = mydata,
  mapping = aes(x = annual.income, y = Spending.Score)
) +
  geom_point(colour = "purple") +
  ggtitle("Annual Income vs Spending Score") +
  xlab("Annual Income") +
  ylab("Spending Score")
library(ggplot2)
# Number of customers per gender (geom_bar counts rows per level).
ggplot(data = mydata) +
  geom_bar(mapping = aes(x = GenderF))
- The female bar is taller than the male bar, showing that there are more female than male customers. There are slightly over 100 female customers and just under 100 male customers. Although there is a small difference, the dataset has a relatively balanced mix of male and female customers.
library(ggplot2)
# Horizontal box plots of spending score, one box per gender.
ggplot(data = mydata) +
  geom_boxplot(mapping = aes(x = Spending.Score, y = GenderF))