Name: Venetia Polyzou

# Load the customer data from a comma-separated file whose first row holds
# the column names.
# NOTE(review): `dec = ","` is the same character as `sep = ","`. Numbers
# written with a "." decimal point would then be read as text rather than
# numeric -- confirm this setting is intended for this file.
mydata <- read.table(
  file = "./Shopping Mall Customer Segmentation Data2 .csv",
  header = TRUE, sep = ",", dec = ","
)
# Preview the first six rows to check that the import worked.
head(mydata, n = 6L)
##                            Customer.ID Age Gender Annual.Income Spending.Score
## 1 d410ea53-6661-42a9-ad3a-f554b05fd2a7  30   Male        151479             89
## 2 1770b26f-493f-46b6-837f-4237fb5a314e  58 Female        185088             95
## 3 e81aa8eb-1767-4b77-87ce-1620dc732c5e  62 Female         70912             76
## 4 9795712a-ad19-47bf-8886-4f997d6046e3  23   Male         55460             57
## 5 64139426-2226-4cd6-bf09-91bce4b4db5e  24   Male        153752             76
## 6 7e211337-e92f-4140-8231-5c9ac7a2aa12  42   Male        158335             40

General:

Variables:

1.ID:

2.Age:

3.Gender:

4.Annual Income:

5.Spending Score:

Source of the data: kaggle.com (https://www.kaggle.com/datasets/zubairmustafa/shopping-mall-customer-segmentation-data)

# Store Gender as a factor with "Male" as the first level. Labels default to
# the level names, so they do not need to be repeated. Any Gender value other
# than "Male" or "Female" becomes NA in GenderF.
mydata[["GenderF"]] <- factor(mydata[["Gender"]], levels = c("Male", "Female"))
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
# Rename the income column, then keep only complete rows.
# drop_na() with no column arguments removes every row that has an NA in any
# column, including GenderF (which is NA for any Gender value other than
# "Male"/"Female").
mydata <- rename(mydata, annual.income = Annual.Income)
mydata <- drop_na(mydata)
# Customers whose annual income is strictly above 150000.
mydataF <- subset(mydata, annual.income > 150000)
library(dplyr)
# Male customers only.
mydata2F <- filter(mydata, Gender == "Male")
# Customers with a spending score between 60 and 80 (both bounds inclusive).
mydata3 <- subset(mydata, Spending.Score >= 60 & Spending.Score <= 80)
# Descriptive statistics for the three numeric variables.
# Columns are selected by name rather than by negative position, so the
# summary stays correct (or fails loudly) if the column order or the number
# of columns in the data frame ever changes.
summary(mydata[, c("Age", "annual.income", "Spending.Score")])
##       Age        annual.income    Spending.Score  
##  Min.   :18.00   Min.   : 22655   Min.   :  1.00  
##  1st Qu.:32.50   1st Qu.: 69202   1st Qu.: 27.00  
##  Median :48.00   Median :111526   Median : 45.00  
##  Mean   :49.57   Mean   :112493   Mean   : 48.38  
##  3rd Qu.:65.00   3rd Qu.:157317   3rd Qu.: 73.50  
##  Max.   :90.00   Max.   :199879   Max.   :100.00
library(ggplot2)
# Histogram of customer age using 4-year-wide bins.
ggplot(data = mydata) +
  geom_histogram(
    mapping = aes(x = Age),
    binwidth = 4,
    fill = "skyblue",
    colour = "black"
  )

# Scatter plot of spending score against annual income.
ggplot(data = mydata, mapping = aes(x = annual.income, y = Spending.Score)) +
  geom_point(colour = "purple") +
  ggtitle("Annual Income vs Spending Score") +
  xlab("Annual Income") +
  ylab("Spending Score")

library(ggplot2)
# Bar chart of the number of customers in each gender category.
ggplot(data = mydata) +
  geom_bar(mapping = aes(x = GenderF))

- The female bar is taller than the male bar, showing that there are more female than male customers. There are slightly over 100 female customers and just under 100 male customers. Although there is a small difference, the dataset has a relatively balanced mix of male and female customers.

library(ggplot2)
# Box plots of spending score, one per gender. The continuous variable is
# mapped to x and the factor to y.
ggplot(
  data = mydata,
  mapping = aes(x = Spending.Score, y = GenderF)
) +
  geom_boxplot()