import<-read.csv("/Users/sofia/Downloads/Mall_Customers.csv", header=F)
mall<-import[2:201, 1:5]
colnames(mall)=c("id","gender","age","income","score")
Tried using all 200 observations but the dendograms were difficult to read with that many groups. Decided to randomly sort the data and pick the first 50 rows for the analysis
# mall2 <- mall[sample(nrow(mall)),]
# output <- mall2[1:50,1:5]
# write.csv(output, file="//Users/sofia/Desktop/mall_sample.csv")
mall<-read.csv("/Users/sofia/Desktop/mall_sample_data.csv", header=T)
str(mall)
## 'data.frame': 50 obs. of 4 variables:
## $ gender: Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 1 1 1 2 ...
## $ age : int 59 27 34 60 45 40 35 34 21 67 ...
## $ income: int 93 88 58 50 54 87 23 78 62 54 ...
## $ score : int 14 69 60 49 53 13 98 22 42 41 ...
mall$gender <- factor(mall$gender, levels=c("Male","Female"), labels=c(0,1))
mall$gender = as.numeric(gsub("\\$", "", mall$gender))
mall$age = as.numeric(gsub("\\$", "", mall$age))
mall$income = as.numeric(gsub("\\$", "", mall$income))
mall$score = as.numeric(gsub("\\$", "", mall$score))
str(mall)
## 'data.frame': 50 obs. of 4 variables:
## $ gender: num 0 0 1 1 1 0 1 1 1 0 ...
## $ age : num 59 27 34 60 45 40 35 34 21 67 ...
## $ income: num 93 88 58 50 54 87 23 78 62 54 ...
## $ score : num 14 69 60 49 53 13 98 22 42 41 ...
library(robustHD)
## Loading required package: ggplot2
## Loading required package: perry
## Loading required package: parallel
## Loading required package: robustbase
std = standardize(mall[2:4])
# Looked at quantiles to create labels for score, income, age (lower, middle, upper)
quantile(mall$score)
## 0% 25% 50% 75% 100%
## 4.00 25.25 49.00 71.25 98.00
quantile(mall$income)
## 0% 25% 50% 75% 100%
## 18.0 51.0 64.5 85.0 137.0
quantile(mall$age)
## 0% 25% 50% 75% 100%
## 19.00 30.00 37.00 48.75 67.00
# labels with ID+Gender, Age Category, Salary Category, Spending Score Category
label<-c(
"179M, Old, High, $", "178M, Young, High, $$", "89F, Mid, Avg, $$", "73F, Old, Low, $$", "77F, Mid, Avg, $$",
"171M, Mid, High, $", "20F, Mid, Low, $$$", "149F, Mid, Avg, $", "106F, Young, Avg, $$", "83M, Old, Avg, $$",
"13F, Old, Low, $", "74F, Old, Low, $$", "194F, Mid, High, $$$", "46F, Young, Low, $$", "173M, Mid, High, $",
"88F, Young, Avg, $$", "131M, Mid, Avg, $", "72F, Mid, Low, $$", "162F, Young, Avg, $$$", "199M, Mid, High, $",
"104M, Young, Avg, $$","188M, Young, High, $$","93M, Mid, Avg, $$", "123F, Mid, Avg, $$","190F, Mid, High, $$$",
"87F, Old, Avg, $$", "55F, Old, Low, $$", "94F, Mid, Avg, $$", "124M, Mid, Avg, $$$", "141F, Old, Avg, $",
"170M, Mid, Avg, $$", "139M, Young, Avg, $", "90F, Old, Avg, $$", "80F, Mid, Avg, $$", "26M, Young, Low, $$$",
"119F, Old, Avg, $$", "198M, Mid, High, $$$", "10F, Mid, Low, $$", "8F, Young, Low, $$$", "195F, Mid, High, $",
"176F, Mid, High, $$$", "155F, Mid, Avg, $", "147M, Mid, Avg, $$", "142M, Mid, Avg, $$$", "35F, Mid, Low, $",
"148F, Mid, Avg, $$$", "31M, Old, Low, $", "21M, Mid, Low, $$", "101F, Young, Avg, $$", "172M, Young, Avg, $$$")
rownames(std)=label
std$gender<-mall$gender
Single Euclidean
dist.Es <- dist(std, method="euclidean")
clust.Es <- hclust(dist.Es, method="single")
plot(clust.Es, cex=.4)
rect.hclust(clust.Es, k=3)
Not great. It’s not useful to have 96% of the cases in one cluster.
Complete Euclidean
dist.Ec <- dist(std, method="euclidean")
clust.Ec <- hclust(dist.Ec, method="complete")
plot(clust.Ec, cex=.4)
rect.hclust(clust.Ec, k=3)
Average Euclidean
dist.Ea <- dist(std, method="euclidean")
clust.Ea <- hclust(dist.Ea, method="ave")
plot(clust.Ea, cex=.4)
rect.hclust(clust.Ea, k=3)
Single Canberra
dist.Cs <- dist(std, method="canberra")
clust.Cs <- hclust(dist.Cs, method="single")
plot(clust.Cs, cex=.4)
rect.hclust(clust.Cs, k=3)
Again, single linkage isn’t robust in finding meaningful clusters for my dataset.
Complete Canberra
dist.Cc <- dist(std, method="canberra")
clust.Cc <- hclust(dist.Cc, method="complete")
plot(clust.Cc, cex=.4)
rect.hclust(clust.Cc, k=3)
This looks pretty good, the clusters are broken out in a way where you can identify potential customers (Left), unlikely customers (Middle), and likely customers (Right). Gender seems to play a big role. Middle-aged men with high salaries and low shopping scores were clustered together (unlikely customers) but females of the same age, salary, and shopping score were grouped with the potential customers cluster on the left.
Average Canberra
dist.Ca <- dist(std, method="canberra")
clust.Ca <- hclust(dist.Ca, method="ave")
plot(clust.Ca, cex=.4)
rect.hclust(clust.Ca, k=3)
Middle cluster size decreased but overall pretty similar to the complete canberra. Left to right: likely customers, unlikely customers, potential customers.
Single Manhattan
dist.Ms <- dist(std, method="manhattan")
clust.Ms <- hclust(dist.Ms, method="single")
plot(clust.Ms, cex=.4)
rect.hclust(clust.Ms, k=3)
Complete Manhattan
dist.Mc <- dist(std, method="manhattan")
clust.Mc <- hclust(dist.Mc, method="complete")
plot(clust.Mc, cex=.4)
rect.hclust(clust.Mc, k=3)
Average Manhattan
dist.Ma <- dist(std, method="manhattan")
clust.Ma <- hclust(dist.Ma, method="ave")
plot(clust.Ma, cex=.4)
rect.hclust(clust.Ma, k=3)
Seems like a little bit of every variable influced these clusters. They are pretty informative and align with what I was expecting to see. Potential customers: High shopping index and/or high salary, all young or middle aged. Unlikely customers: Low shopping index (interestingly, the unlikely customers group doesn’t include any subjects with low salaries), even distribution of males/females, all age ranges present. Likely customers: Everyone else, shopping index in the middle with low and average salaries, mostly females, all age ranges present.
Subject 199 and 21 seemed to throw off any single-linkage distance calcluations, as well as the average Euclidean.