Industry group: Motor Vehicle and Motor Vehicle Parts and Supplies Merchant Wholesalers (NAICS 4231)
## fix the RNG seed so the simulated data are reproducible
set.seed(1)
The first variable we simulate is the number of product categories available at each wholesaler. We put 3 patterns in the data, with 10, 15, and 5 simulated values respectively. The code is:
ctgr <- c(sample(1:50, 10, replace = TRUE),
          sample(100:200, 15, replace = TRUE),
          sample(250:300, 5, replace = TRUE))
ctgrscore <- (ctgr - mean(ctgr))/sd(ctgr)
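These z-scores can also be produced with R's built-in scale() function; as a quick sanity check (scale() returns a one-column matrix, hence the as.numeric()):
## built-in standardization; identical up to matrix/vector shape
all.equal(ctgrscore, as.numeric(scale(ctgr)))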
The second variable we simulate is the average number of products per category available at each wholesaler. We put 2 patterns in the data, with 15 simulated values in each. The code is:
nCtgr <- c(sample(1:10, 15, replace = TRUE),
           sample(20:30, 15, replace = TRUE))
nCtgrscore <- (nCtgr - mean(nCtgr))/sd(nCtgr)
The third variable we simulate is the number of brands usually used by customers. We put 2 patterns in the data: 20 simulated values in the first pattern and 10 in the second. The code is:
nBrnd <- c(sample(10:30, 20, replace = TRUE),
           sample(1:10, 10, replace = TRUE))
nBrndscore <- (nBrnd - mean(nBrnd))/sd(nBrnd)
The fourth variable we simulate is the number of models used by customers. We put 2 patterns in the data: 20 simulated values in the first pattern and 10 in the second. The code is:
nMdl <- c(sample(50:100, 20, replace = TRUE),
          sample(1:50, 10, replace = TRUE))
nMdlscore <- (nMdl - mean(nMdl))/sd(nMdl)
The fifth variable we simulate is the price deviation from the average for a product with an around-average number of sales. We put only 1 pattern in the data. The code is:
pPrdct <- sample(1:300, size = 30, replace = TRUE)
pPrdctscore <- (pPrdct - mean(pPrdct))/sd(pPrdct)
The sixth variable we simulate is the price deviation from the average for a category with an around-average number of sales. We put only 1 pattern in the data. The code is:
pCtgr <- sample(1:100, size = 30, replace = TRUE)
pCtgrscore <- (pCtgr - mean(pCtgr))/sd(pCtgr)
The seventh variable we simulate is the percentage of non-parts products sold. We put 2 patterns in the data: 20 simulated values in the first pattern and 10 in the second. The code is:
pNp <- c(sample(30:70, size = 20, replace = TRUE),
         sample(1:100, size = 10, replace = TRUE))
pNpscore <- (pNp - mean(pNp))/sd(pNp)
The eighth variable we simulate is the percentage of uncommon transportation products and parts. We put 3 patterns in the data, with 10 simulated values in each. The code is:
larget <- sample(c(rep(0, times = 5),
                   sample(10:30, size = 5, replace = TRUE)), size = 10)
mediant <- sample(c(rep(0, times = 6),
                    rep(100, times = 2), 20, 10), size = 10)
smallt <- sample(c(rep(0, times = 6), rep(100, times = 4)), size = 10)
pUt <- c(larget, mediant, smallt)
pUtscore <- (pUt - mean(pUt))/sd(pUt)
The ninth variable we simulate is the percentage of used products. We put 3 patterns in the data, with 10 simulated values in each. The code is:
largeu <- sample(c(rep(0, times = 8), 3, 5), size = 10)
medianu <- sample(c(rep(0, times = 7), 3, 5, 8), size = 10)
smallu <- sample(c(rep(0, times = 6),
                   sample(1:20, size = 4, replace = TRUE)), size = 10)
pUp <- c(largeu, medianu, smallu)
pUpscore <- (pUp - mean(pUp))/sd(pUp)
Now we combine the simulated variables into a single matrix so it can be used later in the clustering analysis.
simMV <- cbind(ctgrscore, nCtgrscore, nBrndscore, nMdlscore,
               pPrdctscore, pCtgrscore, pNpscore, pUtscore, pUpscore)
## note: hclust() is deterministic, so this seed has no effect on the tree
set.seed(as.integer(Sys.time()))
library(cluster)
hcMV <- hclust(dist(simMV))
plot(hcMV, main = "Hierarchical Clustering to find K (MV)",
     xlab = "Numbered Wholesalers")
abline(h = 4.6, col = "blue")
abline(h = 5.1, col = "red")
abline(h = 5.5, col = "green")
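Each horizontal cut corresponds to a candidate k, and cutree() returns the cluster labels implied by a cut; a minimal sketch using the three heights drawn above:
## cluster sizes implied by cutting the dendrogram at each height
table(cutree(hcMV, h = 5.5))  # green line
table(cutree(hcMV, h = 5.1))  # red line
table(cutree(hcMV, h = 4.6))  # blue line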
## k-means with k = 3
## repeat k-means clustering 100 times with different starting points
totWithinSq <- vector()
for (i in 1:100){
  set.seed(i)
  kmMV3 <- kmeans(simMV, 3)
  totWithinSq <- c(totWithinSq, kmMV3$tot.withinss)
}
## find the seed giving the smallest total within-cluster sum of squares
sed <- which.min(totWithinSq)
## redo the one with the smallest within-group sum of squares
set.seed(sed)
kmMV3 <- kmeans(simMV, 3)
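As an aside, kmeans() can run this restart search internally through its nstart argument; up to random-number state, the sketch below (kmMV3nstart is our own name) plays the same role as the manual loop:
## nstart = 100 tries 100 random initializations and keeps the best fit
set.seed(1)
kmMV3nstart <- kmeans(simMV, centers = 3, nstart = 100)
kmMV3nstart$tot.withinss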
## plot the result with the smallest within-group sum of squares
clusplot(simMV, kmMV3$cluster, color = TRUE, shade = TRUE, labels = 2,
         lines = 0, main = "Market division of Motor Vehicle while K = 3",
         xlab = "Number of product categories available",
         ylab = "Number of products in each category")
## cluster labels
kmMV3$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2
## coordinates of centers
kmMV3$centers
## ctgrscore nCtgrscore nBrndscore nMdlscore pPrdctscore pCtgrscore
## 1 -0.7217317 -0.9487876 0.4389097 0.6538996 -0.10778829 -0.2323299
## 2 0.8701004 0.8935854 -1.1928434 -1.2187335 -0.07775567 0.6471109
## 3 0.4249945 1.0591919 1.0689576 0.4757683 0.47887620 -0.5972322
## pNpscore pUtscore pUpscore
## 1 0.3242067 -0.3780789 -0.27590332
## 2 -0.6220715 0.3780789 0.38459250
## 3 0.2715231 0.3780789 0.05852495
## total squared distance from data points to their centers;
## this equals the within-cluster sum of squares because each center
## is the mean in every dimension
kmMV3$withinss
## [1] 56.57548 73.02529 21.88084
## total within-cluster sum of squares
kmMV3$tot.withinss
## [1] 151.4816
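To confirm the comment above, the first cluster's entry in withinss can be recomputed by hand; sweep() subtracts the center from every row:
## recompute within-cluster SS for cluster 1 directly from its points
pts1 <- simMV[kmMV3$cluster == 1, ]
sum(sweep(pts1, 2, kmMV3$centers[1, ])^2)  # should equal kmMV3$withinss[1]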
## k-means with k = 4
## repeat k-means clustering 100 times with different starting points
totWithinSq <- vector()
for (i in 1:100){
  set.seed(i)
  kmMV4 <- kmeans(simMV, 4)
  totWithinSq <- c(totWithinSq, kmMV4$tot.withinss)
}
## find the seed giving the smallest total within-cluster sum of squares
sed <- which.min(totWithinSq)
## redo the one with the smallest within-group sum of squares
set.seed(sed)
kmMV4 <- kmeans(simMV, 4)
## plot the result with the smallest within-group sum of squares
clusplot(simMV, kmMV4$cluster, color = TRUE, shade = TRUE, labels = 2,
         lines = 0, main = "Market division of Motor Vehicle while K = 4",
         xlab = "Number of product categories available",
         ylab = "Number of products in each category")
## cluster labels
kmMV4$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 3 3 4 4 4 4 4 3 4 3
## coordinates of centers
kmMV4$centers
## ctgrscore nCtgrscore nBrndscore nMdlscore pPrdctscore pCtgrscore
## 1 -0.7217317 -0.9487876 0.4389097 0.6538996 -0.1077883 -0.2323299
## 2 0.4249945 1.0591919 1.0689576 0.4757683 0.4788762 -0.5972322
## 3 0.9389722 1.0747176 -1.2406279 -0.9498455 0.8065608 1.0074191
## 4 0.8241858 0.7728306 -1.1609870 -1.3979922 -0.6673000 0.4069054
## pNpscore pUtscore pUpscore
## 1 0.3242067 -0.3780789 -0.27590332
## 2 0.2715231 0.3780789 0.05852495
## 3 -0.8165955 1.9039576 -0.49328169
## 4 -0.4923888 -0.6391737 0.96984197
## total squared distance from data points to their centers;
## this equals the within-cluster sum of squares because each center
## is the mean in every dimension
kmMV4$withinss
## [1] 56.57548 21.88084 12.33399 32.95276
## total within-cluster sum of squares
kmMV4$tot.withinss
## [1] 123.7431
## k-means with k = 5
## repeat k-means clustering 100 times with different starting points
totWithinSq <- vector()
for (i in 1:100){
  set.seed(i)
  kmMV5 <- kmeans(simMV, 5)
  totWithinSq <- c(totWithinSq, kmMV5$tot.withinss)
}
## find the seed giving the smallest total within-cluster sum of squares
sed <- which.min(totWithinSq)
## redo the one with the smallest within-group sum of squares
set.seed(sed)
kmMV5 <- kmeans(simMV, 5)
## plot the result with the smallest within-group sum of squares
clusplot(simMV, kmMV5$cluster, color = TRUE, shade = TRUE, labels = 2,
         lines = 0, main = "Market division of Motor Vehicle while K = 5",
         xlab = "Number of product categories available",
         ylab = "Number of products in each category")
## cluster labels
kmMV5$cluster
## [1] 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 1 4 1 4 2 2 5 5 5 5 5 2 5 2
## coordinates of centers
kmMV5$centers
## ctgrscore nCtgrscore nBrndscore nMdlscore pPrdctscore pCtgrscore
## 1 0.2815589 0.893585379 1.1751454 0.5806177 0.98860783 -0.5184763
## 2 0.9389722 1.074717551 -1.2406279 -0.9498455 0.80656082 1.0074191
## 3 -1.0669018 -0.969488385 0.4318305 0.6590242 -0.03186524 -0.6366101
## 4 0.3531141 -0.008378903 0.6897153 0.5395477 -0.12130590 0.2240793
## 5 0.8241858 0.772830598 -1.1609870 -1.3979922 -0.66730000 0.4069054
## pNpscore pUtscore pUpscore
## 1 -0.5126518 1.9039576 0.8862349
## 2 -0.8165955 1.9039576 -0.4932817
## 3 0.4406263 -0.3524934 -0.3108663
## 4 0.3427327 -0.5301823 -0.3141237
## 5 -0.4923888 -0.6391737 0.9698420
## total squared distance from data points to their centers;
## this equals the within-cluster sum of squares because each center
## is the mean in every dimension
kmMV5$withinss
## [1] 6.943531 12.333986 28.328260 31.108963 32.952755
## total within-cluster sum of squares
kmMV5$tot.withinss
## [1] 111.6675
Read in the data, adjust it into the required form, and normalize it.
cidata <- read.csv(file = "/Users/YanYang/code/Car_Insurance_Example/2014 Auto Insurance Average Premium Data.csv")
lap <- sapply(1:nrow(cidata), function(x) mean(as.numeric(cidata[x, 3:7])))
cap <- sapply(1:nrow(cidata), function(x) mean(as.numeric(cidata[x, 8:12])))
cpap <- sapply(1:nrow(cidata), function(x) mean(as.numeric(cidata[x, 13:17])))
simCI <- scale(cbind(lap, cap, cpap))
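As an aside, if the premium columns are already numeric, rowMeans() gives the same averages without an explicit loop; a sketch under that assumption:
## vectorized alternative to the sapply calls above
lap2 <- rowMeans(cidata[, 3:7])
all.equal(unname(lap2), lap)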
hcCI <- hclust(dist(simCI))
plot(hcCI, main = "Hierarchical Clustering to find K (CI)",
     xlab = "State Numbers")
abline(h = 3.5, col = "red")
abline(h = 2.5, col = "blue")
## k-means clustering when k = 4
## repeat k-means clustering 100 times with different starting points
totWithinSq <- vector()
for (i in 1:100){
  set.seed(i)
  kmCI4 <- kmeans(simCI, 4)
  totWithinSq <- c(totWithinSq, kmCI4$tot.withinss)
}
## find the seed giving the smallest total within-cluster sum of squares
sed <- which.min(totWithinSq)
## redo the one with the smallest within-group sum of squares
set.seed(sed)
kmCI4 <- kmeans(simCI, 4)
## plot the result with the smallest within-group sum of squares
clusplot(simCI, kmCI4$cluster, color = TRUE, shade = TRUE, labels = 2,
         lines = 0, main = "Market division of Car Insurance while K = 4",
         xlab = "Average Liability Premium", ylab = "Average Collision Premium")
## cluster labels
kmCI4$cluster
## [1] 1 4 1 1 2 1 4 4 4 4 1 2 2 2 2 3 3 2 4 2 4 4 4 3 1 1 3 3 4 2 4 1 4 2 3
## [36] 2 1 2 2 4 1 3 2 1 2 2 2 2 1 2 3 1
## coordinates of centers
kmCI4$centers
## lap cap cpap
## 1 -0.2162251 0.0671289 0.5416765
## 2 -0.4250923 -0.4167178 -0.9077268
## 3 -1.0233950 -1.0358827 1.3526659
## 4 1.4345960 1.1473312 -0.1172338
## total squared distance from data points to their centers;
## this equals the within-cluster sum of squares because each center
## is the mean in every dimension
kmCI4$withinss
## [1] 7.330818 13.474020 4.984122 25.873355
## total within-cluster sum of squares
kmCI4$tot.withinss
## [1] 51.66232
## k-means clustering when k = 7
## repeat k-means clustering 100 times with different starting points
totWithinSq <- vector()
for (i in 1:100){
  set.seed(i)
  kmCI7 <- kmeans(simCI, 7)
  totWithinSq <- c(totWithinSq, kmCI7$tot.withinss)
}
## find the seed giving the smallest total within-cluster sum of squares
sed <- which.min(totWithinSq)
## redo the one with the smallest within-group sum of squares
set.seed(sed)
kmCI7 <- kmeans(simCI, 7)
## plot the result with the smallest within-group sum of squares
clusplot(simCI, kmCI7$cluster, color = TRUE, shade = TRUE, labels = 2,
         lines = 0, main = "Market division of Car Insurance while K = 7",
         xlab = "Average Liability Premium", ylab = "Average Collision Premium")
## cluster labels
kmCI7$cluster
## [1] 2 4 1 2 4 1 4 6 3 6 2 5 5 5 5 7 7 5 3 5 4 4 4 1 2 1 7 7 4 5 6 1 6 5 7
## [36] 5 2 5 4 4 1 7 5 2 5 5 5 5 2 5 7 4
## coordinates of centers
kmCI7$centers
## lap cap cpap
## 1 -0.2043081 -0.6370889 0.4854048
## 2 -0.3011256 0.4464633 0.7098097
## 3 1.3140978 2.4377592 1.8389374
## 4 0.7083975 0.9803677 -0.4775267
## 5 -0.4727496 -0.5727570 -0.9136164
## 6 2.2576671 0.5235082 -0.6085935
## 7 -1.1207312 -0.9874039 1.4669332
## total squared distance from data points to their centers;
## this equals the within-cluster sum of squares because each center
## is the mean in every dimension
kmCI7$withinss
## [1] 1.5108276 2.8056016 0.8263802 7.0045401 8.8901085 3.8771411 3.5907569
## total within-cluster sum of squares
kmCI7$tot.withinss
## [1] 28.50536
The input data should be a data frame or numeric matrix; maxk is the largest k to evaluate.
elbow <- function(data, maxk){
  vc <- vector()
  for(j in 1:maxk){
    km <- kmeans(data, j)
    datak <- cbind(data, km$cluster)
    v <- vector()
    for(i in 1:j){
      count <- sum(datak[, ncol(datak)] == i)
      if(count == 1){
        ## a singleton cluster contributes nothing
        sdgi <- 0
      } else {
        ## sum of pairwise squared distances within cluster i,
        ## computed on all data columns
        dgi <- dist(datak[datak[, ncol(datak)] == i,
                          1:(ncol(datak) - 1)],
                    method = "euclidean",
                    diag = TRUE,
                    upper = TRUE)^2
        sdgi <- sum(as.numeric(dgi))/(2*count)
      }
      v <- c(v, sdgi)
    }
    vc <- c(vc, sum(v))
  }
  vc
}
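The quantity sdgi accumulated above relies on the identity that, for squared Euclidean distance, the sum over unordered pairs within a group equals n times that group's within-cluster sum of squares, so each cluster contributes exactly half its withinss. A quick numerical check of the identity on a few rows:
## identity: sum over pairs ||xi - xj||^2 == n * sum_i ||xi - xbar||^2
x <- simMV[1:10, ]
ss <- sum(sweep(x, 2, colMeans(x))^2)  # within-group sum of squares
all.equal(sum(dist(x)^2), nrow(x) * ss)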
Since the random starting points affect the Wk we obtain, we average the results over 100 different seeds to reduce the effect of randomness.
vv <- vector()
for(i in 1:100){
  set.seed(i)
  wk <- elbow(simMV, 20)
  vv <- rbind(vv, wk)
}
meanWkMV <- colMeans(vv)
Examine the figure below to find the elbow: the k after which Wk stops decreasing rapidly.
cc <- as.data.frame(cbind(1:20, meanWkMV))
library(ggplot2)
g <- ggplot(cc, aes(x = V1, y = meanWkMV))
g <- g + geom_point(size = 5)
g <- g + geom_line(size = 1)
g <- g + labs(x = "k", y = "mean of 100 Wk",
              title = "Example 1: Elbow Method to find k")
g
From the plot we know k should be 2.
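The same conclusion can be checked numerically from the successive decreases of the averaged Wk:
## size of the drop in mean Wk at each step; the large drops end at k = 2
round(-diff(meanWkMV), 2)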
Since the cluster package already provides a function to calculate the Gap statistic, we use its clusGap() directly.
library(cluster)
gap_clust <- clusGap(simMV, kmeans, K.max = 20, B = 30)
gap_clust
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = simMV, FUNcluster = kmeans, K.max = 20, B = 30)
## B=30 simulated reference sets, k = 1..20; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 2
## logW E.logW gap SE.sim
## [1,] 3.382692 3.519929 0.1372375 0.03233278
## [2,] 3.172895 3.360477 0.1875819 0.04082617
## [3,] 3.089048 3.248258 0.1592098 0.03538789
## [4,] 2.950934 3.154275 0.2033417 0.03680279
## [5,] 2.878570 3.069781 0.1912115 0.03568238
## [6,] 2.803485 2.986658 0.1831729 0.03518157
## [7,] 2.674556 2.910726 0.2361702 0.04006084
## [8,] 2.593718 2.834799 0.2410807 0.03378752
## [9,] 2.492509 2.760599 0.2680900 0.03932611
## [10,] 2.442075 2.688316 0.2462415 0.04048860
## [11,] 2.351627 2.611193 0.2595656 0.03915989
## [12,] 2.284554 2.533258 0.2487041 0.04585996
## [13,] 2.191623 2.456227 0.2646039 0.04196856
## [14,] 2.188607 2.374776 0.1861685 0.03743426
## [15,] 2.034108 2.295494 0.2613855 0.05008610
## [16,] 1.912181 2.205446 0.2932655 0.03806879
## [17,] 1.887987 2.106953 0.2189663 0.04487602
## [18,] 1.743643 2.007366 0.2637225 0.04176051
## [19,] 1.580676 1.901965 0.3212900 0.04092718
## [20,] 1.554539 1.798900 0.2443611 0.05964913
From the result we see the Gap statistic suggests the best k is 2.
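The cluster package also provides a plot method for clusGap objects, showing the gap curve with one-standard-error bars:
## gap curve with +/- 1 SE bars
plot(gap_clust, main = "Gap statistic (MV data)")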
## k-means with k = 2
## repeat k-means clustering 100 times with different starting points
totWithinSq <- vector()
for (i in 1:100){
  set.seed(i)
  kmMV2 <- kmeans(simMV, 2)
  totWithinSq <- c(totWithinSq, kmMV2$tot.withinss)
}
## find the seed giving the smallest total within-cluster sum of squares
sed <- which.min(totWithinSq)
## redo the one with the smallest within-group sum of squares
set.seed(sed)
kmMV2 <- kmeans(simMV, 2)
clusplot(simMV, kmMV2$cluster, color = TRUE, shade = TRUE, labels = 2,
         lines = 0, main = "Market division of Motor Vehicle while K = 2",
         xlab = "Number of product categories available",
         ylab = "Number of products in each category")
## Other relevant aspects of the result
## cluster labels
kmMV2$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## coordinates of centers
kmMV2$centers
## ctgrscore nCtgrscore nBrndscore nMdlscore pPrdctscore pCtgrscore
## 1 -0.4350502 -0.4467927 0.5964217 0.6093667 0.03887784 -0.3235554
## 2 0.8701004 0.8935854 -1.1928434 -1.2187335 -0.07775567 0.6471109
## pNpscore pUtscore pUpscore
## 1 0.3110358 -0.1890394 -0.1922963
## 2 -0.6220715 0.3780789 0.3845925
## total squared distance from data points to their centers;
## this equals the within-cluster sum of squares because each center
## is the mean in every dimension
kmMV2$withinss
## [1] 104.47897 73.02529
## total within-cluster sum of squares
kmMV2$tot.withinss
## [1] 177.5043
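With all four fits in hand, the best total within-cluster sums of squares can be set side by side (these are the values reported above); tot.withinss always decreases with k, so the question is where it flattens:
## compare the best tot.withinss across the candidate k values
c(k2 = kmMV2$tot.withinss, k3 = kmMV3$tot.withinss,
  k4 = kmMV4$tot.withinss, k5 = kmMV5$tot.withinss)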
vv <- vector()
for(i in 1:100){
  set.seed(i)
  wk <- elbow(simCI, 20)
  vv <- rbind(vv, wk)
}
meanWk <- colMeans(vv)
cc <- as.data.frame(cbind(1:20, meanWk))
library(ggplot2)
g <- ggplot(cc, aes(x = V1, y = meanWk))
g <- g + geom_point(size = 5)
g <- g + geom_line(size = 1)
g <- g + labs(x = "k", y = "mean of 100 Wk",
              title = "Example 2: Elbow Method to find k")
g
From the plot we know k should be 3.
library(cluster)
gap_clust <- clusGap(simCI, kmeans, K.max = 20, B = 52)
gap_clust
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = simCI, FUNcluster = kmeans, K.max = 20, B = 52)
## B=52 simulated reference sets, k = 1..20; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 3
## logW E.logW gap SE.sim
## [1,] 3.339359 3.615619 0.2762602 0.03735108
## [2,] 3.086954 3.363500 0.2765454 0.04025360
## [3,] 2.860584 3.194636 0.3340520 0.05045086
## [4,] 2.764690 3.054210 0.2895200 0.04720784
## [5,] 2.691983 2.949296 0.2573135 0.05029899
## [6,] 2.544050 2.855352 0.3113014 0.04983119
## [7,] 2.526056 2.768268 0.2422118 0.04872225
## [8,] 2.366667 2.688637 0.3219698 0.04943543
## [9,] 2.284573 2.618430 0.3338567 0.05184171
## [10,] 2.230376 2.540910 0.3105346 0.05384160
## [11,] 2.142184 2.489855 0.3476711 0.05281161
## [12,] 2.090391 2.413626 0.3232352 0.05258721
## [13,] 2.045172 2.357836 0.3126637 0.06204036
## [14,] 1.962233 2.292731 0.3304978 0.05859114
## [15,] 1.927258 2.228948 0.3016906 0.06271220
## [16,] 1.877180 2.163823 0.2866432 0.05876341
## [17,] 1.796337 2.109715 0.3133781 0.06329356
## [18,] 1.763847 2.050724 0.2868765 0.06414906
## [19,] 1.681757 1.999434 0.3176773 0.06064812
## [20,] 1.692030 1.938623 0.2465932 0.07002069
From the result we see the Gap statistic suggests the best k is 3.
## k-means clustering when k = 3
## repeat k-means clustering 100 times with different starting points
totWithinSq <- vector()
for (i in 1:100){
  set.seed(i)
  kmCI3 <- kmeans(simCI, 3)
  totWithinSq <- c(totWithinSq, kmCI3$tot.withinss)
}
## find the seed giving the smallest total within-cluster sum of squares
sed <- which.min(totWithinSq)
## redo the one with the smallest within-group sum of squares
set.seed(sed)
kmCI3 <- kmeans(simCI, 3)
clusplot(simCI, kmCI3$cluster, color = TRUE, shade = TRUE, labels = 2,
         lines = 0, main = "Market division of Car Insurance while K = 3",
         xlab = "Average Liability Premium", ylab = "Average Collision Premium")
## Other relevant aspects of the result
## cluster labels
kmCI3$cluster
## [1] 1 3 1 1 3 1 3 3 2 3 1 1 1 1 1 1 1 1 2 1 3 3 3 1 1 1 1 1 3 1 3 1 3 1 1
## [36] 1 1 1 1 3 1 1 1 2 1 1 1 1 2 1 1 1
## coordinates of centers
kmCI3$centers
## lap cap cpap
## 1 -0.5193289 -0.4909840 0.02171412
## 2 0.7130070 1.5820433 1.40981372
## 3 1.3203177 0.9456041 -0.53508027
## total squared distance from data points to their centers;
## this equals the within-cluster sum of squares because each center
## is the mean in every dimension
kmCI3$withinss
## [1] 58.344648 6.037484 15.133394
## total within-cluster sum of squares
kmCI3$tot.withinss
## [1] 79.51553
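Finally, to read the segmentation in terms of the original rows, the cluster labels can be attached back to the states; a sketch assuming the CSV's first column holds the state names (the actual column name may differ):
## list the states in each of the three clusters
## (cidata[[1]] is assumed to be the state-name column)
split(as.character(cidata[[1]]), kmCI3$cluster)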