# load libraries
library(mlba)
library(factoextra)
library(cluster)
library(dplyr)
library(ggplot2)
library(skimr)
library(tidyverse)
library(kableExtra)
library(caret)
set.seed(123)
Pharmaceutical Industry. An equities analyst is studying the pharmaceutical industry and would like your help in exploring and understanding the financial data collected by her firm. Her main objective is to understand the structure of the pharmaceutical industry using some basic financial measures.
Financial data gathered on 21 firms in the pharmaceutical industry are available in the file Pharmaceuticals.csv. For each firm, the following variables are recorded:
Use only the numerical variables to cluster the 21 firms. Justify the various choices made in conducting the cluster analysis, such as:
# load data
data(Pharmaceuticals) # need to load library mlba
skim(Pharmaceuticals) # need to load library skimr
Name | Pharmaceuticals |
Number of rows | 21 |
Number of columns | 14 |
_______________________ | |
Column type frequency: | |
character | 5 |
numeric | 9 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
Symbol | 0 | 1 | 3 | 4 | 0 | 21 | 0 |
Name | 0 | 1 | 5 | 34 | 0 | 21 | 0 |
Median_Recommendation | 0 | 1 | 4 | 13 | 0 | 4 | 0 |
Location | 0 | 1 | 2 | 11 | 0 | 7 | 0 |
Exchange | 0 | 1 | 4 | 6 | 0 | 3 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Market_Cap | 0 | 1 | 57.65 | 58.60 | 0.41 | 6.30 | 48.19 | 73.84 | 199.47 | ▇▆▁▂▂ |
Beta | 0 | 1 | 0.53 | 0.26 | 0.18 | 0.35 | 0.46 | 0.65 | 1.11 | ▆▇▃▂▂ |
PE_Ratio | 0 | 1 | 25.46 | 16.31 | 3.60 | 18.90 | 21.50 | 27.90 | 82.50 | ▅▇▁▁▁ |
ROE | 0 | 1 | 25.80 | 15.08 | 3.90 | 14.90 | 22.60 | 31.00 | 62.90 | ▇▇▃▂▂ |
ROA | 0 | 1 | 10.51 | 5.32 | 1.40 | 5.70 | 11.20 | 15.00 | 20.30 | ▃▇▂▇▂ |
Asset_Turnover | 0 | 1 | 0.70 | 0.22 | 0.30 | 0.60 | 0.60 | 0.90 | 1.10 | ▂▇▁▆▂ |
Leverage | 0 | 1 | 0.59 | 0.78 | 0.00 | 0.16 | 0.34 | 0.60 | 3.51 | ▇▂▁▁▁ |
Rev_Growth | 0 | 1 | 13.37 | 11.05 | -3.17 | 6.38 | 9.37 | 21.87 | 34.21 | ▅▇▅▂▅ |
Net_Profit_Margin | 0 | 1 | 15.70 | 6.56 | 2.60 | 11.20 | 16.10 | 21.10 | 25.50 | ▂▅▇▅▇ |
# head of data
head(Pharmaceuticals)
## Symbol Name Market_Cap Beta PE_Ratio ROE ROA Asset_Turnover
## 1 ABT Abbott Laboratories 68.44 0.32 24.7 26.4 11.8 0.7
## 2 AGN Allergan, Inc. 7.58 0.41 82.5 12.9 5.5 0.9
## 3 AHM Amersham plc 6.30 0.46 20.7 14.9 7.8 0.9
## 4 AZN AstraZeneca PLC 67.63 0.52 21.5 27.4 15.4 0.9
## 5 AVE Aventis 47.16 0.32 20.1 21.8 7.5 0.6
## 6 BAY Bayer AG 16.90 1.11 27.9 3.9 1.4 0.6
## Leverage Rev_Growth Net_Profit_Margin Median_Recommendation Location Exchange
## 1 0.42 7.54 16.1 Moderate Buy US NYSE
## 2 0.60 9.16 5.5 Moderate Buy CANADA NYSE
## 3 0.27 7.05 11.2 Strong Buy UK NYSE
## 4 0.00 15.00 18.0 Moderate Sell UK NYSE
## 5 0.34 26.81 12.9 Moderate Buy FRANCE NYSE
## 6 0.00 -3.17 2.6 Hold GERMANY NYSE
# check missing values
sum(is.na(Pharmaceuticals))
## [1] 0
# check if numeric variables, then normalize data
Pharmaceuticals.scaled <- Pharmaceuticals %>%
select_if(is.numeric) %>%
scale()
We either choose K-means or Hierarchical clustering for this analysis.
# K-means clustering using for loop and kmeans function
# set.seed(123)
wss <- numeric(10)
for (i in 1:10) {
kmeansPharmaceuticals <- kmeans(Pharmaceuticals.scaled, centers = i, nstart = 25)
wss[i] <- kmeansPharmaceuticals$tot.withinss
}
# Plot the elbow method using ggplot2
data.frame(k = 1:10, wss = wss) %>%
ggplot(aes(x = k, y = wss)) +
geom_line() +
geom_point() +
geom_vline(xintercept = 3, linetype = 2) +
labs(
title = "Elbow method",
subtitle = "K-means clustering",
x = "Number of clusters (k)",
y = "Total within-cluster sum of squares"
) +
theme_minimal()
# K-means clustering, using k = 4
k4 <- kmeans(Pharmaceuticals.scaled, centers = 4, nstart = 25)
k4
## K-means clustering with 4 clusters of sizes 8, 3, 4, 6
##
## Cluster means:
## Market_Cap Beta PE_Ratio ROE ROA Asset_Turnover
## 1 -0.03142211 -0.4360989 -0.3172485 0.1950459 0.4083915 1.729746e-01
## 2 -0.52462814 0.4451409 1.8498439 -1.0404550 -1.1865838 -3.330669e-16
## 3 1.69558112 -0.1780563 -0.1984582 1.2349879 1.3503431 1.153164e+00
## 4 -0.82617719 0.4775991 -0.3696184 -0.5631589 -0.8514589 -9.994088e-01
## Leverage Rev_Growth Net_Profit_Margin
## 1 -0.2744931 -0.7041516 0.5569544
## 2 -0.3443544 -0.5769454 -1.6095439
## 3 -0.4680782 0.4671788 0.5912425
## 4 0.8502201 0.9158889 -0.3319956
##
## Clustering vector:
## [1] 1 2 1 1 4 2 1 4 4 1 3 4 3 4 3 1 3 2 1 4 1
##
## Within cluster sum of squares by cluster:
## [1] 21.879320 14.938904 9.284424 32.143356
## (between_SS / total_SS = 56.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Hierarchical clustering, using dendrogram to find the best k
d <- dist(Pharmaceuticals.scaled, method = "euclidean")
hc <- hclust(d, method = "ward.D2")
plot(hc, main = "Dendrogram", hang = -1)
# Add a horizontal line at height 5.8
abline(h = 5.8, col = "red", lty = 2)
# cut the dendrogram at 4 clusters
cutree(hc, h = 5.8)
## [1] 1 2 3 1 3 2 1 3 3 1 4 3 4 3 4 1 4 2 1 3 1
# highlight the clusters
rect.hclust(hc, k = 4, border = 2:5)
Interpret the clusters with respect to the numerical variables used in forming the clusters.
# centroids of clusters
k4$centers
## Market_Cap Beta PE_Ratio ROE ROA Asset_Turnover
## 1 -0.03142211 -0.4360989 -0.3172485 0.1950459 0.4083915 1.729746e-01
## 2 -0.52462814 0.4451409 1.8498439 -1.0404550 -1.1865838 -3.330669e-16
## 3 1.69558112 -0.1780563 -0.1984582 1.2349879 1.3503431 1.153164e+00
## 4 -0.82617719 0.4775991 -0.3696184 -0.5631589 -0.8514589 -9.994088e-01
## Leverage Rev_Growth Net_Profit_Margin
## 1 -0.2744931 -0.7041516 0.5569544
## 2 -0.3443544 -0.5769454 -1.6095439
## 3 -0.4680782 0.4671788 0.5912425
## 4 0.8502201 0.9158889 -0.3319956
Is there any other ways to present the cluster information?
Is there a pattern in the clusters with respect to the non-numerical variables (10 to 12)? (those not used in forming the clusters)
# add cluster variable to Pharmaceuticals data
Pharmaceuticals$cluster <- k4$cluster
# table of cluster variable by location, stock exchange, and median recommendation
table(Pharmaceuticals$cluster, Pharmaceuticals$Location)
##
## CANADA FRANCE GERMANY IRELAND SWITZERLAND UK US
## 1 0 0 0 0 1 2 5
## 2 1 0 1 0 0 0 1
## 3 0 0 0 0 0 1 3
## 4 0 1 0 1 0 0 4
table(Pharmaceuticals$cluster, Pharmaceuticals$Exchange)
##
## AMEX NASDAQ NYSE
## 1 0 0 8
## 2 0 0 3
## 3 0 0 4
## 4 1 1 4
table(Pharmaceuticals$cluster, Pharmaceuticals$Median_Recommendation)
##
## Hold Moderate Buy Moderate Sell Strong Buy
## 1 4 1 2 1
## 2 2 1 0 0
## 3 2 2 0 0
## 4 1 3 2 0
Any pattern in the clusters with respect to the non-numerical variables (10 to 12)?
Provide an appropriate name for each cluster using any or all of the variables in the dataset.
Marketing to Frequent Fliers. The file EastWestAirlinesCluster.csv contains information on 3999 passengers who belong to an airline’s frequent flier program. For each passenger, the data include information on their mileage history and on different ways they accrued or spent miles in the last year. The goal is to try to identify clusters of passengers that have similar characteristics for the purpose of targeting different segments for different types of mileage offers.
Apply hierarchical clustering with Euclidean distance and Ward’s method. Make sure to normalize the data first. How many clusters appear?
# read data from mlba package
data(EastWestAirlinesCluster, package = "mlba")
airlines.df <- EastWestAirlinesCluster
# check data
head(airlines.df)
# normalize data, keep ID.
airlines.norm <- sapply(airlines.df[, -1], scale)
# check data
head(airlines.norm)
# apply hierarchical clustering with Euclidean distance and Ward’s method
distance <- dist(airlines.norm, method = "euclidean")
airlines.ward <- hclust(distance, method = "ward.D2")
# check data
summary(airlines.ward)
## Length Class Mode
## merge 7996 -none- numeric
## height 3998 -none- numeric
## order 3999 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
# plot dendrogram
plot(airlines.ward, main = "Dendrogram", ann = FALSE, hang = -1)
The dendrogram shows ?? clusters.
What would happen if the data were not normalized?
# check the summary of the unnormailzed data
skim(airlines.df)
Name | airlines.df |
Number of rows | 3999 |
Number of columns | 12 |
_______________________ | |
Column type frequency: | |
numeric | 12 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
ID. | 0 | 1 | 2014.82 | 1160.76 | 1 | 1010.5 | 2016 | 3020.5 | 4021 | ▇▇▇▇▇ |
Balance | 0 | 1 | 73601.33 | 100775.66 | 0 | 18527.5 | 43097 | 92404.0 | 1704838 | ▇▁▁▁▁ |
Qual_miles | 0 | 1 | 144.11 | 773.66 | 0 | 0.0 | 0 | 0.0 | 11148 | ▇▁▁▁▁ |
cc1_miles | 0 | 1 | 2.06 | 1.38 | 1 | 1.0 | 1 | 3.0 | 5 | ▇▁▂▂▁ |
cc2_miles | 0 | 1 | 1.01 | 0.15 | 1 | 1.0 | 1 | 1.0 | 3 | ▇▁▁▁▁ |
cc3_miles | 0 | 1 | 1.01 | 0.20 | 1 | 1.0 | 1 | 1.0 | 5 | ▇▁▁▁▁ |
Bonus_miles | 0 | 1 | 17144.85 | 24150.97 | 0 | 1250.0 | 7171 | 23800.5 | 263685 | ▇▁▁▁▁ |
Bonus_trans | 0 | 1 | 11.60 | 9.60 | 0 | 3.0 | 12 | 17.0 | 86 | ▇▂▁▁▁ |
Flight_miles_12mo | 0 | 1 | 460.06 | 1400.21 | 0 | 0.0 | 0 | 311.0 | 30817 | ▇▁▁▁▁ |
Flight_trans_12 | 0 | 1 | 1.37 | 3.79 | 0 | 0.0 | 0 | 1.0 | 53 | ▇▁▁▁▁ |
Days_since_enroll | 0 | 1 | 4118.56 | 2065.13 | 2 | 2330.0 | 4096 | 5790.5 | 8296 | ▅▇▇▇▅ |
Award. | 0 | 1 | 0.37 | 0.48 | 0 | 0.0 | 0 | 1.0 | 1 | ▇▁▁▁▅ |
Compare the cluster centroid to characterize the different clusters, and try to give each cluster a label.
# cut the dendrogram to get 2 clusters
airlines.cut1 <- cutree(airlines.ward, k = 2)
# calculate cluster centroids
centers2 <- aggregate(airlines.norm, by = list(airlines.cut1), FUN = mean)
centers2
## Group.1 Balance Qual_miles cc1_miles cc2_miles cc3_miles Bonus_miles
## 1 1 -0.2667555 -0.1742877 -0.5935343 0.05959289 -0.06275873 -0.5013072
## 2 2 0.4397049 0.2872862 0.9783490 -0.09822960 0.10344800 0.8263269
## Bonus_trans Flight_miles_12mo Flight_trans_12 Days_since_enroll Award.
## 1 -0.4696781 -0.1706092 -0.1775063 -0.1674339 -0.3774981
## 2 0.7741912 0.2812228 0.2925916 0.2759887 0.6222468
To check the stability of the clusters,
Does the same picture emerge?
# remove 5% of the data
airlines.df2 <- airlines.df[sample(1:nrow(airlines.df), 0.95 * nrow(airlines.df)), ]
# normalize data, keep ID.
airlines.norm2 <- sapply(airlines.df2[, -1], scale)
# apply hierarchical clustering with Euclidean distance and Ward’s method
distance2 <- dist(airlines.norm2, method = "euclidean")
airlines.ward2 <- hclust(distance2, method = "ward.D2")
# check clusters
summary(airlines.ward2)
## Length Class Mode
## merge 7596 -none- numeric
## height 3798 -none- numeric
## order 3799 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
# plot dendrogram
plot(airlines.ward2, main = "Dendrogram", ann = FALSE, hang = -1)
Comment: are they similar or different?
# cut the dendrogram to get 2 clusters
airlines.cut2 <- cutree(airlines.ward2, k = 2)
# calculate cluster centroids
centers3 <- aggregate(airlines.norm2, by = list(airlines.cut2), FUN = mean)
centers3
## Group.1 Balance Qual_miles cc1_miles cc2_miles cc3_miles Bonus_miles
## 1 1 -0.1822690 -0.09164765 -0.6730999 0.05835554 -0.06328427 -0.5129120
## 2 2 0.3036538 0.15268176 1.1213609 -0.09721829 0.10542937 0.8544934
## Bonus_trans Flight_miles_12mo Flight_trans_12 Days_since_enroll Award.
## 1 -0.4520827 -0.0812243 -0.08281062 -0.1560105 -0.2310954
## 2 0.7531539 0.1353168 0.13795959 0.2599081 0.3849968
Comment: are they similar or different?
Use k-means clustering with the number of clusters that you found above. Does the same picture emerge?
# apply k-means clustering with k = 2
airlines.kmeans <- kmeans(airlines.norm, centers = 2, nstart = 2) # try different nstart values
table(airlines.cut1, airlines.kmeans$cluster)
##
## airlines.cut1 1 2
## 1 53 2436
## 2 1249 261
Which clusters would you target for offers, and what types of offers would you target to customers in that cluster?
Look at the cluster centroids and characteristics to decide which clusters to target for offers.