Types of machine learning: (1) Unsupervised learning–finding structure in unlabeled data without a target (2) Supervised learning–making predictions (regression or classification) based on labeled data (3) Reinforcement learning–learning which actions to take from rewards gained through interaction with an environment
Unsupervised learning: (1) Clustering–finding homogeneous subgroups within a larger group (2) Dimensionality reduction–a) finding patterns in the features of the data, b) visualization of high-dimensional data, and c) pre-processing before supervised learning
Challenges and benefits: (1) No single goal of analysis (2) Requires more creativity (3) Much more unlabeled data is available than cleanly labeled data
k-means clustering breaks observations into a pre-defined number of clusters.
Generate a dataset with 3 clusters:
x1 = rnorm(150, mean = -2, sd = 0.8), x2 = rnorm(150, mean = 3, sd = 0.8)
x1 = rnorm(100, mean = 2, sd = 0.75), x2 = rnorm(100, mean = 2, sd = 1)
x1 = rnorm(50, mean = 1, sd = 0.5), x2 = rnorm(50, mean = 1, sd = 0.5)
Then use the built-in kmeans() command from R's {stats} package, with one observation per row and one feature per column:
kmeans(x, centers = 3, nstart = 20)
k-means has a random component; run the algorithm multiple times to improve the odds of finding the best model.
set.seed(2)
x <- data.frame(
  x1 = c(rnorm(150, mean = -2, sd = .8), rnorm(100, mean = 2, sd = .75), rnorm(50, mean = 1, sd = .5)),
  x2 = c(rnorm(150, mean = 3, sd = .8), rnorm(100, mean = 2, sd = 1), rnorm(50, mean = 1, sd = .5))
) # x1 and x2 are the two feature columns
plot(x)
km.out <- kmeans(x, centers = 3, nstart = 20) # centers = 3 specifies three clusters
names(km.out)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
summary(km.out)
## Length Class Mode
## cluster 300 -none- numeric
## centers 6 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
# cluster membership component
head(km.out$cluster)
## [1] 1 1 1 1 1 1
# the km.out object
km.out
## K-means clustering with 3 clusters of sizes 149, 92, 59
##
## Cluster means:
## x1 x2
## 1 -1.979331 3.127701
## 2 1.403014 1.137719
## 3 2.127083 2.718915
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 2 3 3 2 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3 2 3 3 3 2 3 2 2 2 2 2 2 2 2 2 3
## [186] 2 2 3 2 2 3 3 3 2 3 3 2 2 2 3 2 3 3 2 3 3 2 3 2 3 2 3 2 3 3 3 3 3 2 3 3 3
## [223] 3 3 3 3 3 3 3 3 2 2 3 3 2 2 3 3 3 3 3 3 2 3 2 3 2 2 2 3 2 2 2 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 203.88398 77.37057 62.06793
## (between_SS / total_SS = 78.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
plot(x, col = km.out$cluster, main = "K-means with 3 clusters", xlab = "", ylab = "")
Model selection – the best outcome is based on the total within-cluster sum of squares: (1) For each cluster, determine the squared distance from each observation to the cluster center (2) Sum all of the squared distances together
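A minimal sketch of this two-step computation (reusing x and km.out from above), recomputing tot.withinss by hand:
wss_manual <- sum(sapply(1:3, function(k) {
  members <- x[km.out$cluster == k, ]        # observations assigned to cluster k
  center  <- km.out$centers[k, ]             # that cluster's center
  sum(sweep(members, 2, center)^2)           # step (1): squared distances to the center
}))                                          # step (2): summed across clusters
all.equal(wss_manual, km.out$tot.withinss)   # should be TRUE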
kmeans(x, centers = 3, nstart = 20) # nstart = 20: run 20 random starts with 3 centers and keep the best fit
Running the algorithm multiple times improves the odds of finding the global minimum of the total within-cluster sum of squares.
par(mfrow = c(2, 3), mar = c(2, 2, 3, 1.5))
for(i in 1:6) {
  # Run kmeans() on x with three clusters and only two random starts
  km.out <- kmeans(x, centers = 3, nstart = 2)
  # Plot clusters
  plot(x, col = km.out$cluster,
       main = paste("Total w/n cluster sum of squares =", round(km.out$tot.withinss, 4)),
       xlab = "", ylab = "", cex.main = 1)
}
# Initialize total within sum of squares error: wss
wss <- numeric(15)
# For 1 to 15 cluster centers
for (i in 1:15) {
  km.out <- kmeans(x, centers = i, nstart = 20)
  # Save total within sum of squares to wss variable
  wss[i] <- km.out$tot.withinss
}
# Plot total within sum of squares vs. number of clusters
plot(1:15, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
# Set k equal to the number of clusters corresponding to the elbow location
k <- 2
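A rough heuristic sketch for locating the elbow programmatically (not from the course; the 10% threshold is an arbitrary assumption):
rel_drop <- -diff(wss) / wss[-length(wss)]  # relative reduction in wss from k to k + 1 clusters
which(rel_drop < 0.10)[1]                   # first k where the next cluster improves wss by < 10%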
library(readr)
library(dplyr)
pokemon <- read_csv("pokemon.csv")
names(pokemon)
## [1] "#" "Name" "Type 1" "Type 2" "Total"
## [6] "HP" "Attack" "Defense" "Sp. Atk" "Sp. Def"
## [11] "Speed" "Generation" "Legendary"
pokemon <- pokemon[,names(pokemon) %in% c("HP", "Attack", "Defense", "Sp. Atk", "Sp. Def","Speed")]
head(pokemon)
## # A tibble: 6 × 6
## HP Attack Defense `Sp. Atk` `Sp. Def` Speed
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 45 49 49 65 65 45
## 2 60 62 63 80 80 60
## 3 80 82 83 100 100 80
## 4 80 100 123 122 120 80
## 5 39 52 43 60 50 65
## 6 58 64 58 80 65 80
pokemon <- scale(pokemon) # standardize each feature to mean 0, sd 1
head(pokemon)
## HP Attack Defense Sp. Atk Sp. Def Speed
## [1,] -0.9500319 -0.92432794 -0.7966553 -0.2389808 -0.2480334 -0.8010021
## [2,] -0.3625953 -0.52380252 -0.3476999 0.2194223 0.2909743 -0.2848371
## [3,] 0.4206536 0.09239043 0.2936649 0.8306264 1.0096513 0.4033830
## [4,] 0.4206536 0.64696408 1.5763945 1.5029509 1.7283282 0.4033830
## [5,] -1.1850065 -0.83189899 -0.9890647 -0.3917818 -0.7870411 -0.1127821
## [6,] -0.4409201 -0.46218322 -0.5080411 0.2194223 -0.2480334 0.4033830
# Initialize total within sum of squares error: wss
wss <- numeric(15)
# Loop over 1 to 15 possible clusters
for (i in 1:15) {
  # Fit the model: km.out
  km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)
  # Save the within cluster sum of squares
  wss[i] <- km.out$tot.withinss
}
# Produce a scree plot
plot(1:15, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
# Select number of clusters
k <- 2
# Build model with k clusters: km.out
km.out <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)
km.out$centers
## HP Attack Defense Sp. Atk Sp. Def Speed
## 1 -0.6073605 -0.6349958 -0.5579345 -0.6139122 -0.6676757 -0.4902048
## 2 0.5386027 0.5631095 0.4947721 0.5444127 0.5920898 0.4347100
km.out$withinss
## [1] 967.9501 2303.3369
km.out$tot.withinss
## [1] 3271.287
km.out$betweenss
## [1] 1522.713
km.out$betweenss/(km.out$betweenss + km.out$tot.withinss) # (between_SS / total_SS = 31.8 %)
## [1] 0.3176289
km.out$size
## [1] 376 424
# Plot of Defense vs. Speed by cluster membership
plot(pokemon[, c("Defense", "Speed")],
col = km.out$cluster,
main = paste("k-means clustering of Pokemon with", k, "clusters"),
xlab = "Defense", ylab = "Speed")
Hierarchical clustering – create a dataset with 2 clusters:
set.seed(2)
x <- data.frame(
  x1 = c(rnorm(50, mean = -2, sd = 0.75), rnorm(50, mean = 1, sd = .5)),
  x2 = c(rnorm(50, mean = 2, sd = 0.75), rnorm(50, mean = 1, sd = .5))
) # x1 and x2 are the two feature columns
plot(x)
dist_matrix <- dist(x) # Euclidean distance between observations (the default metric)
hclust.out <- hclust(d = dist_matrix) # pass the distance to the d parameter
hclust.out
##
## Call:
## hclust(d = dist_matrix)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 100
summary(hclust.out) # not very helpful
## Length Class Mode
## merge 198 -none- numeric
## height 99 -none- numeric
## order 100 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
plot(hclust.out)
abline(h = 4, col = "red") # h = height at which the tree is cut into clusters
## cut by height h
cutree(hclust.out, h = 4)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
## [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## cut by number of clusters k
cutree(hclust.out, k = 2)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
## [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
Linking clusters in hierarchical clustering: how is the distance between clusters determined? Four rules/methods:
1. Complete: pairwise similarity between all observations in cluster 1 and cluster 2; uses the largest of the similarities
2. Single: same, but uses the smallest of the similarities
3. Average: same, but uses the average of the similarities
4. Centroid: finds the centroid of cluster 1 and the centroid of cluster 2, and uses the similarity between the two centroids
Complete and average linkage tend to give balanced trees and are the most commonly used. Data must be scaled first so that features have the same mean and standard deviation.
set.seed(1)
head(x)
## x1 x2
## 1 -2.672686 2.805845
## 2 -1.861363 2.195448
## 3 -0.809116 1.764296
## 4 -2.847782 1.437777
## 5 -2.060189 1.353351
## 6 -1.900685 3.536030
colMeans(x)
## x1 x2
## -0.5067068 1.5369275
apply(x, 2, sd)
## x1 x2
## 1.6217639 0.8718234
## scale/normalize data
x <- scale(x) # normalized features
head(x)
## x1 x2
## [1,] -1.3355699 1.4554749
## [2,] -0.8352981 0.7553374
## [3,] -0.1864693 0.2607965
## [4,] -1.4435362 -0.1137273
## [5,] -0.9578965 -0.2105659
## [6,] -0.8595443 2.2930134
round(colMeans(x),4)
## x1 x2
## 0 0
apply(x, 2, sd)
## x1 x2
## 1 1
d <- dist(x)
hclust.complete <- hclust(d, method = "complete")
hclust.average <- hclust(d, method = "average")
hclust.single <- hclust(d, method = "single")
hclust.centroid <- hclust(d, method = "centroid")
par(mar = c(2,4,3,1))
plot(hclust.complete, xlab = "", cex = .75, main = "Complete")
plot(hclust.average, xlab = "", cex = .75, main = "Average")
plot(hclust.single, xlab = "", cex = .75, main = "Single") # single linkage chains, leaving many singleton clusters
plot(hclust.centroid, xlab = "", cex = .75, main = "Centroid") # centroid linkage can produce inversions in the tree
The pokemon data has already been scaled:
round(colMeans(pokemon),3)
## HP Attack Defense Sp. Atk Sp. Def Speed
## 0 0 0 0 0 0
apply(pokemon, MARGIN = 2, FUN = sd)
## HP Attack Defense Sp. Atk Sp. Def Speed
## 1 1 1 1 1 1
hclust.pokemon <- hclust(dist(pokemon), method = "complete")
par(mar = c(2,4,3,1))
plot(hclust.pokemon, xlab = "", main = "Complete", labels = F) # dendrograms from real-life datasets are rarely easy to read
Comparing kmeans() & hclust() cluster assignments:
set.seed(1)
cut.pokemon <- cutree(hclust.pokemon, k = 3)
km.pokemon <- kmeans(pokemon, centers = 3, nstart = 20, iter.max = 50)
table(kmeans = km.pokemon$cluster, hclust.cutree = cut.pokemon)
## hclust.cutree
## kmeans 1 2 3
## 1 204 9 1
## 2 342 1 0
## 3 242 1 0
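Beyond eyeballing the cross-tabulation, agreement between the two clusterings can be quantified with the adjusted Rand index; a sketch assuming the mclust package is installed (this is not part of the course code):
library(mclust)
# Cluster labels are arbitrary, so the table's diagonal alone is misleading;
# ARI: 1 = identical partitions, ~0 = chance-level agreement
adjustedRandIndex(km.pokemon$cluster, cut.pokemon)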
Three goals of PCA: (1) Find linear combination of variables to create principal components (2) Maintain most variance in the data (3) Principal components are uncorrelated (orthogonal to each other)
pr.out <- prcomp(x = iris[,-5],
                 scale. = TRUE,
                 center = TRUE) # always leave center = TRUE; variables are centered around 0
summary(pr.out)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion 0.7296 0.9581 0.99482 1.00000
pr.out$rotation
## PC1 PC2 PC3 PC4
## Sepal.Length 0.5210659 -0.37741762 0.7195664 0.2612863
## Sepal.Width -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length 0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width 0.5648565 -0.06694199 -0.6342727 0.5235971
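Goal (3) can be checked directly: the component scores in pr.out$x are uncorrelated, so their correlation matrix is the identity up to rounding:
round(cor(pr.out$x), 3) # off-diagonal entries are 0: the PCs are uncorrelated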
biplot(pr.out, cex = .5) # original variables' loadings in first two principal components
pr.var <- pr.out$sdev^2 # variance explained
pve <- pr.var/sum(pr.var) # proportion of variance explained
round(pve,3)
## [1] 0.730 0.229 0.037 0.005
par(mfrow = c(1,2), mar = c(4,4,2,1))
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
Practical issues with PCA
1. Scaling the data
2. Missing values: drop observations with missing values, or impute/estimate them (a sketch follows this list)
3. Categorical data: exclude them, or encode categorical features as numbers
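A minimal sketch of point 2 on a hypothetical data frame df (mean imputation is the simplest option; other estimates are possible):
df <- data.frame(a = c(1, 2, NA, 4), b = c(10, 8, 7, 5)) # toy data with one missing value
df$a[is.na(df$a)] <- mean(df$a, na.rm = TRUE)            # impute the NA with the column mean
prcomp(df, scale. = TRUE, center = TRUE)                 # PCA now runs without error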
pokemon <- read_csv("pokemon.csv")
pokemon <- pokemon[,names(pokemon) %in% c("HP", "Attack", "Defense", "Sp. Atk", "Sp. Def","Speed")]
pr.with.scaling <- prcomp(pokemon, scale. = T, center = T)
pr.without.scaling <- prcomp(pokemon, scale. = F, center = T)
pr.without.scaling$rotation <- -pr.without.scaling$rotation # flip sign: PC directions are arbitrary
pr.without.scaling$x <- -pr.without.scaling$x # flip sign so the two biplots share the same orientation
par(mfrow = c(1,2), mar = c(4,5,2,1))
biplot(pr.with.scaling, cex = .4)
biplot(pr.without.scaling, cex =.4)
URL <- "http://assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"
wisc.df <- read.csv(URL)
names(wisc.df)
## [1] "id" "diagnosis"
## [3] "radius_mean" "texture_mean"
## [5] "perimeter_mean" "area_mean"
## [7] "smoothness_mean" "compactness_mean"
## [9] "concavity_mean" "concave.points_mean"
## [11] "symmetry_mean" "fractal_dimension_mean"
## [13] "radius_se" "texture_se"
## [15] "perimeter_se" "area_se"
## [17] "smoothness_se" "compactness_se"
## [19] "concavity_se" "concave.points_se"
## [21] "symmetry_se" "fractal_dimension_se"
## [23] "radius_worst" "texture_worst"
## [25] "perimeter_worst" "area_worst"
## [27] "smoothness_worst" "compactness_worst"
## [29] "concavity_worst" "concave.points_worst"
## [31] "symmetry_worst" "fractal_dimension_worst"
## [33] "X"
wisc.df <- wisc.df[,!names(wisc.df) %in% c("X")]
dim(wisc.df)
## [1] 569 32
# Convert the features to a matrix: wisc.data (excluding column 1, id, and column 2, diagnosis)
wisc.data <- as.matrix(wisc.df[,3:32])
# Set the row names of wisc.data
row.names(wisc.data) <- wisc.df$id
# Create diagnosis vector: 1 = malignant (M), 0 = benign (B)
diagnosis <- as.numeric(wisc.df$diagnosis == "M")
table(wisc.df$diagnosis)
##
## B M
## 357 212
head(round(colMeans(wisc.data),3))
## radius_mean texture_mean perimeter_mean area_mean
## 14.127 19.290 91.969 654.889
## smoothness_mean compactness_mean
## 0.096 0.104
head(round(apply(X = wisc.data, MARGIN = 2, FUN = sd, na.rm = TRUE),3))
## radius_mean texture_mean perimeter_mean area_mean
## 3.524 4.301 24.299 351.914
## smoothness_mean compactness_mean
## 0.014 0.053
wisc.pr <- prcomp(wisc.data, scale. = T, center = T)
summary(wisc.pr)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion 0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion 0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion 0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
## PC29 PC30
## Standard deviation 0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion 1.00000 1.00000
biplot(wisc.pr, cex = .5)
par(mar = c(4,4,3,1))
# Scatter plot observations by components 1 and 2
plot(wisc.pr$x[, c(1, 2)], col = (diagnosis + 1), # black = benign, red = malignant
     xlab = "PC1", ylab = "PC2", main = "Observations by PC1 & PC2")
pr.var <- wisc.pr$sdev^2
pve <- pr.var/sum(pr.var)
round(cumsum(pve),3)
## [1] 0.443 0.632 0.726 0.792 0.847 0.888 0.910 0.926 0.940 0.952 0.961 0.970
## [13] 0.978 0.983 0.986 0.989 0.991 0.993 0.995 0.996 0.997 0.997 0.998 0.999
## [25] 0.999 1.000 1.000 1.000 1.000 1.000
round(wisc.pr$rotation[1:10, 1:5],3)
## PC1 PC2 PC3 PC4 PC5
## radius_mean -0.219 0.234 -0.009 0.041 -0.038
## texture_mean -0.104 0.060 0.065 -0.603 0.049
## perimeter_mean -0.228 0.215 -0.009 0.042 -0.037
## area_mean -0.221 0.231 0.029 0.053 -0.010
## smoothness_mean -0.143 -0.186 -0.104 0.159 0.365
## compactness_mean -0.239 -0.152 -0.074 0.032 -0.012
## concavity_mean -0.258 -0.060 0.003 0.019 -0.086
## concave.points_mean -0.261 0.035 -0.026 0.065 0.044
## symmetry_mean -0.138 -0.190 -0.040 0.067 0.306
## fractal_dimension_mean -0.064 -0.367 -0.023 0.049 0.044
wisc.pr$rotation[,1][8] # For the first principal component, what is the component of the loading vector for the feature concave.points_mean?
## concave.points_mean
## -0.2608538
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b", main = "Scree plot")
data.scaled <- scale(wisc.data) # scale the data
data.dist <- dist(data.scaled) # Euclidean distances between observations; input must be a numeric matrix
wisc.hclust <- hclust(d = data.dist, method = "complete") # conduct hierarchical clustering analysis
par(mar=c(2,5,3,1))
plot(wisc.hclust, labels = F, cex = .8, col = "lightsteelblue", xlab = "", sub = "", main = "Complete") # what is the height at which the clustering model has 4 clusters? 20
wisc.hclust.clusters <- cutree(wisc.hclust, k = 4)
table(hclust = wisc.hclust.clusters, diagnosis)
## diagnosis
## hclust 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
wisc.km <- kmeans(scale(wisc.data), centers = 2, nstart = 20) # nstart = 1 is the default value
table(kmeans = wisc.km$cluster, diagnosis)
## diagnosis
## kmeans 0 1
## 1 14 175
## 2 343 37
table(kmeans = wisc.km$cluster, hclust = wisc.hclust.clusters)
## hclust
## kmeans 1 2 3 4
## 1 160 7 20 2
## 2 17 0 363 0
wisc.pr.hclust <- hclust(dist(wisc.pr$x[,1:7]), method = "complete")
wisc.pr.hclust.clusters <- cutree(wisc.pr.hclust, k = 4)
table(diagnosis, pr.cluster = wisc.pr.hclust.clusters)
## pr.cluster
## diagnosis 1 2 3 4
## 0 5 350 2 0
## 1 113 97 0 2
table(diagnosis, cluster = wisc.hclust.clusters)
## cluster
## diagnosis 1 2 3 4
## 0 12 2 343 0
## 1 165 5 40 2
table(diagnosis, kmeans = wisc.km$cluster)
## kmeans
## diagnosis 1 2
## 0 14 343
## 1 175 37