DataCamp course link

1 Conceptualization

Types of machine learning:
(1) Unsupervised learning–finding structure in unlabeled data without a target
(2) Supervised learning–making predictions (regression or classification) based on labeled data
(3) Reinforcement learning

Unsupervised learning:
(1) Clustering–finding homogeneous subgroups within a larger group
(2) Dimensionality reduction– a) finding patterns in the features of the data, b) visualizing high-dimensional data, and c) pre-processing before supervised learning

Challenges and Benefits:
(1) No single goal of analysis
(2) Requires more creativity
(3) Much more unlabeled data is available than cleanly labeled data

2 K-means clustering

Breaks observations into a pre-defined number of clusters.

Generate a dataset with 3 clusters by stacking draws from three bivariate normal distributions (150, 100, and 50 observations with different means; see the code below).
Then, use the built-in command:
kmeans(x, centers = 3, nstart = 20) from base R's {stats} package, with one observation per row and one feature per column. K-means has a random component, so run the algorithm multiple times to improve the odds of finding the best model.

set.seed(2)
x <- data.frame (
  x1 = c(rnorm(150, mean = -2, sd = .8), rnorm(100, mean = 2, sd = .75), rnorm(50, mean = 1, sd = .5)), 
  x2 = c(rnorm(150, mean = 3, sd = .8), rnorm(100, mean = 2, sd = 1), rnorm(50, mean = 1, sd = .5))
  ) # x1 and x2 are two column features
plot(x)

km.out <- kmeans(x, centers = 3, nstart = 20) # centers = 3 specifies three clusters
names(km.out)
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
summary(km.out)
##              Length Class  Mode   
## cluster      300    -none- numeric
## centers        6    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric
# cluster membership component
head(km.out$cluster)
## [1] 1 1 1 1 1 1
# the km.out object
km.out
## K-means clustering with 3 clusters of sizes 149, 92, 59
## 
## Cluster means:
##          x1       x2
## 1 -1.979331 3.127701
## 2  1.403014 1.137719
## 3  2.127083 2.718915
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 2 3 3 2 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3 2 3 3 3 2 3 2 2 2 2 2 2 2 2 2 3
## [186] 2 2 3 2 2 3 3 3 2 3 3 2 2 2 3 2 3 3 2 3 3 2 3 2 3 2 3 2 3 3 3 3 3 2 3 3 3
## [223] 3 3 3 3 3 3 3 3 2 2 3 3 2 2 3 3 3 3 3 3 2 3 2 3 2 2 2 3 2 2 2 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 203.88398  77.37057  62.06793
##  (between_SS / total_SS =  78.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
plot(x, col = km.out$cluster, main = "K-means with 3 clusters", xlab = "", ylab = "")

2.1 Model Selection

Model selection – the best outcome is the one with the lowest total within-cluster sum of squares:
(1) For each cluster, compute the squared distance from each observation to the cluster center
(2) Sum all of the squared distances together (see the sketch below)
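
As a sanity check, tot.withinss can be recomputed by hand from the km.out model fitted above (a minimal sketch; manual.wss is just an illustrative name):
# Recompute the total within-cluster sum of squares from assignments and centers
manual.wss <- sum(sapply(1:3, function(j) {
  pts <- x[km.out$cluster == j, , drop = FALSE]   # observations assigned to cluster j
  sum(sweep(pts, 2, km.out$centers[j, ])^2)       # squared distances to the cluster center
}))
all.equal(manual.wss, km.out$tot.withinss)        # should be TRUE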

kmeans(x, centers = 3, nstart = 20) # run the algorithm 20 times with 3 centers and keep the best result

Running the algorithm multiple times helps find the global minimum of the total within-cluster sum of squares; with only a few starts, as in the loop below, different runs can land in different local minima.

par(mfrow = c(2, 3), mar = c(2,2,3,1.5))
for(i in 1:6) {
  # Run kmeans() on x with three clusters and only nstart = 2, so results vary between runs
  km.out <- kmeans(x, centers = 3, nstart = 2)
  # Plot clusters
  plot(x, col = km.out$cluster, 
       main = paste("Total within-cluster sum of squares =", round(km.out$tot.withinss, 4)), 
       xlab = "", ylab = "", cex.main = 1)
}

2.2 Selecting number of clusters

# Initialize total within sum of squares error: wss
wss <- 0

# For 1 to 15 cluster centers
for (i in 1:15) {
  km.out <- kmeans(x, centers = i, nstart = 20)
  # Save total within sum of squares to wss variable
  wss[i] <- km.out$tot.withinss
}

# Plot total within sum of squares vs. number of clusters
plot(1:15, wss, type = "b", 
     xlab = "Number of Clusters", 
     ylab = "Within groups sum of squares")

# Set k equal to the number of clusters corresponding to the elbow location
k <- 2

2.3 Pokemon dataset

Download the Pokemon dataset

  1. Selecting the variables to cluster upon
library(readr)
library(dplyr)
pokemon <- read_csv("pokemon.csv")
names(pokemon)
##  [1] "#"          "Name"       "Type 1"     "Type 2"     "Total"     
##  [6] "HP"         "Attack"     "Defense"    "Sp. Atk"    "Sp. Def"   
## [11] "Speed"      "Generation" "Legendary"
pokemon <- pokemon[,names(pokemon) %in% c("HP", "Attack", "Defense", "Sp. Atk", "Sp. Def","Speed")]
head(pokemon)
## # A tibble: 6 × 6
##      HP Attack Defense `Sp. Atk` `Sp. Def` Speed
##   <dbl>  <dbl>   <dbl>     <dbl>     <dbl> <dbl>
## 1    45     49      49        65        65    45
## 2    60     62      63        80        80    60
## 3    80     82      83       100       100    80
## 4    80    100     123       122       120    80
## 5    39     52      43        60        50    65
## 6    58     64      58        80        65    80
  2. Scaling the data
pokemon <- scale(pokemon)
head(pokemon)
##              HP      Attack    Defense    Sp. Atk    Sp. Def      Speed
## [1,] -0.9500319 -0.92432794 -0.7966553 -0.2389808 -0.2480334 -0.8010021
## [2,] -0.3625953 -0.52380252 -0.3476999  0.2194223  0.2909743 -0.2848371
## [3,]  0.4206536  0.09239043  0.2936649  0.8306264  1.0096513  0.4033830
## [4,]  0.4206536  0.64696408  1.5763945  1.5029509  1.7283282  0.4033830
## [5,] -1.1850065 -0.83189899 -0.9890647 -0.3917818 -0.7870411 -0.1127821
## [6,] -0.4409201 -0.46218322 -0.5080411  0.2194223 -0.2480334  0.4033830
  3. Determining the number of clusters–often there is no clean “elbow” in the scree plot
# Initialize total within sum of squares error: wss
wss <- 0

# Look over 1 to 15 possible clusters
for (i in 1:15) {
  # Fit the model: km.out
  km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)
  # Save the within cluster sum of squares
  wss[i] <- km.out$tot.withinss
}

# Produce a scree plot
plot(1:15, wss, type = "b", 
     xlab = "Number of Clusters", 
     ylab = "Within groups sum of squares")

# Select number of clusters
k <- 2
# Build model with k clusters: km.out
km.out <- kmeans(pokemon, centers = 2, nstart = 20, iter.max = 50)
km.out$centers
##           HP     Attack    Defense    Sp. Atk    Sp. Def      Speed
## 1 -0.6073605 -0.6349958 -0.5579345 -0.6139122 -0.6676757 -0.4902048
## 2  0.5386027  0.5631095  0.4947721  0.5444127  0.5920898  0.4347100
km.out$withinss
## [1]  967.9501 2303.3369
km.out$tot.withinss
## [1] 3271.287
km.out$betweenss
## [1] 1522.713
km.out$betweenss/(km.out$betweenss + km.out$tot.withinss) # (between_SS / total_SS =  31.8 %)
## [1] 0.3176289
km.out$size
## [1] 376 424
  4. Visualize the results for interpretation
# Plot of Defense vs. Speed by cluster membership
plot(pokemon[, c("Defense", "Speed")],
     col = km.out$cluster,
     main = paste("k-means clustering of Pokemon with", k, "clusters"),
     xlab = "Defense", ylab = "Speed")

3 Hierarchical Clustering

Create a dataset

set.seed(2)
x <- data.frame (
  x1 = c(rnorm(50, mean = -2, sd = 0.75), rnorm(50, mean = 1, sd = .5)), 
  x2 = c(rnorm(50, mean = 2, sd = 0.75), rnorm(50, mean = 1, sd = .5))
  ) # x1 and x2 are two column features
plot(x)

3.1 Number of clusters is not known

  1. The number of clusters does not need to be known ahead of time.
  2. Two kinds: bottom-up and top-down. In bottom-up (agglomerative) clustering, each point starts as its own cluster; the closest clusters are joined repeatedly, stopping when only 1 cluster is left.
  3. Calculate similarity as the Euclidean distance between observations.
dist_matrix <- dist(x) # Euclidean distance btw observations
hclust.out <- hclust(d = dist_matrix) # pass the distance to the d parameter
hclust.out
## 
## Call:
## hclust(d = dist_matrix)
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 100
summary(hclust.out) # not very helpful
##             Length Class  Mode     
## merge       198    -none- numeric  
## height       99    -none- numeric  
## order       100    -none- numeric  
## labels        0    -none- NULL     
## method        1    -none- character
## call          2    -none- call     
## dist.method   1    -none- character
plot(hclust.out)
abline(h = 4, col = "red") # h = height at which to cut the tree (desired distance between clusters)

## cut by height h
cutree(hclust.out, h = 4)
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
##  [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## cut by number of clusters k
cutree(hclust.out, k = 2)
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
##  [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

3.2 Four methods

Linking clusters in hierarchical clustering
(1) How is the distance between clusters determined? There are four rules (linkage methods):

  1. Complete: pairwise similarity between all observations in cluster 1 and cluster 2; uses the largest of the similarities 
  2. Single: uses the smallest of the similarities 
  3. Average: uses the average of the similarities 
  4. Centroid: finds the centroid of cluster 1 and the centroid of cluster 2, and uses the similarity between the two centroids

Complete and average linkage tend to give balanced trees and are the most commonly used.
Data must be scaled so that features have the same mean and standard deviation.

set.seed(1)
head(x)
##          x1       x2
## 1 -2.672686 2.805845
## 2 -1.861363 2.195448
## 3 -0.809116 1.764296
## 4 -2.847782 1.437777
## 5 -2.060189 1.353351
## 6 -1.900685 3.536030
colMeans(x)
##         x1         x2 
## -0.5067068  1.5369275
apply(x, 2, sd)
##        x1        x2 
## 1.6217639 0.8718234
## scale/normalize data
x <- scale(x) # normalized features
head(x)
##              x1         x2
## [1,] -1.3355699  1.4554749
## [2,] -0.8352981  0.7553374
## [3,] -0.1864693  0.2607965
## [4,] -1.4435362 -0.1137273
## [5,] -0.9578965 -0.2105659
## [6,] -0.8595443  2.2930134
round(colMeans(x),4)
## x1 x2 
##  0  0
apply(x, 2, sd)
## x1 x2 
##  1  1
d <- dist(x)
hclust.complete <- hclust(d, method = "complete")
hclust.average <- hclust(d, method = "average")
hclust.single <- hclust(d, method = "single")
hclust.centroid <- hclust(d, method = "centroid")
par( mar = c(2,4,3,1))
plot(hclust.complete, xlab= "", cex = .75, main = "Complete")

plot(hclust.average, xlab= "", cex = .75, main = "Average")

plot(hclust.single, xlab= "", cex = .75, main = "Single") # single linkage tends to give unbalanced trees with many singletons


plot(hclust.centroid, xlab= "", cex = .75, main = "Centroid") # centroid linkage can produce inversions (crossing branches)

3.3 Pokemon dataset

The Pokemon data was already scaled in section 2.3

round(colMeans(pokemon),3)
##      HP  Attack Defense Sp. Atk Sp. Def   Speed 
##       0       0       0       0       0       0
apply(pokemon, MARGIN = 2, FUN = sd)
##      HP  Attack Defense Sp. Atk Sp. Def   Speed 
##       1       1       1       1       1       1
hclust.pokemon <- hclust(dist(pokemon), method = "complete")
par( mar = c(2,4,3,1))
plot(hclust.pokemon, xlab = "", main = "Complete", labels = F) # real-life datasets rarely give such a clean dendrogram

3.4 Comparing kmeans() & hclust()

set.seed(1)
cut.pokemon <- cutree(hclust.pokemon, k = 3)
km.pokemon <- kmeans(pokemon, centers = 3, nstart = 20, iter.max = 50)
table(kmeans = km.pokemon$cluster, hclust.cutree = cut.pokemon)
##       hclust.cutree
## kmeans   1   2   3
##      1 204   9   1
##      2 342   1   0
##      3 242   1   0

4 Dimensionality Reduction–PCA

  1. Find structure in features
  2. Aid in visualization

Three goals of PCA:
(1) Find linear combination of variables to create principal components
(2) Maintain most variance in the data
(3) Principal components are uncorrelated (orthogonal to each other); see the quick check below
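
A minimal check of point (3), using the same iris measurements as the example that follows (pr.check is just an illustrative name): the scores on different principal components are uncorrelated.
pr.check <- prcomp(iris[, -5], scale. = TRUE, center = TRUE)
round(cor(pr.check$x), 3) # off-diagonal correlations are 0: the components are uncorrelated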

pr.out <- prcomp(x = iris[,-5],
                  scale. = TRUE,
                  center = TRUE) # keep center = TRUE so the data are centered around 0
summary(pr.out)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion  0.7296 0.9581 0.99482 1.00000
pr.out$rotation
##                     PC1         PC2        PC3        PC4
## Sepal.Length  0.5210659 -0.37741762  0.7195664  0.2612863
## Sepal.Width  -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length  0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width   0.5648565 -0.06694199 -0.6342727  0.5235971
biplot(pr.out, cex = .5) # original variables' loadings in first two principal components

pr.var <- pr.out$sdev^2 # variance explained
pve <- pr.var/sum(pr.var) # proportion of variance explained
round(pve,3)
## [1] 0.730 0.229 0.037 0.005
par(mfrow = c(1,2), mar = c(4,4,2,1))
plot(pve, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")
plot(cumsum(pve), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

Practical issues with PCA

1. Scaling the data 
2. Missing values: drop observations with missing values, or impute/estimate the missing values (a simple imputation sketch follows the scaling comparison below) 
3. Categorical data: exclude them, or encode categorical features as numbers
pokemon <- read_csv("pokemon.csv")
pokemon <- pokemon[,names(pokemon) %in% c("HP", "Attack", "Defense", "Sp. Atk", "Sp. Def","Speed")]
pr.with.scaling <- prcomp(pokemon, scale = T, center = T)
pr.without.scaling <- prcomp(pokemon, scale = F, center = T)
pr.without.scaling$rotation <- -pr.without.scaling$rotation # flip sign (PCs are defined only up to sign) for a comparable orientation
pr.without.scaling$x <- -pr.without.scaling$x # flip the sign of the scores as well
par(mfrow = c(1,2), mar = c(4,5,2,1))
biplot(pr.with.scaling, cex = .4)
biplot(pr.without.scaling, cex =.4)
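
For issue 2, one simple option is mean imputation before running PCA. A minimal sketch, assuming a hypothetical copy pokemon.na of the Pokemon features that may contain missing values:
pokemon.na <- as.data.frame(pokemon)            # hypothetical copy that may contain NAs
for (j in seq_along(pokemon.na)) {
  col <- pokemon.na[[j]]
  col[is.na(col)] <- mean(col, na.rm = TRUE)    # replace each NA with the column mean
  pokemon.na[[j]] <- col
}
pr.imputed <- prcomp(pokemon.na, scale. = TRUE, center = TRUE)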

5 Unsupervised Learning Case Study

Human breast mass data

URL <- "http://assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"
wisc.df <- read.csv(URL)
names(wisc.df)
##  [1] "id"                      "diagnosis"              
##  [3] "radius_mean"             "texture_mean"           
##  [5] "perimeter_mean"          "area_mean"              
##  [7] "smoothness_mean"         "compactness_mean"       
##  [9] "concavity_mean"          "concave.points_mean"    
## [11] "symmetry_mean"           "fractal_dimension_mean" 
## [13] "radius_se"               "texture_se"             
## [15] "perimeter_se"            "area_se"                
## [17] "smoothness_se"           "compactness_se"         
## [19] "concavity_se"            "concave.points_se"      
## [21] "symmetry_se"             "fractal_dimension_se"   
## [23] "radius_worst"            "texture_worst"          
## [25] "perimeter_worst"         "area_worst"             
## [27] "smoothness_worst"        "compactness_worst"      
## [29] "concavity_worst"         "concave.points_worst"   
## [31] "symmetry_worst"          "fractal_dimension_worst"
## [33] "X"
wisc.df <- wisc.df[,!names(wisc.df) %in% c("X")]
dim(wisc.df)
## [1] 569  32
# Convert the features of the data to a matrix: wisc.data, excluding column 1 (id) and column 2 (diagnosis)
wisc.data <- as.matrix(wisc.df[,3:32])
# Set the row names of wisc.data
row.names(wisc.data) <- wisc.df$id
# Create diagnosis vector
diagnosis <- as.numeric(wisc.df$diagnosis == "M")
table(wisc.df$diagnosis)
## 
##   B   M 
## 357 212

5.1 PCA

head(round(colMeans(wisc.data),3))
##      radius_mean     texture_mean   perimeter_mean        area_mean 
##           14.127           19.290           91.969          654.889 
##  smoothness_mean compactness_mean 
##            0.096            0.104
head(round(apply(X = wisc.data, MARGIN = 2, FUN = sd, na.rm = TRUE),3))
##      radius_mean     texture_mean   perimeter_mean        area_mean 
##            3.524            4.301           24.299          351.914 
##  smoothness_mean compactness_mean 
##            0.014            0.053
wisc.pr <- prcomp(wisc.data, scale. = T, center = T)
summary(wisc.pr)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
##                            PC8    PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion  0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion  0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
##                           PC29    PC30
## Standard deviation     0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion  1.00000 1.00000
biplot(wisc.pr, cex = .5)

par(mar = c(4,4,3,1))
# Scatter plot observations by components 1 and 2
plot(wisc.pr$x[, c(1, 2)], col = (diagnosis + 1), 
     xlab = "PC1", ylab = "PC2", main = "Observations by PC1 & PC2")

pr.var <- wisc.pr$sdev^2
pve <- pr.var/sum(pr.var)
round(cumsum(pve),3)
##  [1] 0.443 0.632 0.726 0.792 0.847 0.888 0.910 0.926 0.940 0.952 0.961 0.970
## [13] 0.978 0.983 0.986 0.989 0.991 0.993 0.995 0.996 0.997 0.997 0.998 0.999
## [25] 0.999 1.000 1.000 1.000 1.000 1.000
round(wisc.pr$rotation[1:10, 1:5],3)
##                           PC1    PC2    PC3    PC4    PC5
## radius_mean            -0.219  0.234 -0.009  0.041 -0.038
## texture_mean           -0.104  0.060  0.065 -0.603  0.049
## perimeter_mean         -0.228  0.215 -0.009  0.042 -0.037
## area_mean              -0.221  0.231  0.029  0.053 -0.010
## smoothness_mean        -0.143 -0.186 -0.104  0.159  0.365
## compactness_mean       -0.239 -0.152 -0.074  0.032 -0.012
## concavity_mean         -0.258 -0.060  0.003  0.019 -0.086
## concave.points_mean    -0.261  0.035 -0.026  0.065  0.044
## symmetry_mean          -0.138 -0.190 -0.040  0.067  0.306
## fractal_dimension_mean -0.064 -0.367 -0.023  0.049  0.044
wisc.pr$rotation[,1][8] # For the first principal component, what is the component of the loading vector for the feature concave.points_mean?
## concave.points_mean 
##          -0.2608538
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component", 
     ylab = "Proportion of Variance Explained", 
     type = "b", main = "Scree plot")

5.2 Hierarchical Clustering

data.scaled <- scale(wisc.data) # scale the data
data.dist <- dist(data.scaled) # Euclidean distance between observations (dist() takes a numeric matrix)
wisc.hclust <- hclust(d = data.dist, method = "complete") # conduct hierarchical clustering analysis
par(mar=c(2,5,3,1))
plot(wisc.hclust, labels = F, cex = .8, col = "lightsteelblue", xlab = "", sub = "", main = "Complete") # at what height does the tree have 4 clusters? about 20
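
To mark that cut on the dendrogram just plotted, a horizontal line can be added at that height (same idea as the abline() call in section 3.1):
abline(h = 20, col = "red") # approximate height at which the tree splits into 4 clusters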

wisc.hclust.clusters <- cutree(wisc.hclust, k = 4)
table(hclust = wisc.hclust.clusters, diagnosis)
##       diagnosis
## hclust   0   1
##      1  12 165
##      2   2   5
##      3 343  40
##      4   0   2

5.3 K-means

wisc.km <- kmeans(scale(wisc.data), centers = 2, nstart = 20) # nstart = 1 is the default value
table(kmeans = wisc.km$cluster, diagnosis)
##       diagnosis
## kmeans   0   1
##      1  14 175
##      2 343  37
table(kmeans = wisc.km$cluster, hclust = wisc.hclust.clusters)
##       hclust
## kmeans   1   2   3   4
##      1 160   7  20   2
##      2  17   0 363   0
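
A rough way to read the k-means vs. diagnosis table: treat each cluster's majority diagnosis as its label and compute the share of observations that agree (about 0.91 with the counts shown above).
tab <- table(wisc.km$cluster, diagnosis)
sum(apply(tab, 1, max)) / sum(tab) # share of observations matching their cluster's majority diagnosis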

5.4 Clustering on PCA

wisc.pr.hclust <- hclust(dist(wisc.pr$x[,1:7]), method = "complete")
wisc.pr.hclust.clusters <- cutree(wisc.pr.hclust, k = 4)
table(diagnosis, pr.cluster=wisc.pr.hclust.clusters )
##          pr.cluster
## diagnosis   1   2   3   4
##         0   5 350   2   0
##         1 113  97   0   2
table(diagnosis, cluster = wisc.hclust.clusters)
##          cluster
## diagnosis   1   2   3   4
##         0  12   2 343   0
##         1 165   5  40   2
table(diagnosis, kmeans = wisc.km$cluster)
##          kmeans
## diagnosis   1   2
##         0  14 343
##         1 175  37