Classification and Clustering

Classification with Fisher’s Linear Discriminant Function

Crabs<-read.table("http://stat4ds.rwth-aachen.de/data/Crabs.dat", header=TRUE)
library(MASS)
fit.lda <- lda(y ~ weight + width + color + spine, data=Crabs) # linear discriminant analysis
fit.lda
Call:
lda(y ~ weight + width + color + spine, data = Crabs)

Prior probabilities of groups:
        0         1 
0.3583815 0.6416185 

Group means:
    weight    width    color    spine
0 2.139113 25.16935 2.725806 2.516129
1 2.603685 26.92973 2.279279 2.468468

Coefficients of linear discriminants:
              LD1
weight  0.6829109
width   0.2564224
color  -0.6500500
spine   0.3148502
# summary(lm(y ~ weight + width + color + spine, data=Crabs)) # least squares fit
# fit.logistic <- glm(y ~ weight + width + color + spine, family=binomial, data=Crabs)
# summary(fit.logistic) # ML fit of logistic regression model

lda.predict<-predict(fit.lda)$posterior
head(cbind(lda.predict,Crabs$y),2)
          0         1  
1 0.1018824 0.8981176 1
2 0.7547380 0.2452620 0
predict(fit.lda)$class
  [1] 1 0 1 0 1 1 1 0 0 1 0 1 1 0 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 1 1 1 1 1 0 1
 [38] 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 1
 [75] 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1
[112] 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1
[149] 0 1 1 1 0 0 0 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1
Levels: 0 1
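
The predicted classes above come from the same observations used to fit the model, so the apparent accuracy can be optimistic. As a minimal sketch (an addition, not part of the text), the CV=TRUE option of lda() returns leave-one-out cross-validated posterior probabilities and classes:

fit.cv <- lda(y ~ weight + width + color + spine, data=Crabs, CV=TRUE) # leave-one-out CV
table(fit.cv$class, Crabs$y) # cross-validated classification table
mean(fit.cv$class == Crabs$y) # cross-validated prediction accuracy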

Visualization

prob1 <- predict(fit.lda)$posterior[,2] # estimated P(Y=1)
predY <- predict(fit.lda)$class # predicted class of Y

library(ggplot2)
Crabs$y <- factor(Crabs$y) # need factor response variable for ggplot2
qplot(width, color, data=Crabs, col=y) # Figure A8.1 (i)
Warning: `qplot()` was deprecated in ggplot2 3.4.0.

qplot(width, color, data=Crabs, col=predY) # Figure A8.1 (ii)

Crabs$pred.right = predY == Crabs$y
round(mean(Crabs$pred.right),3) # prediction accuracy
[1] 0.728
qplot(width, color, data=Crabs, col=pred.right) # Figure A8.1 (iii)

ggplot(data=Crabs,aes(x=width,y=color,color=prob1)) + # Figure A8.1 (iv)
geom_point(alpha=0.6) + scale_color_gradient2(low="white",high="darkblue")
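
The 0.728 prediction accuracy reported above summarizes a classification table. As a short sketch (not in the text), cross-tabulating the LDA predicted classes with the observed response shows where the errors occur:

table(predY, Crabs$y) # rows = predicted class, columns = observed y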

Classification Trees

library(rpart) # or use the tree package described by James et al. (2021, Section 8.3.1)
fit <- rpart(y ~ weight + width + color + spine, method="class", data=Crabs)
# method = "class" for categorical y
plotcp(fit) # plots error rate by cp = complexity parameter, used for pruning

p.fit <- prune(fit, cp=0.056) # prune with a particular value of cp
library(rpart.plot)
rpart.plot(p.fit,extra=1,digits=4,box.palette="auto") #pruned tree
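
As a brief sketch (an addition, not from the text), the pruned tree can be used for prediction in the same way as the full tree; here it is applied to the training data, so the resulting accuracy is only the apparent (in-sample) one:

tree.pred <- predict(p.fit, type="class") # predicted classes from the pruned tree
table(tree.pred, Crabs$y) # classification table on the training data
mean(tree.pred == Crabs$y) # apparent (in-sample) prediction accuracy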

GSS <- read.table("http://stat4ds.rwth-aachen.de/data/GSS2018.dat", header=T)

suppressMessages({library(rattle) # for fancy classification tree
library(caret)
library(rpart)
library(tidyverse)})
GSS2018full <- na.omit(GSS)
GSS2018full$PRES16 <- factor(GSS2018full$PRES16, levels=c(1,2,3,4), labels=c("Clinton", "Trump", "Other", "Not vote"))

GSS2018f <- GSS2018full %>% select(-PARTYID) %>%
mutate_at(vars(-c("AGE","EDUC")), as.factor) %>%
mutate_at(vars(c("AGE","EDUC")), as.double)

sapply(GSS2018f, class) # check the class of the variables
  subject       AGE       SEX      RACE      EDUC   WRKSTAT   MARITAL    EARNRS 
 "factor" "numeric"  "factor"  "factor" "numeric"  "factor"  "factor"  "factor" 
   INCOME   RINCOME    GUNLAW    PRES16  SMALLGAP  TRCOURTS 
 "factor"  "factor"  "factor"  "factor"  "factor"  "factor" 
index = createDataPartition(y=GSS2018f$PRES16, p=0.7, list=FALSE)
train = GSS2018f[index,]
test = GSS2018f[-index,]
dim(train)
[1] 160  14
dim(test)
[1] 65 14
GSS2018.tree = train(PRES16 ~ ., data=train, method="rpart", trControl = trainControl(method = "cv"))
GSS2018.tree
CART 

160 samples
 13 predictor
  4 classes: 'Clinton', 'Trump', 'Other', 'Not vote' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 143, 143, 145, 145, 144, 143, ... 
Resampling results across tuning parameters:

  cp          Accuracy   Kappa    
  0.02857143  0.6845588  0.3869735
  0.05714286  0.6340441  0.2669002
  0.20000000  0.5775980  0.1027295

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02857143.
GSS2018.pred = predict(GSS2018.tree, newdata = test)
table(GSS2018.pred, test$PRES16)
            
GSS2018.pred Clinton Trump Other Not vote
    Clinton       31    12     3        0
    Trump          7    11     1        0
    Other          0     0     0        0
    Not vote       0     0     0        0
error.rate = round(mean(GSS2018.pred != test$PRES16),3)
error.rate
[1] 0.354
fancyRpartPlot(GSS2018.tree$finalModel) # Figure A8.3 (left)

ggplot(GSS2018.tree) # Figure A8.3 (right)

#summary(GSS2018.tree$finalModel) # long output with detailed information (not shown)
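
As a sketch of an alternative summary (not shown in the text), caret's confusionMatrix() reproduces the classification table above together with the accuracy, kappa, and per-class statistics:

confusionMatrix(GSS2018.pred, test$PRES16) # accuracy, kappa, per-class statistics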

kNN

Crabs.std <- scale(Crabs[, 4:7]) # standardize weight, width, color, spine
y <- Crabs$y # columns 4, 5, 6, 7 of Crabs data file
Crabs2<-data.frame(cbind(y, Crabs.std))
head(Crabs2,1)
  y   weight     width     color     spine
1 2 1.062015 0.9488375 -0.547809 0.6231873
train_index <- sample(nrow(Crabs), (2/3)*nrow(Crabs))
Crabs_train <- Crabs2[train_index, ] # 2/3 of data in training sample
Crabs_test <- Crabs2[-train_index, ] # 1/3 of data in test sample
target_cat <- Crabs2[train_index, 1] # actual y values for training set
test_cat <- Crabs2[-train_index, 1] # actual y values for test set

library(class) # package needed for the knn function
pr1 <- knn(Crabs_train, Crabs_test, cl=target_cat, k=1) # using k=1 neighbor
table(pr1, test_cat) # classification table for the 58 crabs in the test sample
   test_cat
pr1  1  2
  1 15  2
  2  0 41
pr3 <- knn(Crabs_train, Crabs_test, cl=target_cat, k=3) # using k=3 neighbors
table(pr3, test_cat)
   test_cat
pr3  1  2
  1 14  3
  2  1 40
pr5 <- knn(Crabs_train, Crabs_test, cl=target_cat, k=5) # using k=5 neighbors
table(pr5, test_cat)
   test_cat
pr5  1  2
  1 15  2
  2  0 41
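
To compare more values of k, a small sketch (an addition; the accuracies depend on the random training/test split, and calling set.seed() before sample() above would make the split reproducible) computes the test-set accuracy over a grid of k values:

sapply(1:10, function(k) { # test-set accuracy for k = 1, ..., 10 neighbors
  pr <- knn(Crabs_train, Crabs_test, cl=target_cat, k=k)
  mean(pr == test_cat)
})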

Cluster Analysis

Measuring Dissimilarity between Observations
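
The dist() function computes the dissimilarities between the rows of a data matrix. A tiny sketch (hypothetical values) with two observations on three variables illustrates the Euclidean and Manhattan metrics used below:

x <- rbind(c(1, 2, 0), c(3, 1, 4)) # two hypothetical observations
dist(x, method="euclidean") # sqrt((1-3)^2 + (2-1)^2 + (0-4)^2) = 4.58
dist(x, method="manhattan") # |1-3| + |2-1| + |0-4| = 7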

Hierarchical Clustering Algorithm and Its Dendrogram

Elections<-read.table("http://stat4ds.rwth-aachen.de/data/Elections2.dat", header=T)
Elections # 0 = Republican, 1 = Democratic electoral college winner
   number         state e1 e2 e3 e4 e5 e6 e7 e8 e9 e10 e11
1       1       Arizona  0  0  0  0  1  0  0  0  0   0   1
2       2    California  0  0  0  1  1  1  1  1  1   1   1
3       3      Colorado  0  0  0  1  0  0  0  1  1   1   1
4       4       Florida  0  0  0  0  1  0  0  1  1   0   0
5       5      Illinois  0  0  0  1  1  1  1  1  1   1   1
6       6 Massachusetts  0  0  1  1  1  1  1  1  1   1   1
7       7     Minnesota  1  1  1  1  1  1  1  1  1   1   1
8       8      Missouri  0  0  0  1  1  0  0  0  0   0   0
9       9     NewMexico  0  0  0  1  1  1  0  1  1   1   1
10     10       NewYork  0  0  1  1  1  1  1  1  1   1   1
11     11          Ohio  0  0  0  1  1  0  0  1  1   0   0
12     12         Texas  0  0  0  0  0  0  0  0  0   0   0
13     13      Virginia  0  0  0  0  0  0  0  1  1   1   1
14     14       Wyoming  0  0  0  0  0  0  0  0  0   0   0
distances <- dist(Elections[,3:13], method="manhattan")
# Manhattan dissimilarity = no. of election outcomes that differ
democlust <- hclust(distances, "average") # hierarchical clustering
plot(democlust,labels=Elections$state)
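
A dendrogram can be cut at a chosen number of clusters. As a short sketch (not in the text), cutree() assigns each state to one of, say, three clusters (the choice of three is only for illustration):

groups <- cutree(democlust, k=3) # cut the dendrogram into 3 clusters
split(Elections$state, groups) # states belonging to each cluster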

library(gplots) # heatmap portrays observations with a dendrogram
clust.meth <- function(d) hclust(d, method="average")
dist.meth <- function(x) dist(x, method="manhattan")
heatmap.2(as.matrix(Elections[,3:13]), distfun=dist.meth, hclustfun=clust.meth,
          labRow=Elections$state, dendrogram="row", key=FALSE, Colv=FALSE)

Some More Data Analysis

UN <- read.table("http://stat4ds.rwth-aachen.de/data/UN.dat", header=T)
suppressPackageStartupMessages({
library(tidyverse)
library(cluster)
library(gplots) # heatmap.2()
library(colorspace) })

UN_scaled <- UN %>%
select_if(is.numeric) %>% # selects only numeric variables
# select(-CO2) %>% # command to remove a variable, not activated here
mutate_all(as.double) %>%
scale()
row.names(UN_scaled) <- UN[,1] # Declare column 1 as the row names
d <- dist(UN_scaled, method="euclidean") # dissimilarity matrix

hc1 <- agnes(d,method="single"); hc1$ac # single linkage (ac = agglomerative coefficient)
[1] 0.6978301
hc2 <- agnes(d,method="average"); hc2$ac # hierarchical clustering with average linkage
[1] 0.808305
hc3 <- agnes(d,method="complete"); hc3$ac # complete linkage
[1] 0.8667266
hc4 <- agnes(d,method="ward"); hc4$ac # Ward's method
[1] 0.9211221
hc <- hclust(d,method="ward.D2")
plot(hc, hang = -1, cex = 0.6) # Figure A8.4 (left)
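
As with the elections example, the Ward dendrogram can be cut into a chosen number of clusters. This sketch (an addition, with k = 2 chosen only to match the k-means analysis below) tabulates the resulting cluster sizes and marks the clusters on the plotted dendrogram:

hc.groups <- cutree(hc, k=2) # cut the Ward dendrogram into 2 clusters
table(hc.groups) # cluster sizes
rect.hclust(hc, k=2, border="red") # outline the 2 clusters on the dendrogram just plotted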

dist.f <- function(x) dist(x, method = 'euclidean')
hclust.w <- function(x) hclust(x, method="ward.D2")

#heatmap.2(d, distfun=dist.f, hclustfun=hclust.w, # Figure A8.4 (right)
#trace="none", dendrogram = "row",
#col = heat_hcl, # other colors: diverge_hcl,rainbow_hcl
#cexRow=0.8, cexCol = 0.8, # size of row/column labels
#key.par=list(mar=c(5,1,3,1)) )
# k-means Clustering:
kmeans.fit <- kmeans(UN_scaled, 2) # k = 2 clusters
attributes(kmeans.fit) # components of the kmeans output object
$names
[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

$class
[1] "kmeans"
kmeans.fit$centers # Centroids of clusters
         GDP        HDI        GII  Fertility        CO2   Homicide      Prison
1  0.6976866  0.6779086 -0.7242041 -0.4003824  0.5456380 -0.3853873 -0.05102439
2 -1.0260097 -0.9969244  1.0650061  0.5887976 -0.8024088  0.5667460  0.07503587
    Internet
1  0.6796963
2 -0.9995534
kmeans.fit$cluster # cluster ID for each country
    Algeria   Argentina   Australia     Austria     Belgium      Brazil 
          2           2           1           1           1           2 
     Canada       Chile       China     Denmark     Finland      France 
          1           2           2           1           1           1 
    Germany      Greece       India   Indonesia        Iran     Ireland 
          1           1           2           2           2           1 
     Israel       Italy       Japan       Korea    Malaysia      Mexico 
          1           1           1           1           1           2 
    Morocco Netherlands  NewZealand     Nigeria      Norway    Pakistan 
          2           1           1           2           1           2 
       Peru Philippines    Portugal      Russia SouthAfrica       Spain 
          2           2           1           1           2           1 
     Sweden Switzerland      Turkey          UK          US     Vietnam 
          1           1           2           1           1           2 
kmeans.fit$size # Cluster size
[1] 25 17
clusplot(UN_scaled, kmeans.fit$cluster, main=' ', # Figure A8.5
color=TRUE, shade=TRUE, labels=2, lines=0)
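
A sketch of one further diagnostic (not in the text): the average silhouette width of the two-cluster solution, computed from the dissimilarity matrix d above with the silhouette() function of the cluster package:

sil <- silhouette(kmeans.fit$cluster, d) # silhouette widths for the k-means clusters
summary(sil)$avg.width # average silhouette width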

library(factoextra)
library(gridExtra)

pl1 <- fviz_nbclust(UN_scaled, FUNcluster = kmeans, method = "wss" , # other options: "silhouette", "gap_stat"
diss = d, k.max = 10)

pl2 <-fviz_nbclust(UN_scaled, FUNcluster = kmeans, method = "silhouette" ,
diss = d, k.max = 10)

pl3 <-fviz_nbclust(UN_scaled, FUNcluster = kmeans, method = "gap_stat" ,
diss = d, k.max = 10)

grid.arrange(pl1,pl2,pl3, nrow=1) # to print all plots in a matrix form

Exercises