# Classification tree with rpart
# (or use the tree package described by James et al. (2021, Section 8.3.1)).
library(rpart)

# method = "class" because the response y is categorical.
fit <- rpart(
  y ~ weight + width + color + spine,
  method = "class",
  data = Crabs
)

# Plot the error rate by cp, the complexity parameter used for pruning.
plotcp(fit)

# Prune the fitted tree at a particular value of cp.
p.fit <- prune(fit, cp = 0.056)

# Needed for rpart.plot(), which draws the pruned tree.
library(rpart.plot)
Warning: package 'rpart.plot' was built under R version 4.2.3
# Draw the pruned tree; node labels are printed with 4 significant digits.
rpart.plot(
  p.fit,
  extra = 1,
  digits = 4,
  box.palette = "auto"
)
# Read the GSS2018 data file; its first line holds the column names.
GSS <- read.table(
  "http://stat4ds.rwth-aachen.de/data/GSS2018.dat",
  header = TRUE
)

# Load the modelling packages without their start-up messages.
# suppressMessages() silences messages only, so package warnings still print.
suppressMessages({
  library(rattle) # for fancy classification tree
  library(caret)
  library(rpart)
  library(tidyverse)
})
Warning: package 'rattle' was built under R version 4.2.3
Warning: package 'tibble' was built under R version 4.2.3
Warning: package 'caret' was built under R version 4.2.3
Warning: package 'dplyr' was built under R version 4.2.3
# Keep only the rows of GSS that have no missing values.
GSS2018full <- na.omit(GSS)

# Give the PRES16 codes 1-4 readable labels.
GSS2018full$PRES16 <- factor(
  GSS2018full$PRES16,
  levels = c(1, 2, 3, 4),
  labels = c("Clinton", "Trump", "Other", "Not vote")
)

# Drop PARTYID, then store AGE and EDUC as doubles and every other
# column as a factor.
GSS2018f <- GSS2018full %>%
  select(-PARTYID) %>%
  mutate(
    across(-all_of(c("AGE", "EDUC")), as.factor),
    across(all_of(c("AGE", "EDUC")), as.double)
  )

sapply(GSS2018f, class) # check the class of the variables
subject AGE SEX RACE EDUC WRKSTAT MARITAL EARNRS
"factor" "numeric" "factor" "factor" "numeric" "factor" "factor" "factor"
INCOME RINCOME GUNLAW PRES16 SMALLGAP TRCOURTS
"factor" "factor" "factor" "factor" "factor" "factor"
# Split the data into training (about 70%) and test sets, using PRES16 as
# the outcome passed to createDataPartition().
# NOTE(review): the partition is drawn at random; call set.seed() beforehand
# if the split has to be reproducible.
index <- createDataPartition(y = GSS2018f$PRES16, p = 0.7, list = FALSE)
train <- GSS2018f[index, ]
test <- GSS2018f[-index, ]

# Number of rows and columns in the training set.
dim(train)
CART
160 samples
13 predictor
4 classes: 'Clinton', 'Trump', 'Other', 'Not vote'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 143, 143, 145, 145, 144, 143, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.02857143 0.6845588 0.3869735
0.05714286 0.6340441 0.2669002
0.20000000 0.5775980 0.1027295
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02857143.
GDP HDI GII Fertility CO2 Homicide Prison
1 0.6976866 0.6779086 -0.7242041 -0.4003824 0.5456380 -0.3853873 -0.05102439
2 -1.0260097 -0.9969244 1.0650061 0.5887976 -0.8024088 0.5667460 0.07503587
Internet
1 0.6796963
2 -0.9995534
# Cluster id assigned to each case by the fitted k-means object.
kmeans.fit[["cluster"]]
Algeria Argentina Australia Austria Belgium Brazil
2 2 1 1 1 2
Canada Chile China Denmark Finland France
1 2 2 1 1 1
Germany Greece India Indonesia Iran Ireland
1 1 2 2 2 1
Israel Italy Japan Korea Malaysia Mexico
1 1 1 1 1 2
Morocco Netherlands NewZealand Nigeria Norway Pakistan
2 1 1 2 1 2
Peru Philippines Portugal Russia SouthAfrica Spain
2 2 1 1 2 1
Sweden Switzerland Turkey UK US Vietnam
1 1 2 1 1 2