What about practices that we didn’t include in the original dataset, or new practices? How do we assign them to clusters?
This can be done by developing supervised machine learning algorithms which can help classify practices. There is a range of algorithms developed for classification. They include:
random forests
support vector machines
classification and regression trees
k nearest neighbours
linear discriminant analysis
The idea is to split the practices randomly into 2 groups: a ‘training’ subset (80% of practices) used to develop the models, and a validation subset (the remaining 20% of the data) used to see how well each model works and to finalise the best model for assigning cluster labels to new or unseen practices.
library(caret); require(dplyr)
## Loading required package: lattice
## Loading required package: ggplot2
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Switch to the project folder, read the practice-level table and print its
# structure (7750 rows and 14 columns in the rendered output).
setwd("~/Documents/R_projects/K_GP/update/update_gp/GP-clusters")
data <- read.csv(file = "gp_k.csv")
str(object = data)
## 'data.frame': 7750 obs. of 14 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ practice: Factor w/ 7750 levels "A81007","A81008",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ ccg : Factor w/ 209 levels "NHS Airedale, Wharfdale And Craven CCG",..: 74 160 160 74 160 160 74 160 160 74 ...
## $ X.0.4 : num 5.28 5.99 6.66 5.53 7.32 ...
## $ X.5.14 : num 12.5 11 12.1 10.6 12 ...
## $ X..18 : num 21.5 20.5 22.6 19.7 22.9 ...
## $ X.65. : num 18.6 16.6 14.7 18.6 14.3 ...
## $ X.75. : num 9.26 7.73 6.72 8.51 7.34 ...
## $ X.85. : num 2.71 2.06 1.82 1.92 1.79 ...
## $ imd : num 31.7 51.9 35.5 34.8 49.7 ...
## $ pop : int 9525 4088 9265 11285 4756 6077 3786 3021 9505 21147 ...
## $ eth1 : num 2.57 4.37 12.96 2.31 8.98 ...
## $ cluster : int 5 8 8 5 8 8 2 1 8 6 ...
## $ imd1 : Factor w/ 10 levels "(15.9,22.2]",..: 3 7 5 3 7 3 1 2 6 1 ...
# Drop the first column ("X", the row index carried in the CSV) and give the
# remaining 13 columns short, syntactic names.
data <- data[, -1]
colnames(data) <- c(
  "prac", "ccg", "under4", "five_14", "under18", "over65", "over75",
  "over85", "imd", "pop", "eth1", "cluster", "imd1"
)

# Seed the RNG *before* sampling: without this the 80/20 split (and therefore
# every model fit and accuracy figure that follows) changes on each run.
set.seed(7)
train <- sample_frac(data, 0.8)

# The hold-out set is every practice whose code was not drawn into `train`.
praclist <- train$prac
test <- data[!data$prac %in% praclist, ]
str(test)
## 'data.frame': 1550 obs. of 13 variables:
## $ prac : Factor w/ 7750 levels "A81007","A81008",..: 5 6 9 14 16 24 27 28 44 65 ...
## $ ccg : Factor w/ 209 levels "NHS Airedale, Wharfdale And Craven CCG",..: 160 160 160 160 160 160 74 160 160 160 ...
## $ under4 : num 7.32 6.33 6.42 6.03 6.44 ...
## $ five_14: num 12 12.3 11.3 11.3 11.7 ...
## $ under18: num 22.9 22.3 21 20.8 21.8 ...
## $ over65 : num 14.3 17.8 13.6 19.4 11 ...
## $ over75 : num 7.34 6.99 6.45 8.53 4.59 ...
## $ over85 : num 1.79 1.66 1.85 2.17 1.11 ...
## $ imd : num 49.7 29.9 43.8 34.7 46.5 ...
## $ pop : int 4756 6077 9505 13373 7749 3444 17336 7124 8009 2276 ...
## $ eth1 : num 8.98 0.845 17.214 2.489 16.049 ...
## $ cluster: int 8 8 8 5 8 8 6 8 1 14 ...
## $ imd1 : Factor w/ 10 levels "(15.9,22.2]",..: 7 3 6 3 6 7 2 6 2 9 ...
library(caret)

# Predictors are columns 3-11 of `train` (under4 ... eth1); the outcome is
# column 12 (cluster).
x <- train[, 3:11]
# featurePlot() only draws box plots for a factor outcome. With the raw
# integer cluster codes it returns NULL and nothing is plotted, so the
# outcome is converted to a factor here.
y <- factor(train[, 12])
par(mfrow = c(1, 1))
# Predictors are shown on the log scale.
featurePlot(x = log(x), y = y, plot = "box")
## NULL
par(mfrow = c(1, 1))
# Give every panel its own x and y axis range.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
# Density plots are only produced for a factor outcome; factor() is a no-op
# if `y` is already a factor, and otherwise converts the integer cluster
# codes so that the plot is actually drawn instead of returning NULL.
featurePlot(x = x, y = factor(y), plot = "density", scales = scales)
## NULL
# Load modelling packages with library() so that a missing package stops the
# script immediately, rather than require() returning FALSE and the failure
# surfacing later at the first call into that package.
library(caret)
library(caretEnsemble)
library(rpart)
library(rattle)

# Resampling scheme shared by all caret models below: 10-fold
# cross-validation. `repeats` is spelled out because "repeatedcv" without it
# runs a single repeat (the resamples summary reports 10 resamples).
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
# Metric caret uses to pick the best tuning parameters.
metric <- "Accuracy"
## Tree
# Single classification tree on the nine predictors.
set.seed(7)

# Modelling frame: predictors (columns 3-11) plus the cluster label
# (column 12), with the label recoded as a factor so rpart grows a
# classification tree.
train1 <- train[, 3:12]
train1$cluster <- as.factor(train1$cluster)

fit.tree <- rpart(formula = cluster ~ ., data = train1)

# Complexity-parameter table (cross-validated error per number of splits).
printcp(fit.tree)
##
## Classification tree:
## rpart(formula = cluster ~ ., data = train1)
##
## Variables actually used in tree construction:
## [1] eth1 imd over65 over75 pop under18
##
## Root node error: 5453/6200 = 0.87952
##
## n= 6200
##
## CP nsplit rel error xerror xstd
## 1 0.091601 0 1.00000 1.00660 0.0046010
## 2 0.088392 2 0.81680 0.88813 0.0059706
## 3 0.073721 4 0.64001 0.77700 0.0067167
## 4 0.058317 5 0.56629 0.51366 0.0071862
## 5 0.049147 6 0.50798 0.47845 0.0071288
## 6 0.037227 7 0.45883 0.45828 0.0070829
## 7 0.029158 8 0.42160 0.43261 0.0070106
## 8 0.025124 9 0.39244 0.40198 0.0069032
## 9 0.019622 10 0.36732 0.39501 0.0068755
## 10 0.016321 11 0.34770 0.37832 0.0068039
## 11 0.014304 12 0.33138 0.36475 0.0067403
## 12 0.013754 13 0.31707 0.34788 0.0066541
## 13 0.010636 14 0.30332 0.33009 0.0065544
## 14 0.010000 15 0.29268 0.31781 0.0064800
# Plot cross-validated error against the complexity parameter, then print the
# full node-by-node description of the fitted tree.
plotcp(x = fit.tree)
summary(object = fit.tree)
## Call:
## rpart(formula = cluster ~ ., data = train1)
## n= 6200
##
## CP nsplit rel error xerror xstd
## 1 0.09160095 0 1.0000000 1.0066019 0.004600977
## 2 0.08839171 2 0.8167981 0.8881350 0.005970570
## 3 0.07372089 4 0.6400147 0.7770035 0.006716732
## 4 0.05831652 5 0.5662938 0.5136622 0.007186225
## 5 0.04914726 6 0.5079773 0.4784522 0.007128750
## 6 0.03722721 7 0.4588300 0.4582798 0.007082904
## 7 0.02915826 8 0.4216028 0.4326059 0.007010593
## 8 0.02512379 9 0.3924445 0.4019806 0.006903237
## 9 0.01962223 10 0.3673207 0.3950119 0.006875503
## 10 0.01632129 11 0.3476985 0.3783239 0.006803949
## 11 0.01430405 12 0.3313772 0.3647533 0.006740291
## 12 0.01375390 13 0.3170732 0.3478819 0.006654080
## 13 0.01063635 14 0.3033193 0.3300935 0.006554374
## 14 0.01000000 15 0.2926829 0.3178067 0.006480014
##
## Variable importance
## over65 under18 over75 under4 pop five_14 over85 eth1 imd
## 16 15 13 11 11 9 9 8 8
##
## Node number 1: 6200 observations, complexity param=0.09160095
## predicted class=13 expected loss=0.8795161 P(node) =1
## class counts: 662 559 310 262 720 121 155 492 556 540 549 263 747 264
## probabilities: 0.107 0.090 0.050 0.042 0.116 0.020 0.025 0.079 0.090 0.087 0.089 0.042 0.120 0.043
## left son=2 (1348 obs) right son=3 (4852 obs)
## Primary splits:
## over65 < 21.861 to the right, improve=345.5030, (0 missing)
## pop < 9006 to the left, improve=344.5195, (0 missing)
## over75 < 9.7435 to the right, improve=335.0696, (0 missing)
## eth1 < 35.72555 to the left, improve=325.3097, (0 missing)
## imd < 22.835 to the right, improve=298.3400, (0 missing)
## Surrogate splits:
## over75 < 10.115 to the right, agree=0.934, adj=0.696, (0 split)
## over85 < 3.1025 to the right, agree=0.873, adj=0.416, (0 split)
## under4 < 4.6675 to the left, agree=0.842, adj=0.272, (0 split)
## eth1 < 2.0449 to the left, agree=0.826, adj=0.202, (0 split)
## under18 < 17.9845 to the left, agree=0.817, adj=0.157, (0 split)
##
## Node number 2: 1348 observations, complexity param=0.07372089
## predicted class=2 expected loss=0.615727 P(node) =0.2174194
## class counts: 65 518 0 0 8 15 155 0 0 0 450 0 137 0
## probabilities: 0.048 0.384 0.000 0.000 0.006 0.011 0.115 0.000 0.000 0.000 0.334 0.000 0.102 0.000
## left son=4 (786 obs) right son=5 (562 obs)
## Primary splits:
## pop < 7819.5 to the left, improve=292.85050, (0 missing)
## over75 < 13.997 to the left, improve=146.25090, (0 missing)
## over65 < 30.049 to the right, improve=113.14750, (0 missing)
## over85 < 4.639 to the left, improve=104.70930, (0 missing)
## under18 < 17.82 to the left, improve= 98.00896, (0 missing)
## Surrogate splits:
## under18 < 18.6625 to the left, agree=0.612, adj=0.069, (0 split)
## under4 < 4.8575 to the left, agree=0.604, adj=0.050, (0 split)
## over65 < 22.5555 to the right, agree=0.589, adj=0.014, (0 split)
## imd < 11.576 to the right, agree=0.589, adj=0.014, (0 split)
## five_14 < 12.976 to the left, agree=0.585, adj=0.005, (0 split)
##
## Node number 3: 4852 observations, complexity param=0.09160095
## predicted class=5 expected loss=0.8532564 P(node) =0.7825806
## class counts: 597 41 310 262 712 106 0 492 556 540 99 263 610 264
## probabilities: 0.123 0.008 0.064 0.054 0.147 0.022 0.000 0.101 0.115 0.111 0.020 0.054 0.126 0.054
## left son=6 (1414 obs) right son=7 (3438 obs)
## Primary splits:
## pop < 9103 to the right, improve=322.4685, (0 missing)
## eth1 < 37.325 to the left, improve=320.6162, (0 missing)
## over65 < 15.5585 to the right, improve=306.3342, (0 missing)
## over75 < 6.696 to the right, improve=291.5177, (0 missing)
## imd < 22.345 to the right, improve=265.9353, (0 missing)
## Surrogate splits:
## five_14 < 1.654 to the left, agree=0.710, adj=0.006, (0 split)
## over65 < 0.5565 to the left, agree=0.710, adj=0.006, (0 split)
## under4 < 1.2875 to the left, agree=0.710, adj=0.005, (0 split)
## over75 < 0.1525 to the left, agree=0.710, adj=0.004, (0 split)
## imd < 4.1775 to the left, agree=0.709, adj=0.001, (0 split)
##
## Node number 4: 786 observations, complexity param=0.01430405
## predicted class=2 expected loss=0.3715013 P(node) =0.1267742
## class counts: 50 494 0 0 0 0 93 0 0 0 24 0 125 0
## probabilities: 0.064 0.628 0.000 0.000 0.000 0.000 0.118 0.000 0.000 0.000 0.031 0.000 0.159 0.000
## left son=8 (690 obs) right son=9 (96 obs)
## Primary splits:
## over75 < 13.9835 to the left, improve=102.47360, (0 missing)
## over85 < 4.322 to the left, improve= 71.23109, (0 missing)
## over65 < 29.735 to the left, improve= 63.93385, (0 missing)
## under18 < 18.609 to the left, improve= 63.73358, (0 missing)
## five_14 < 10.392 to the left, improve= 44.66031, (0 missing)
## Surrogate splits:
## over85 < 4.6645 to the left, agree=0.945, adj=0.552, (0 split)
## over65 < 30.099 to the left, agree=0.941, adj=0.521, (0 split)
## under18 < 15.0885 to the right, agree=0.888, adj=0.083, (0 split)
## under4 < 2.5505 to the right, agree=0.883, adj=0.042, (0 split)
##
## Node number 5: 562 observations
## predicted class=11 expected loss=0.2419929 P(node) =0.09064516
## class counts: 15 24 0 0 8 15 62 0 0 0 426 0 12 0
## probabilities: 0.027 0.043 0.000 0.000 0.014 0.027 0.110 0.000 0.000 0.000 0.758 0.000 0.021 0.000
##
## Node number 6: 1414 observations, complexity param=0.02915826
## predicted class=5 expected loss=0.5346535 P(node) =0.2280645
## class counts: 27 0 16 94 658 106 0 54 21 51 88 229 41 29
## probabilities: 0.019 0.000 0.011 0.066 0.465 0.075 0.000 0.038 0.015 0.036 0.062 0.162 0.029 0.021
## left son=12 (901 obs) right son=13 (513 obs)
## Primary splits:
## over65 < 14.771 to the right, improve=176.2167, (0 missing)
## over75 < 6.616 to the right, improve=156.9533, (0 missing)
## over85 < 1.696 to the right, improve=129.1275, (0 missing)
## under18 < 22.794 to the left, improve=123.4090, (0 missing)
## under4 < 7.108 to the left, improve=118.9813, (0 missing)
## Surrogate splits:
## over75 < 6.6225 to the right, agree=0.943, adj=0.842, (0 split)
## over85 < 1.8265 to the right, agree=0.888, adj=0.690, (0 split)
## eth1 < 21.54195 to the left, agree=0.808, adj=0.472, (0 split)
## under4 < 7.08 to the left, agree=0.785, adj=0.407, (0 split)
## under18 < 23.3165 to the left, agree=0.769, adj=0.363, (0 split)
##
## Node number 7: 3438 observations, complexity param=0.08839171
## predicted class=1 expected loss=0.8342059 P(node) =0.5545161
## class counts: 570 41 294 168 54 0 0 438 535 489 11 34 569 235
## probabilities: 0.166 0.012 0.086 0.049 0.016 0.000 0.000 0.127 0.156 0.142 0.003 0.010 0.166 0.068
## left son=14 (2594 obs) right son=15 (844 obs)
## Primary splits:
## eth1 < 35.8134 to the left, improve=309.0286, (0 missing)
## over65 < 16.8655 to the right, improve=304.0387, (0 missing)
## imd < 22.1585 to the right, improve=299.3851, (0 missing)
## over75 < 7.511 to the right, improve=271.1723, (0 missing)
## under18 < 26.2035 to the left, improve=222.0559, (0 missing)
## Surrogate splits:
## over65 < 9.968 to the right, agree=0.820, adj=0.267, (0 split)
## over75 < 4.3015 to the right, agree=0.795, adj=0.165, (0 split)
## over85 < 0.848 to the right, agree=0.795, adj=0.165, (0 split)
## five_14 < 16.003 to the left, agree=0.783, adj=0.115, (0 split)
## under18 < 27.0155 to the left, agree=0.776, adj=0.088, (0 split)
##
## Node number 8: 690 observations
## predicted class=2 expected loss=0.2971014 P(node) =0.1112903
## class counts: 50 485 0 0 0 0 6 0 0 0 24 0 125 0
## probabilities: 0.072 0.703 0.000 0.000 0.000 0.000 0.009 0.000 0.000 0.000 0.035 0.000 0.181 0.000
##
## Node number 9: 96 observations
## predicted class=7 expected loss=0.09375 P(node) =0.01548387
## class counts: 0 9 0 0 0 0 87 0 0 0 0 0 0 0
## probabilities: 0.000 0.094 0.000 0.000 0.000 0.000 0.906 0.000 0.000 0.000 0.000 0.000 0.000 0.000
##
## Node number 12: 901 observations, complexity param=0.0137539
## predicted class=5 expected loss=0.3285239 P(node) =0.1453226
## class counts: 26 0 0 1 605 83 0 34 4 2 88 17 41 0
## probabilities: 0.029 0.000 0.000 0.001 0.671 0.092 0.000 0.038 0.004 0.002 0.098 0.019 0.046 0.000
## left son=24 (76 obs) right son=25 (825 obs)
## Primary splits:
## pop < 16940 to the right, improve=105.02420, (0 missing)
## over65 < 20.6195 to the left, improve= 44.05260, (0 missing)
## over75 < 9.798 to the left, improve= 41.96706, (0 missing)
## imd < 32.912 to the right, improve= 34.11510, (0 missing)
## under4 < 5.3965 to the right, improve= 32.39240, (0 missing)
## Surrogate splits:
## over65 < 21.8435 to the right, agree=0.918, adj=0.026, (0 split)
##
## Node number 13: 513 observations, complexity param=0.01632129
## predicted class=12 expected loss=0.5867446 P(node) =0.08274194
## class counts: 1 0 16 93 53 23 0 20 17 49 0 212 0 29
## probabilities: 0.002 0.000 0.031 0.181 0.103 0.045 0.000 0.039 0.033 0.096 0.000 0.413 0.000 0.057
## left son=26 (103 obs) right son=27 (410 obs)
## Primary splits:
## under18 < 18.0225 to the left, improve=83.83692, (0 missing)
## five_14 < 9.2115 to the left, improve=82.02679, (0 missing)
## under4 < 5.739 to the left, improve=68.70553, (0 missing)
## over65 < 8.143 to the left, improve=27.14185, (0 missing)
## over75 < 3.4145 to the left, improve=24.65588, (0 missing)
## Surrogate splits:
## five_14 < 9.473 to the left, agree=0.965, adj=0.825, (0 split)
## under4 < 5.5035 to the left, agree=0.926, adj=0.631, (0 split)
## over65 < 2.6095 to the left, agree=0.838, adj=0.194, (0 split)
## over75 < 0.633 to the left, agree=0.832, adj=0.165, (0 split)
## over85 < 0.147 to the left, agree=0.828, adj=0.146, (0 split)
##
## Node number 14: 2594 observations, complexity param=0.08839171
## predicted class=13 expected loss=0.7814187 P(node) =0.4183871
## class counts: 553 38 244 108 54 0 0 427 522 32 10 19 567 20
## probabilities: 0.213 0.015 0.094 0.042 0.021 0.000 0.000 0.165 0.201 0.012 0.004 0.007 0.219 0.008
## left son=28 (1447 obs) right son=29 (1147 obs)
## Primary splits:
## imd < 22.333 to the right, improve=313.8706, (0 missing)
## over65 < 16.928 to the left, improve=260.0130, (0 missing)
## over75 < 7.511 to the left, improve=227.7185, (0 missing)
## under18 < 21.2635 to the left, improve=224.7215, (0 missing)
## under4 < 6.323 to the right, improve=199.1127, (0 missing)
## Surrogate splits:
## over65 < 18.8845 to the left, agree=0.618, adj=0.137, (0 split)
## under4 < 5.584 to the right, agree=0.616, adj=0.132, (0 split)
## over85 < 2.251 to the left, agree=0.609, adj=0.116, (0 split)
## over75 < 7.505 to the left, agree=0.601, adj=0.098, (0 split)
## pop < 5524.5 to the left, agree=0.567, adj=0.022, (0 split)
##
## Node number 15: 844 observations, complexity param=0.03722721
## predicted class=10 expected loss=0.4585308 P(node) =0.136129
## class counts: 17 3 50 60 0 0 0 11 13 457 1 15 2 215
## probabilities: 0.020 0.004 0.059 0.071 0.000 0.000 0.000 0.013 0.015 0.541 0.001 0.018 0.002 0.255
## left son=30 (595 obs) right son=31 (249 obs)
## Primary splits:
## under18 < 26.68 to the left, improve=218.48890, (0 missing)
## five_14 < 14.9535 to the left, improve=186.10760, (0 missing)
## under4 < 8.257 to the left, improve=128.59740, (0 missing)
## imd < 43.8185 to the left, improve= 71.07051, (0 missing)
## over65 < 7.3205 to the right, improve= 43.29074, (0 missing)
## Surrogate splits:
## five_14 < 14.9535 to the left, agree=0.957, adj=0.855, (0 split)
## under4 < 8.26 to the left, agree=0.863, adj=0.534, (0 split)
## imd < 43.8185 to the left, agree=0.800, adj=0.321, (0 split)
## over65 < 6.6285 to the right, agree=0.726, adj=0.072, (0 split)
## eth1 < 71.75385 to the left, agree=0.718, adj=0.044, (0 split)
##
## Node number 24: 76 observations
## predicted class=6 expected loss=0.01315789 P(node) =0.01225806
## class counts: 0 0 0 0 0 75 0 0 0 0 0 1 0 0
## probabilities: 0.000 0.000 0.000 0.000 0.000 0.987 0.000 0.000 0.000 0.000 0.000 0.013 0.000 0.000
##
## Node number 25: 825 observations
## predicted class=5 expected loss=0.2666667 P(node) =0.1330645
## class counts: 26 0 0 1 605 8 0 34 4 2 88 16 41 0
## probabilities: 0.032 0.000 0.000 0.001 0.733 0.010 0.000 0.041 0.005 0.002 0.107 0.019 0.050 0.000
##
## Node number 26: 103 observations
## predicted class=4 expected loss=0.1359223 P(node) =0.0166129
## class counts: 1 0 0 89 8 3 0 0 0 2 0 0 0 0
## probabilities: 0.010 0.000 0.000 0.864 0.078 0.029 0.000 0.000 0.000 0.019 0.000 0.000 0.000 0.000
##
## Node number 27: 410 observations
## predicted class=12 expected loss=0.4829268 P(node) =0.06612903
## class counts: 0 0 16 4 45 20 0 20 17 47 0 212 0 29
## probabilities: 0.000 0.000 0.039 0.010 0.110 0.049 0.000 0.049 0.041 0.115 0.000 0.517 0.000 0.071
##
## Node number 28: 1447 observations, complexity param=0.05831652
## predicted class=1 expected loss=0.6434001 P(node) =0.2333871
## class counts: 516 11 200 67 19 0 0 427 145 25 3 10 6 18
## probabilities: 0.357 0.008 0.138 0.046 0.013 0.000 0.000 0.295 0.100 0.017 0.002 0.007 0.004 0.012
## left son=56 (685 obs) right son=57 (762 obs)
## Primary splits:
## under18 < 21.307 to the left, improve=229.8899, (0 missing)
## over65 < 16.8325 to the right, improve=220.4252, (0 missing)
## five_14 < 11.49 to the left, improve=185.1137, (0 missing)
## under4 < 6.3565 to the left, improve=183.0349, (0 missing)
## over75 < 7.5455 to the right, improve=180.3142, (0 missing)
## Surrogate splits:
## five_14 < 11.4725 to the left, agree=0.918, adj=0.826, (0 split)
## under4 < 6.2435 to the left, agree=0.865, adj=0.715, (0 split)
## over65 < 16.924 to the right, agree=0.744, adj=0.458, (0 split)
## over75 < 7.4525 to the right, agree=0.713, adj=0.393, (0 split)
## over85 < 1.9285 to the right, agree=0.654, adj=0.269, (0 split)
##
## Node number 29: 1147 observations, complexity param=0.04914726
## predicted class=13 expected loss=0.510898 P(node) =0.185
## class counts: 37 27 44 41 35 0 0 0 377 7 7 9 561 2
## probabilities: 0.032 0.024 0.038 0.036 0.031 0.000 0.000 0.000 0.329 0.006 0.006 0.008 0.489 0.002
## left son=58 (489 obs) right son=59 (658 obs)
## Primary splits:
## over65 < 16.9345 to the left, improve=221.7107, (0 missing)
## over75 < 7.5105 to the left, improve=204.6501, (0 missing)
## under4 < 5.9945 to the right, improve=142.0264, (0 missing)
## under18 < 21.051 to the right, improve=139.4877, (0 missing)
## over85 < 1.9305 to the left, improve=129.8850, (0 missing)
## Surrogate splits:
## over75 < 7.2965 to the left, agree=0.898, adj=0.761, (0 split)
## over85 < 1.7745 to the left, agree=0.792, adj=0.511, (0 split)
## under4 < 6.4195 to the right, agree=0.736, adj=0.380, (0 split)
## under18 < 21.672 to the right, agree=0.717, adj=0.335, (0 split)
## eth1 < 9.2766 to the right, agree=0.684, adj=0.258, (0 split)
##
## Node number 30: 595 observations, complexity param=0.01063635
## predicted class=10 expected loss=0.2453782 P(node) =0.09596774
## class counts: 17 3 24 60 0 0 0 11 13 449 1 11 2 4
## probabilities: 0.029 0.005 0.040 0.101 0.000 0.000 0.000 0.018 0.022 0.755 0.002 0.018 0.003 0.007
## left son=60 (62 obs) right son=61 (533 obs)
## Primary splits:
## under18 < 16.4995 to the left, improve=87.86096, (0 missing)
## five_14 < 8.462 to the left, improve=67.71882, (0 missing)
## under4 < 4.9785 to the left, improve=38.02655, (0 missing)
## eth1 < 42.7763 to the left, improve=22.31810, (0 missing)
## over65 < 15.581 to the right, improve=16.92980, (0 missing)
## Surrogate splits:
## five_14 < 8.462 to the left, agree=0.975, adj=0.758, (0 split)
## under4 < 4.6065 to the left, agree=0.933, adj=0.355, (0 split)
## over75 < 0.1885 to the left, agree=0.901, adj=0.048, (0 split)
## over65 < 0.793 to the left, agree=0.899, adj=0.032, (0 split)
## pop < 966 to the left, agree=0.899, adj=0.032, (0 split)
##
## Node number 31: 249 observations
## predicted class=14 expected loss=0.1526104 P(node) =0.04016129
## class counts: 0 0 26 0 0 0 0 0 0 8 0 4 0 211
## probabilities: 0.000 0.000 0.104 0.000 0.000 0.000 0.000 0.000 0.000 0.032 0.000 0.016 0.000 0.847
##
## Node number 56: 685 observations
## predicted class=1 expected loss=0.3080292 P(node) =0.1104839
## class counts: 474 11 0 67 9 0 0 67 30 19 3 0 5 0
## probabilities: 0.692 0.016 0.000 0.098 0.013 0.000 0.000 0.098 0.044 0.028 0.004 0.000 0.007 0.000
##
## Node number 57: 762 observations, complexity param=0.02512379
## predicted class=8 expected loss=0.5275591 P(node) =0.1229032
## class counts: 42 0 200 0 10 0 0 360 115 6 0 10 1 18
## probabilities: 0.055 0.000 0.262 0.000 0.013 0.000 0.000 0.472 0.151 0.008 0.000 0.013 0.001 0.024
## left son=114 (190 obs) right son=115 (572 obs)
## Primary splits:
## under18 < 26.181 to the right, improve=120.20380, (0 missing)
## over65 < 11.6925 to the left, improve=114.70050, (0 missing)
## under4 < 8.106 to the right, improve=106.52990, (0 missing)
## imd < 29.283 to the right, improve=104.70090, (0 missing)
## over75 < 4.679 to the left, improve= 85.27674, (0 missing)
## Surrogate splits:
## five_14 < 14.179 to the right, agree=0.920, adj=0.679, (0 split)
## under4 < 8.4845 to the right, agree=0.871, adj=0.484, (0 split)
## over65 < 11.6925 to the left, agree=0.837, adj=0.347, (0 split)
## over75 < 3.407 to the left, agree=0.815, adj=0.258, (0 split)
## over85 < 0.9075 to the left, agree=0.797, adj=0.184, (0 split)
##
## Node number 58: 489 observations
## predicted class=9 expected loss=0.3578732 P(node) =0.07887097
## class counts: 14 0 44 41 12 0 0 0 314 7 0 9 46 2
## probabilities: 0.029 0.000 0.090 0.084 0.025 0.000 0.000 0.000 0.642 0.014 0.000 0.018 0.094 0.004
##
## Node number 59: 658 observations
## predicted class=13 expected loss=0.2173252 P(node) =0.106129
## class counts: 23 27 0 0 23 0 0 0 63 0 7 0 515 0
## probabilities: 0.035 0.041 0.000 0.000 0.035 0.000 0.000 0.000 0.096 0.000 0.011 0.000 0.783 0.000
##
## Node number 60: 62 observations
## predicted class=4 expected loss=0.06451613 P(node) =0.01
## class counts: 3 1 0 58 0 0 0 0 0 0 0 0 0 0
## probabilities: 0.048 0.016 0.000 0.935 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
##
## Node number 61: 533 observations
## predicted class=10 expected loss=0.1575985 P(node) =0.08596774
## class counts: 14 2 24 2 0 0 0 11 13 449 1 11 2 4
## probabilities: 0.026 0.004 0.045 0.004 0.000 0.000 0.000 0.021 0.024 0.842 0.002 0.021 0.004 0.008
##
## Node number 114: 190 observations
## predicted class=3 expected loss=0.1947368 P(node) =0.03064516
## class counts: 0 0 153 0 0 0 0 16 1 0 0 2 0 18
## probabilities: 0.000 0.000 0.805 0.000 0.000 0.000 0.000 0.084 0.005 0.000 0.000 0.011 0.000 0.095
##
## Node number 115: 572 observations, complexity param=0.01962223
## predicted class=8 expected loss=0.3986014 P(node) =0.09225806
## class counts: 42 0 47 0 10 0 0 344 114 6 0 8 1 0
## probabilities: 0.073 0.000 0.082 0.000 0.017 0.000 0.000 0.601 0.199 0.010 0.000 0.014 0.002 0.000
## left son=230 (401 obs) right son=231 (171 obs)
## Primary splits:
## imd < 29.782 to the right, improve=130.99220, (0 missing)
## over65 < 10.206 to the left, improve= 32.60214, (0 missing)
## over75 < 3.9495 to the left, improve= 25.13173, (0 missing)
## under4 < 8.117 to the right, improve= 22.50324, (0 missing)
## over85 < 0.766 to the left, improve= 15.43288, (0 missing)
## Surrogate splits:
## under4 < 4.5685 to the right, agree=0.705, adj=0.012, (0 split)
## five_14 < 9.8385 to the right, agree=0.705, adj=0.012, (0 split)
## eth1 < 1.393 to the right, agree=0.703, adj=0.006, (0 split)
##
## Node number 230: 401 observations
## predicted class=8 expected loss=0.1571072 P(node) =0.06467742
## class counts: 23 0 34 0 1 0 0 338 1 2 0 2 0 0
## probabilities: 0.057 0.000 0.085 0.000 0.002 0.000 0.000 0.843 0.002 0.005 0.000 0.005 0.000 0.000
##
## Node number 231: 171 observations
## predicted class=9 expected loss=0.3391813 P(node) =0.02758065
## class counts: 19 0 13 0 9 0 0 6 113 4 0 6 1 0
## probabilities: 0.111 0.000 0.076 0.000 0.053 0.000 0.000 0.035 0.661 0.023 0.000 0.035 0.006 0.000
# Prints the cp table again and draws the rsq.rpart diagnostic plots.
# NOTE(review): fit.tree is a classification tree (see the printcp output);
# rsq.rpart's R-square labelling is presumably meant for regression
# ("anova") trees -- confirm this plot is wanted here.
rsq.rpart(fit.tree)
##
## Classification tree:
## rpart(formula = cluster ~ ., data = train1)
##
## Variables actually used in tree construction:
## [1] eth1 imd over65 over75 pop under18
##
## Root node error: 5453/6200 = 0.87952
##
## n= 6200
##
## CP nsplit rel error xerror xstd
## 1 0.091601 0 1.00000 1.00660 0.0046010
## 2 0.088392 2 0.81680 0.88813 0.0059706
## 3 0.073721 4 0.64001 0.77700 0.0067167
## 4 0.058317 5 0.56629 0.51366 0.0071862
## 5 0.049147 6 0.50798 0.47845 0.0071288
## 6 0.037227 7 0.45883 0.45828 0.0070829
## 7 0.029158 8 0.42160 0.43261 0.0070106
## 8 0.025124 9 0.39244 0.40198 0.0069032
## 9 0.019622 10 0.36732 0.39501 0.0068755
## 10 0.016321 11 0.34770 0.37832 0.0068039
## 11 0.014304 12 0.33138 0.36475 0.0067403
## 12 0.013754 13 0.31707 0.34788 0.0066541
## 13 0.010636 14 0.30332 0.33009 0.0065544
## 14 0.010000 15 0.29268 0.31781 0.0064800
# Base-graphics rendering of the tree. The title now says "Classification"
# because the outcome is a factor and rpart reports a classification tree;
# the previous "Regression Tree" label was misleading.
plot(fit.tree,
  uniform = TRUE,
  main = "Classification Tree for practices "
)
# Label every node (all = TRUE) and include the per-class counts (use.n).
text(fit.tree, use.n = TRUE, all = TRUE, cex = 0.8)

# Same tree drawn with rattle's formatted plot.
fancyRpartPlot(fit.tree, palettes = c("Reds", "Blues"))
# Random forest
# The RNG is re-seeded with the same value before each caret fit.
set.seed(7)
fit.rf <- train(
  cluster ~ .,
  data = train1,
  method = "rf",
  trControl = trainControl,
  metric = metric
)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Linear discriminant analysis
set.seed(7)
fit.lda <- train(
  cluster ~ .,
  data = train1,
  method = "lda",
  trControl = trainControl,
  metric = metric
)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Support vector machine (radial kernel)
set.seed(7)
fit.svm <- train(
  cluster ~ .,
  data = train1,
  method = "svmRadial",
  trControl = trainControl,
  metric = metric
)
## Loading required package: kernlab
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
# K nearest neighbours
# KNN classifies by distance between predictor vectors, so predictors on a
# large scale (pop is in the thousands; the others are mostly below 100)
# dominate the distance unless the data are standardised first. Centring
# and scaling inside train() applies the transformation within each
# resample and again automatically at prediction time.
set.seed(7)
fit.knn <- train(
  cluster ~ .,
  data = train1,
  method = "knn",
  preProcess = c("center", "scale"),
  metric = metric,
  trControl = trainControl
)
# Gather the cross-validation results of the four caret models and tabulate
# their resampled Accuracy and Kappa distributions.
results <- resamples(
  list(
    RF = fit.rf,
    LDA = fit.lda,
    SVM = fit.svm,
    KNN = fit.knn
  )
)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: RF, LDA, SVM, KNN
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 0.8968 0.9046 0.9161 0.9121 0.9193 0.9208 0
## LDA 0.8824 0.8956 0.8999 0.9032 0.9108 0.9324 0
## SVM 0.9436 0.9552 0.9588 0.9589 0.9646 0.9710 0
## KNN 0.3635 0.3748 0.3968 0.3921 0.4060 0.4129 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 0.8870 0.8955 0.9082 0.9038 0.9116 0.9133 0
## LDA 0.8710 0.8854 0.8901 0.8938 0.9021 0.9259 0
## SVM 0.9383 0.9509 0.9549 0.9550 0.9612 0.9683 0
## KNN 0.2969 0.3100 0.3343 0.3288 0.3439 0.3518 0
# Dot plot of the resampling results collected in `results`, for a visual
# comparison of the models.
dotplot(results)
Random forest and support vector machine models appear most accurate, with mean cross-validated accuracy of about 91% and 96% respectively.
# Evaluate the chosen model (SVM) on the held-out practices.
set.seed(7)
predictions <- predict(fit.svm, test)

# confusionMatrix() expects two factors with identical levels. The model can
# only predict clusters present in the training data, while `test$cluster`
# is an integer column that may contain a cluster the model never saw, so
# both are recoded onto the combined, numerically ordered set of labels.
cluster_levels <- as.character(
  sort(unique(c(as.integer(levels(predictions)), test$cluster)))
)
predicted <- factor(as.character(predictions), levels = cluster_levels)
observed <- factor(as.character(test$cluster), levels = cluster_levels)

confusionMatrix(predicted, observed)
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(predictions, test$cluster): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1 151 3 0 0 0 0 0 0 1 0 0 0 1 0 0
## 2 0 143 0 0 0 0 2 0 0 0 0 0 0 0 0
## 3 0 0 78 0 0 0 0 0 4 1 0 0 0 1 0
## 4 1 0 0 63 0 1 0 0 0 1 0 0 1 0 1
## 5 2 0 0 0 181 5 0 0 2 0 2 0 0 0 0
## 6 0 0 0 0 0 31 0 0 0 0 0 0 0 0 0
## 7 0 1 0 0 0 0 31 0 0 0 1 0 0 0 0
## 8 1 0 0 0 1 0 0 139 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 143 1 0 3 2 0 0
## 10 1 1 2 1 1 0 0 1 5 121 0 0 1 2 0
## 11 0 1 0 0 0 1 0 0 0 0 128 0 1 0 0
## 12 0 0 1 0 0 0 0 0 0 0 0 62 0 0 0
## 13 1 2 0 0 0 0 0 0 0 0 1 0 158 0 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 59 0
## 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.949, 0.9692)
## No Information Rate : 0.1181
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9562
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.96178 0.94702 0.96296 0.98438 0.9891 0.81579
## Specificity 0.99641 0.99857 0.99592 0.99664 0.9920 1.00000
## Pos Pred Value 0.96795 0.98621 0.92857 0.92647 0.9427 1.00000
## Neg Pred Value 0.99570 0.99431 0.99795 0.99933 0.9985 0.99539
## Prevalence 0.10129 0.09742 0.05226 0.04129 0.1181 0.02452
## Detection Rate 0.09742 0.09226 0.05032 0.04065 0.1168 0.02000
## Detection Prevalence 0.10065 0.09355 0.05419 0.04387 0.1239 0.02000
## Balanced Accuracy 0.97910 0.97280 0.97944 0.99051 0.9905 0.90789
## Class: 7 Class: 8 Class: 9 Class: 10 Class: 11
## Sensitivity 0.93939 0.99286 0.92258 0.97581 0.96970
## Specificity 0.99868 0.99858 0.99570 0.98948 0.99788
## Pos Pred Value 0.93939 0.98582 0.95973 0.88971 0.97710
## Neg Pred Value 0.99868 0.99929 0.99143 0.99788 0.99718
## Prevalence 0.02129 0.09032 0.10000 0.08000 0.08516
## Detection Rate 0.02000 0.08968 0.09226 0.07806 0.08258
## Detection Prevalence 0.02129 0.09097 0.09613 0.08774 0.08452
## Balanced Accuracy 0.96904 0.99572 0.95914 0.98264 0.98379
## Class: 12 Class: 13 Class: 14 Class: 15
## Sensitivity 0.95385 0.9634 0.95161 0.0000000
## Specificity 0.99933 0.9971 1.00000 1.0000000
## Pos Pred Value 0.98413 0.9753 1.00000 NaN
## Neg Pred Value 0.99798 0.9957 0.99799 0.9993548
## Prevalence 0.04194 0.1058 0.04000 0.0006452
## Detection Rate 0.04000 0.1019 0.03806 0.0000000
## Detection Prevalence 0.04065 0.1045 0.03806 0.0000000
## Balanced Accuracy 0.97659 0.9803 0.97581 0.5000000