# Lesson 5
# DT using titanic data set for classification
# DT can work with both categorical and numeric variables so we don't need to convert all variables to numeric
# process: create forks/splits on feature that best separates and recursiveley for sub forks
# Pros: model interpretability is high as it's white box, works with both numeric and discrete data
# Cons: subject to overfitting - RF addresses that by creating an ensemble.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
titanic = read.csv("https://goo.gl/At238b")
str(titanic) # describe values and variable types, notice NA values in age
## 'data.frame': 1309 obs. of 14 variables:
## $ pclass : Factor w/ 1 level "1st": 1 1 1 1 1 1 1 1 1 1 ...
## $ survived : int 1 1 0 0 0 1 1 0 1 0 ...
## $ name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : num 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : Factor w/ 929 levels "110152","110413",..: 188 50 50 50 50 125 93 16 77 826 ...
## $ fare : num 211 152 152 152 152 ...
## $ cabin : Factor w/ 186 levels "A10","A11","A14",..: 44 80 80 80 80 150 146 16 62 NA ...
## $ embarked : Factor w/ 3 levels "C","Q","S": 3 3 3 3 3 3 3 3 3 1 ...
## $ boat : Factor w/ 27 levels "1","10","11",..: 12 3 NA NA NA 13 2 NA 27 NA ...
## $ body : int NA NA NA 135 NA NA NA NA NA 22 ...
## $ home.dest: Factor w/ 369 levels "?Havana, Cuba",..: 309 231 231 231 231 237 162 24 22 229 ...
nrow(titanic)
## [1] 1309
head(titanic)
## pclass survived name sex
## 1 1st 1 Allen, Miss. Elisabeth Walton female
## 2 1st 1 Allison, Master. Hudson Trevor male
## 3 1st 0 Allison, Miss. Helen Loraine female
## 4 1st 0 Allison, Mr. Hudson Joshua Creighton male
## 5 1st 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female
## 6 1st 1 Anderson, Mr. Harry male
## age sibsp parch ticket fare cabin embarked boat body
## 1 29.00 0 0 24160 211.3375 B5 S 2 NA
## 2 0.92 1 2 113781 151.5500 C22 C26 S 11 NA
## 3 2.00 1 2 113781 151.5500 C22 C26 S <NA> NA
## 4 30.00 1 2 113781 151.5500 C22 C26 S <NA> 135
## 5 25.00 1 2 113781 151.5500 C22 C26 S <NA> NA
## 6 48.00 0 0 19952 26.5500 E12 S 3 NA
## home.dest
## 1 St Louis, MO
## 2 Montreal, PQ / Chesterville, ON
## 3 Montreal, PQ / Chesterville, ON
## 4 Montreal, PQ / Chesterville, ON
## 5 Montreal, PQ / Chesterville, ON
## 6 New York, NY
titanic = titanic %>% filter(age > 0) %>% select(-body, -boat, -cabin, -home.dest) # remove non useful variables
titanic[!complete.cases(titanic),] # check if na values
## pclass survived name sex age
## 149 1st 1 Icard, Miss. Amelie female 38.0
## 250 1st 1 Stone, Mrs. George Nelson (Martha Evelyn) female 62.0
## 985 1st 0 Storey, Mr. Thomas male 60.5
## sibsp parch ticket fare embarked
## 149 0 0 113572 80 <NA>
## 250 0 0 113572 80 <NA>
## 985 0 0 3701 NA S
titanic = na.omit(titanic) # remove na values
titanic[!complete.cases(titanic),] # verify no na values
## [1] pclass survived name sex age sibsp parch
## [8] ticket fare embarked
## <0 rows> (or 0-length row.names)
nrow(titanic)
## [1] 1043
# select relevant variables to train
titanic = titanic %>% select(sex, age, sibsp, parch, fare, survived, embarked)
head(titanic) # our data set is ready for ML
## sex age sibsp parch fare survived embarked
## 1 female 29.00 0 0 211.3375 1 S
## 2 male 0.92 1 2 151.5500 1 S
## 3 female 2.00 1 2 151.5500 0 S
## 4 male 30.00 1 2 151.5500 0 S
## 5 female 25.00 1 2 151.5500 0 S
## 6 male 48.00 0 0 26.5500 1 S
(n = nrow(titanic))
## [1] 1043
trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
train = titanic[trainIndex ,]
test = titanic[-trainIndex ,]
library(rpart)
library(rpart.plot)
dtModel = rpart(survived~., data = train, method = 'class') # don't need to figure imp features as DT will figure it out
dtModel # difficult to interpret so we will use a plot
## n= 834
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 834 332 0 (0.60191847 0.39808153)
## 2) sex=male 528 106 0 (0.79924242 0.20075758)
## 4) age>=14.25 483 83 0 (0.82815735 0.17184265) *
## 5) age< 14.25 45 22 1 (0.48888889 0.51111111)
## 10) sibsp>=2.5 17 1 0 (0.94117647 0.05882353) *
## 11) sibsp< 2.5 28 6 1 (0.21428571 0.78571429) *
## 3) sex=female 306 80 1 (0.26143791 0.73856209)
## 6) fare< 48.2021 220 77 1 (0.35000000 0.65000000)
## 12) sibsp>=2.5 15 4 0 (0.73333333 0.26666667) *
## 13) sibsp< 2.5 205 66 1 (0.32195122 0.67804878)
## 26) parch>=3.5 7 1 0 (0.85714286 0.14285714) *
## 27) parch< 3.5 198 60 1 (0.30303030 0.69696970) *
## 7) fare>=48.2021 86 3 1 (0.03488372 0.96511628) *
rpart.plot(dtModel)
# now apply model to test data
predictSurvivedDT = predict(dtModel, newdata = test, type="class")
# compare accuracy to eariler models
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
accuracy_DT = Accuracy(y_pred = predictSurvivedDT, y_true = test$survived)
# Ensemble model approaches
# Average, e.g. mean of predicted values from DT + LR on the same data.
# Majority, e.g. mode/most common predicted value from DT, LR, ... on the same data.
# Bagging, e.g. RF. Building multiple models (typically of the same type) from different subsamples of the training dataset.
# Boosting, e.g. GB. Building multiple models (typically of the same type) each of which learns to fix the prediction errors of a prior model in the chain.
# Stacking, building multiple models (typically of differing types) and supervisor model and then aggregate or select one
# let's try bagging
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
##
## Attaching package: 'caret'
## The following objects are masked from 'package:MLmetrics':
##
## MAE, RMSE

rftrain = na.omit(train)
control = trainControl(method="cv", number=10) # ten folds for cross validation
tunegrid <- expand.grid(.mtry=c(1:6))
rfModel = train(as.factor(survived)~., data=rftrain, method="rf", metric='Accuracy', tuneGrid=tunegrid, trControl=control)
print(rfModel)
## Random Forest
##
## 834 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 750, 751, 751, 751, 751, 751, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.7674125 0.4879084
## 2 0.7710413 0.5146885
## 3 0.7757889 0.5266299
## 4 0.7709696 0.5172968
## 5 0.7662220 0.5079688
## 6 0.7602266 0.4952580
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
plot(rfModel)

# predict and observe accuracy
predictSurvivedRF = predict(rfModel, newdata = test)
accuracy_RF = Accuracy(y_pred = predictSurvivedRF, y_true = test$survived)
# stacking to compare several models
library(caretEnsemble)
##
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
##
## autoplot
control = trainControl(method="cv", number=10)
algorithmList <- c('lda', 'rpart', 'glm', 'knn', 'svmRadial')
rfModels = caretList(as.factor(survived)~., data=rftrain, methodList=algorithmList, trControl=control)
## Warning in trControlCheck(x = trControl, y = target):
## trControl$savePredictions not 'all' or 'final'. Setting to 'final' so we
## can ensemble the models.
## Warning in trControlCheck(x = trControl, y = target): indexes not defined
## in trControl. Attempting to set them ourselves, so each model in the
## ensemble will have the same resampling indexes.
results = resamples(rfModels)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, rpart, glm, knn, svmRadial
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.6867470 0.7597533 0.7891566 0.7769793 0.7976190 0.8192771 0
## rpart 0.7108434 0.7374139 0.8072289 0.7842226 0.8208907 0.8333333 0
## glm 0.6867470 0.7641997 0.7784710 0.7769937 0.8042169 0.8313253 0
## knn 0.5952381 0.6304862 0.6867470 0.6714429 0.7014845 0.7261905 0
## svmRadial 0.7142857 0.7612952 0.8012048 0.7950373 0.8349111 0.8554217 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.3719441 0.4980242 0.5517300 0.5311399 0.5696198 0.6207737 0
## rpart 0.3879781 0.4518010 0.5933864 0.5487453 0.6259381 0.6478788 0
## glm 0.3780980 0.4919093 0.5286971 0.5294487 0.5821116 0.6442131 0
## knn 0.1233886 0.1902235 0.3314640 0.2908080 0.3714968 0.4290780 0
## svmRadial 0.3898305 0.4795758 0.5751143 0.5626519 0.6492806 0.6918317 0
dotplot(results)

(predictSurvivedStacking = predict(rfModels, newdata = test))
## lda rpart glm knn svmRadial
## [1,] "1" "1" "1" "1" "1"
## [2,] "0" "0" "0" "0" "0"
## [3,] "1" "0" "1" "1" "0"
## [4,] "1" "1" "1" "1" "1"
## [5,] "1" "1" "1" "1" "1"
## [6,] "0" "0" "0" "1" "0"
## [7,] "1" "1" "1" "1" "1"
## [8,] "1" "1" "1" "1" "1"
## [9,] "1" "1" "1" "0" "1"
## [10,] "1" "1" "1" "1" "1"
## [11,] "1" "1" "1" "1" "1"
## [12,] "0" "0" "0" "0" "0"
## [13,] "1" "1" "1" "1" "0"
## [14,] "0" "0" "0" "0" "0"
## [15,] "1" "1" "1" "1" "1"
## [16,] "0" "0" "0" "1" "0"
## [17,] "1" "1" "1" "1" "1"
## [18,] "0" "0" "0" "1" "0"
## [19,] "1" "1" "1" "0" "1"
## [20,] "0" "0" "0" "1" "0"
## [21,] "0" "0" "0" "1" "0"
## [22,] "0" "0" "0" "0" "0"
## [23,] "0" "0" "0" "0" "0"
## [24,] "1" "1" "1" "1" "1"
## [25,] "1" "1" "1" "1" "1"
## [26,] "0" "0" "0" "0" "0"
## [27,] "1" "1" "1" "1" "1"
## [28,] "1" "1" "1" "1" "1"
## [29,] "1" "1" "1" "1" "0"
## [30,] "1" "1" "1" "1" "1"
## [31,] "0" "0" "1" "1" "0"
## [32,] "0" "0" "0" "0" "0"
## [33,] "1" "1" "1" "1" "1"
## [34,] "1" "1" "1" "1" "1"
## [35,] "0" "0" "0" "1" "0"
## [36,] "1" "1" "1" "1" "1"
## [37,] "0" "0" "0" "0" "0"
## [38,] "1" "1" "1" "1" "1"
## [39,] "1" "1" "1" "1" "1"
## [40,] "0" "0" "0" "0" "0"
## [41,] "0" "0" "0" "0" "0"
## [42,] "0" "0" "0" "1" "0"
## [43,] "1" "0" "1" "1" "0"
## [44,] "1" "1" "1" "0" "1"
## [45,] "1" "1" "1" "1" "1"
## [46,] "0" "0" "0" "1" "0"
## [47,] "0" "0" "0" "0" "0"
## [48,] "1" "1" "1" "1" "1"
## [49,] "0" "0" "1" "1" "0"
## [50,] "0" "0" "0" "0" "0"
## [51,] "0" "0" "0" "0" "0"
## [52,] "1" "1" "1" "1" "1"
## [53,] "0" "0" "0" "1" "0"
## [54,] "0" "0" "0" "1" "0"
## [55,] "1" "1" "1" "1" "1"
## [56,] "1" "1" "1" "1" "1"
## [57,] "0" "0" "0" "0" "0"
## [58,] "1" "1" "1" "0" "1"
## [59,] "1" "1" "1" "1" "1"
## [60,] "0" "0" "0" "0" "0"
## [61,] "1" "1" "1" "1" "1"
## [62,] "1" "1" "1" "0" "1"
## [63,] "0" "0" "0" "0" "0"
## [64,] "0" "0" "0" "0" "0"
## [65,] "1" "1" "1" "0" "1"
## [66,] "0" "0" "0" "0" "0"
## [67,] "1" "1" "1" "0" "1"
## [68,] "1" "1" "1" "0" "1"
## [69,] "0" "1" "0" "1" "1"
## [70,] "1" "1" "1" "0" "1"
## [71,] "1" "1" "1" "1" "1"
## [72,] "1" "1" "1" "1" "1"
## [73,] "1" "1" "1" "1" "1"
## [74,] "0" "1" "0" "0" "1"
## [75,] "1" "1" "1" "0" "1"
## [76,] "1" "1" "1" "1" "1"
## [77,] "0" "0" "0" "0" "0"
## [78,] "0" "0" "0" "0" "0"
## [79,] "0" "0" "0" "0" "0"
## [80,] "0" "0" "0" "0" "0"
## [81,] "0" "0" "0" "0" "0"
## [82,] "0" "0" "0" "0" "0"
## [83,] "0" "0" "0" "0" "0"
## [84,] "0" "0" "0" "0" "0"
## [85,] "1" "1" "1" "1" "1"
## [86,] "0" "0" "0" "0" "0"
## [87,] "0" "0" "0" "0" "0"
## [88,] "0" "0" "0" "0" "0"
## [89,] "1" "1" "1" "0" "1"
## [90,] "1" "1" "1" "0" "1"
## [91,] "0" "0" "0" "1" "0"
## [92,] "1" "1" "1" "0" "1"
## [93,] "1" "1" "1" "0" "1"
## [94,] "0" "0" "0" "0" "0"
## [95,] "0" "0" "0" "0" "0"
## [96,] "0" "0" "0" "0" "0"
## [97,] "1" "1" "1" "0" "1"
## [98,] "0" "0" "0" "0" "0"
## [99,] "0" "0" "0" "0" "0"
## [100,] "0" "0" "0" "0" "0"
## [101,] "0" "0" "0" "0" "0"
## [102,] "0" "1" "0" "1" "1"
## [103,] "0" "0" "0" "0" "0"
## [104,] "1" "1" "1" "0" "1"
## [105,] "1" "1" "1" "1" "1"
## [106,] "0" "0" "0" "0" "0"
## [107,] "0" "0" "0" "0" "0"
## [108,] "0" "1" "0" "1" "0"
## [109,] "0" "0" "0" "0" "0"
## [110,] "0" "0" "0" "1" "0"
## [111,] "1" "1" "1" "0" "1"
## [112,] "1" "1" "1" "0" "1"
## [113,] "1" "1" "1" "0" "1"
## [114,] "1" "1" "1" "0" "1"
## [115,] "1" "1" "1" "1" "1"
## [116,] "1" "1" "1" "0" "1"
## [117,] "1" "1" "1" "0" "1"
## [118,] "0" "0" "0" "0" "0"
## [119,] "1" "1" "1" "0" "1"
## [120,] "0" "1" "0" "0" "1"
## [121,] "0" "0" "0" "0" "0"
## [122,] "0" "0" "0" "0" "0"
## [123,] "0" "0" "0" "0" "0"
## [124,] "0" "0" "0" "0" "0"
## [125,] "0" "0" "0" "0" "0"
## [126,] "1" "1" "1" "1" "1"
## [127,] "1" "1" "1" "1" "1"
## [128,] "0" "0" "0" "0" "0"
## [129,] "0" "0" "0" "0" "0"
## [130,] "1" "1" "1" "0" "0"
## [131,] "0" "0" "0" "0" "0"
## [132,] "1" "1" "1" "0" "1"
## [133,] "1" "1" "1" "0" "1"
## [134,] "1" "1" "1" "0" "1"
## [135,] "0" "0" "0" "0" "0"
## [136,] "0" "0" "0" "0" "0"
## [137,] "1" "1" "0" "0" "0"
## [138,] "0" "0" "0" "0" "0"
## [139,] "0" "0" "0" "0" "0"
## [140,] "0" "0" "0" "0" "0"
## [141,] "0" "0" "0" "0" "0"
## [142,] "0" "1" "0" "1" "1"
## [143,] "0" "0" "0" "0" "0"
## [144,] "0" "0" "0" "0" "0"
## [145,] "1" "1" "1" "0" "1"
## [146,] "0" "0" "0" "0" "0"
## [147,] "0" "0" "0" "0" "0"
## [148,] "0" "0" "0" "0" "0"
## [149,] "1" "1" "1" "0" "1"
## [150,] "0" "0" "0" "0" "0"
## [151,] "1" "1" "1" "1" "1"
## [152,] "0" "0" "0" "0" "0"
## [153,] "0" "0" "0" "0" "0"
## [154,] "0" "0" "0" "0" "0"
## [155,] "1" "1" "1" "1" "0"
## [156,] "0" "0" "0" "0" "0"
## [157,] "0" "0" "0" "0" "0"
## [158,] "0" "0" "0" "0" "0"
## [159,] "0" "0" "0" "0" "0"
## [160,] "0" "0" "0" "0" "0"
## [161,] "0" "0" "0" "0" "0"
## [162,] "0" "0" "0" "0" "0"
## [163,] "1" "1" "1" "0" "1"
## [164,] "1" "1" "1" "0" "1"
## [165,] "0" "0" "0" "0" "0"
## [166,] "0" "0" "0" "0" "0"
## [167,] "1" "1" "1" "0" "0"
## [168,] "1" "1" "1" "0" "1"
## [169,] "0" "0" "0" "0" "0"
## [170,] "0" "0" "0" "0" "0"
## [171,] "0" "0" "0" "0" "0"
## [172,] "0" "0" "0" "0" "0"
## [173,] "0" "1" "0" "0" "1"
## [174,] "0" "0" "0" "0" "0"
## [175,] "0" "0" "0" "0" "0"
## [176,] "1" "1" "1" "0" "1"
## [177,] "0" "0" "0" "0" "0"
## [178,] "0" "0" "0" "0" "0"
## [179,] "1" "1" "1" "0" "1"
## [180,] "0" "0" "0" "0" "0"
## [181,] "0" "0" "0" "0" "0"
## [182,] "0" "0" "0" "0" "0"
## [183,] "1" "1" "1" "0" "1"
## [184,] "0" "0" "0" "0" "0"
## [185,] "0" "0" "0" "0" "0"
## [186,] "0" "0" "0" "0" "0"
## [187,] "0" "0" "0" "0" "0"
## [188,] "0" "0" "0" "1" "0"
## [189,] "0" "0" "0" "0" "0"
## [190,] "1" "1" "1" "0" "1"
## [191,] "0" "0" "0" "0" "0"
## [192,] "0" "0" "0" "0" "0"
## [193,] "0" "0" "0" "0" "0"
## [194,] "0" "0" "0" "0" "0"
## [195,] "0" "0" "0" "0" "0"
## [196,] "0" "0" "0" "0" "0"
## [197,] "1" "1" "1" "1" "1"
## [198,] "0" "0" "0" "0" "0"
## [199,] "1" "1" "1" "0" "1"
## [200,] "1" "1" "1" "1" "1"
## [201,] "0" "0" "0" "0" "0"
## [202,] "0" "0" "0" "0" "0"
## [203,] "0" "0" "0" "0" "0"
## [204,] "0" "0" "0" "0" "0"
## [205,] "0" "0" "0" "0" "0"
## [206,] "0" "0" "0" "0" "0"
## [207,] "0" "0" "0" "0" "0"
## [208,] "0" "0" "0" "0" "0"
## [209,] "0" "0" "0" "0" "0"
# let's aggregate by using mode value from the predictions
mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
ps = c()
for (i in 1:nrow(predictSurvivedStacking))
{
ps[i] = mode(predictSurvivedStacking[i,])
}
accuracy_Stacking = Accuracy(y_pred = ps, y_true = test$survived)
# accuracy from our different models
df_accuracy = data.frame("model" = c("Decision Tree", "Bagging-RF", "Stacking-Mode"), "accuracy" = c(accuracy_DT, accuracy_RF, accuracy_Stacking))
df_accuracy %>% arrange(accuracy)
## model accuracy
## 1 Stacking-Mode 0.7942584
## 2 Decision Tree 0.8086124
## 3 Bagging-RF 0.8277512
# Clustering using k means
# can also another example of k means in lesson 8 pdf of course notes on Canvas
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
head(wine)
## Type Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39
## Nonflavanoids Proanthocyanins Color Hue Dilution Proline
## 1 0.28 2.29 5.64 1.04 3.92 1065
## 2 0.26 1.28 4.38 1.05 3.40 1050
## 3 0.30 2.81 5.68 1.03 3.17 1185
## 4 0.24 2.18 7.80 0.86 3.45 1480
## 5 0.39 1.82 4.32 1.04 2.93 735
## 6 0.34 1.97 6.75 1.05 2.85 1450
wine = scale(wine[-1]) # scale before k means can run due to variance of values between variables
kmeansModel = kmeans(wine,3)
library(cluster)
clusplot(wine, kmeansModel$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)
