Lesson 8.2 - Random Forests

Robbie Beane

Load Packages

library(caret)
library(randomForest)

Example 1: Student Success Data

ss <- read.table('data/student_success_data.csv', header=TRUE, sep=',')
ss <- na.omit(ss)
ss <- ss[(ss$G3 <= 20) & (ss$G3 >= 0), ]
ss$Passed <- factor(ifelse(ss$G3 < 10, 0, 1))
ss$G1 <- NULL
ss$G2 <- NULL
ss$G3 <- NULL
ss$absences <- NULL
summary(ss)
##  school    sex          age        address famsize   Pstatus
##  GP :462   F:297   Min.   :15.00   R:155   GT3:393   A: 45  
##  MHS:101   M:266   1st Qu.:16.00   U:408   LE3:170   T:518  
##                    Median :16.00                            
##                    Mean   :16.61                            
##                    3rd Qu.:18.00                            
##                    Max.   :22.00                            
##       Medu            Fedu             Mjob           Fjob    
##  Min.   :1.000   Min.   :1.000   at_home : 66   at_home : 31  
##  1st Qu.:2.000   1st Qu.:2.000   health  : 39   health  : 25  
##  Median :3.000   Median :3.000   other   :184   other   :275  
##  Mean   :2.874   Mean   :2.686   services:182   services:173  
##  3rd Qu.:4.000   3rd Qu.:4.000   teacher : 92   teacher : 59  
##  Max.   :4.000   Max.   :4.000                                
##         reason      guardian     traveltime      studytime    
##  course    :215   father:129   Min.   :1.000   Min.   :1.000  
##  home      :158   mother:398   1st Qu.:1.000   1st Qu.:1.000  
##  other     : 48   other : 36   Median :1.000   Median :2.000  
##  reputation:142                Mean   :1.481   Mean   :1.986  
##                                3rd Qu.:2.000   3rd Qu.:2.000  
##                                Max.   :4.000   Max.   :4.000  
##     failures      schoolsup famsup     paid     activities nursery  
##  Min.   :0.0000   no :507   no :254   no :332   no :262    no :102  
##  1st Qu.:0.0000   yes: 56   yes:309   yes:231   yes:301    yes:461  
##  Median :0.0000                                                     
##  Mean   :0.2842                                                     
##  3rd Qu.:0.0000                                                     
##  Max.   :3.0000                                                     
##  higher    internet  romantic      famrel         freetime    
##  no : 22   no : 93   no :341   Min.   :1.000   Min.   :1.000  
##  yes:541   yes:470   yes:222   1st Qu.:4.000   1st Qu.:3.000  
##                                Median :4.000   Median :3.000  
##                                Mean   :3.938   Mean   :3.213  
##                                3rd Qu.:5.000   3rd Qu.:4.000  
##                                Max.   :5.000   Max.   :5.000  
##      goout            Dalc            Walc           health     Passed 
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00   0:200  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:3.00   1:363  
##  Median :3.000   Median :1.000   Median :2.000   Median :4.00          
##  Mean   :3.021   Mean   :1.401   Mean   :2.176   Mean   :3.67          
##  3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:5.00          
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.00

Basic Example of a Random Forest

set.seed(1)
m1 <- randomForest(Passed ~ ., ss, importance=TRUE)
m1
## 
## Call:
##  randomForest(formula = Passed ~ ., data = ss, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 16.7%
## Confusion matrix:
##     0   1 class.error
## 0 142  58  0.29000000
## 1  36 327  0.09917355

Tuning on Number of Trees

plot(m1)
legend("topright", cex =1, legend=colnames(m1$err.rate), lty=c(1,2,3), col=c(1,2,3))

Tuning on Number of Trees

m1_opt_ntrees <- which.min(m1$err.rate[,'OOB'])
m1_opt_err_rate <- min(m1$err.rate[,'OOB'])

cat("Optimal Number of Trees: ", m1_opt_ntrees, "\n",
    "Minimum Error Rate:      ", m1_opt_err_rate, sep="")
## Optimal Number of Trees: 124
## Minimum Error Rate:      0.1527531

Tuning on Number of Trees

set.seed(1)
m2 <- randomForest(Passed ~ ., ss, ntree=m1_opt_ntrees, importance=TRUE)
m2
## 
## Call:
##  randomForest(formula = Passed ~ ., data = ss, ntree = m1_opt_ntrees,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 124
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 15.28%
## Confusion matrix:
##     0   1 class.error
## 0 146  54  0.27000000
## 1  32 331  0.08815427

Training Accuracy

train_predict <- predict(m2, ss)
mean(train_predict == ss$Passed)
## [1] 1

Tuning on mtry and ntree

oob_acc_list <- c()
opt_ntree_list <- c()

for(i in 1:29){
  set.seed(1)
  temp_mod <- randomForest(Passed ~ ., ss, ntree=500, importance=TRUE, mtry=i)
  oob_acc_list <- c(oob_acc_list, min(temp_mod$err.rate[,'OOB']))
  opt_ntree_list <- c(opt_ntree_list, which.min(temp_mod$err.rate[,'OOB']))
}

opt_mtry <- which.min(oob_acc_list)
opt_ntree <- opt_ntree_list[opt_mtry]
min_oob_acc <- min(oob_acc_list)

cat("Optimal Value of mtry:  ", opt_mtry, "\n",
    "Optimal Value of ntree: ", opt_ntree, "\n",
    "Minimum OOB Accuracy:   ", min_oob_acc, sep="")
## Optimal Value of mtry:  3
## Optimal Value of ntree: 445
## Minimum OOB Accuracy:   0.1420959

Tuning on mtry and ntree

plot(1:29, oob_acc_list, xlab="Value of mtry", ylab="Minimum OOB Accuracy Score")
lines(1:29, oob_acc_list)
abline(v=which.min(oob_acc_list), col="red", lty=2, lwd=1)

Tuning on mtry and ntree

set.seed(1)
m3 <- randomForest(Passed ~ ., ss, ntree=445, mtry=3, importance=TRUE)
m3
## 
## Call:
##  randomForest(formula = Passed ~ ., data = ss, ntree = 445, mtry = 3,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 445
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 14.21%
## Confusion matrix:
##     0   1 class.error
## 0 145  55  0.27500000
## 1  25 338  0.06887052

Estimating Out-Of-Sample Performance

set.seed(1)
train(Passed ~ ., ss, method="rf", ntree=445,
      trControl = trainControl(method="cv", number=20), 
      tuneGrid = expand.grid(mtry=c(3)))
## Random Forest 
## 
## 563 samples
##  29 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (20 fold) 
## Summary of sample sizes: 535, 535, 535, 535, 535, 534, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8349754  0.6226192
## 
## Tuning parameter 'mtry' was held constant at a value of 3

Estimating Out-Of-Sample Performance

set.seed(1)
randomForest(Passed ~ ., ss, ntree=445, mtry=3, importance=TRUE, 
  replace=FALSE, sampsize=floor(0.8*nrow(ss)))
## 
## Call:
##  randomForest(formula = Passed ~ ., data = ss, ntree = 445, mtry = 3,      importance = TRUE, replace = FALSE, sampsize = floor(0.8 *          nrow(ss))) 
##                Type of random forest: classification
##                      Number of trees: 445
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 14.92%
## Confusion matrix:
##     0   1 class.error
## 0 147  53  0.26500000
## 1  31 332  0.08539945

Estimating Out-Of-Sample Performance

error_rate_list <- c()

for (i in 1:20){

  set.seed(i)
  temp_mod <- randomForest(Passed ~ ., ss, ntree=445, mtry=3, importance=TRUE, 
                           replace=FALSE, sampsize=floor(0.8*nrow(ss)))
  
  error_rate_list <- c(error_rate_list, temp_mod$err.rate[445,"OOB"])
}

mean(error_rate_list)
## [1] 0.1590586
boxplot(error_rate_list)

Feature Importance

m3$importance
##                        0            1 MeanDecreaseAccuracy
## school      0.0111720672 0.0025577942          0.005694658
## sex         0.0233632155 0.0012521691          0.009095097
## age         0.0264599152 0.0032338034          0.011469429
## address     0.0149627498 0.0023202583          0.006837335
## famsize     0.0199481289 0.0008092124          0.007596645
## Pstatus     0.0046869379 0.0001193006          0.001765697
## Medu        0.0541793952 0.0174901597          0.030577784
## Fedu        0.0491084012 0.0177485314          0.028881618
## Mjob        0.0599594814 0.0131151192          0.029743839
## Fjob        0.0293255539 0.0039668412          0.013065376
## reason      0.0354658291 0.0025441594          0.014226229
## guardian    0.0156615270 0.0019149910          0.006817719
## traveltime  0.0184574985 0.0006176917          0.006975296
## studytime   0.0255475734 0.0029151586          0.010989474
## failures    0.0492611771 0.0219989065          0.031765734
## schoolsup  -0.0004538367 0.0037351486          0.002217851
## famsup      0.0399601697 0.0120905935          0.021994149
## paid        0.0226342909 0.0050237302          0.011328281
## activities  0.0156916157 0.0038482942          0.008056800
## nursery     0.0085281869 0.0028213463          0.004885437
## higher      0.0031016391 0.0013766224          0.001956130
## internet    0.0218552420 0.0046808013          0.010802754
## romantic    0.0175522927 0.0052956381          0.009702732
## famrel      0.0382173216 0.0047969335          0.016753294
## freetime    0.0336506282 0.0042443630          0.014790784
## goout       0.0454265167 0.0091057532          0.022041033
## Dalc        0.0188676402 0.0076632578          0.011721844
## Walc        0.0266031162 0.0072780700          0.014199180
## health      0.0290641642 0.0062458468          0.014451498
##            MeanDecreaseGini
## school             3.155759
## sex                5.437597
## age               11.435674
## address            4.639802
## famsize            4.387064
## Pstatus            2.380665
## Medu              14.968563
## Fedu              15.012602
## Mjob              17.688261
## Fjob              10.509067
## reason            11.883354
## guardian           6.183318
## traveltime         6.389089
## studytime          9.277214
## failures          16.271369
## schoolsup          3.580239
## famsup             7.981877
## paid               5.063839
## activities         5.261459
## nursery            4.128403
## higher             2.496847
## internet           5.095232
## romantic           4.948335
## famrel            10.655698
## freetime          11.351832
## goout             14.025130
## Dalc               7.596320
## Walc              11.637881
## health            11.363448

Feature Importance

varImpPlot(m3)

Example 2: Ames Housing

ames <- read.table('data/AmesHousing.txt', header=TRUE, sep='\t')
sapply(ames, function(x) sum(is.na(x)))
##           Order             PID     MS.SubClass       MS.Zoning 
##               0               0               0               0 
##    Lot.Frontage        Lot.Area          Street           Alley 
##             490               0               0            2732 
##       Lot.Shape    Land.Contour       Utilities      Lot.Config 
##               0               0               0               0 
##      Land.Slope    Neighborhood     Condition.1     Condition.2 
##               0               0               0               0 
##       Bldg.Type     House.Style    Overall.Qual    Overall.Cond 
##               0               0               0               0 
##      Year.Built  Year.Remod.Add      Roof.Style       Roof.Matl 
##               0               0               0               0 
##    Exterior.1st    Exterior.2nd    Mas.Vnr.Type    Mas.Vnr.Area 
##               0               0               0              23 
##      Exter.Qual      Exter.Cond      Foundation       Bsmt.Qual 
##               0               0               0              79 
##       Bsmt.Cond   Bsmt.Exposure  BsmtFin.Type.1    BsmtFin.SF.1 
##              79              79              79               1 
##  BsmtFin.Type.2    BsmtFin.SF.2     Bsmt.Unf.SF   Total.Bsmt.SF 
##              79               1               1               1 
##         Heating      Heating.QC     Central.Air      Electrical 
##               0               0               0               0 
##     X1st.Flr.SF     X2nd.Flr.SF Low.Qual.Fin.SF     Gr.Liv.Area 
##               0               0               0               0 
##  Bsmt.Full.Bath  Bsmt.Half.Bath       Full.Bath       Half.Bath 
##               2               2               0               0 
##   Bedroom.AbvGr   Kitchen.AbvGr    Kitchen.Qual   TotRms.AbvGrd 
##               0               0               0               0 
##      Functional      Fireplaces    Fireplace.Qu     Garage.Type 
##               0               0            1422             157 
##   Garage.Yr.Blt   Garage.Finish     Garage.Cars     Garage.Area 
##             159             157               1               1 
##     Garage.Qual     Garage.Cond     Paved.Drive    Wood.Deck.SF 
##             158             158               0               0 
##   Open.Porch.SF  Enclosed.Porch     X3Ssn.Porch    Screen.Porch 
##               0               0               0               0 
##       Pool.Area         Pool.QC           Fence    Misc.Feature 
##               0            2917            2358            2824 
##        Misc.Val         Mo.Sold         Yr.Sold       Sale.Type 
##               0               0               0               0 
##  Sale.Condition       SalePrice 
##               0               0

Example 2: Ames Housing

ames$Order <- NULL
ames$PID <- NULL
ames$Lot.Frontage <- NULL
ames$Alley <- NULL
ames$Misc.Feature <- NULL
ames$Pool.QC <- NULL
ames$Fireplace.Qu <- NULL
ames$Fence <- NULL
ames <- na.omit(ames)
nrow(ames)
## [1] 2683

Elasticnet Model

set.seed(1)
en_mod <- train(SalePrice ~ ., ames, method="glmnet", metric="Rsquared", 
                trControl = trainControl(method="cv", number=10),
                tuneGrid = expand.grid(alpha=seq(0, 1, by=0.2), 
                                        lambda=exp(seq(2,14,length=100))
                                       )
                )
                 

best_ix <- which.max(en_mod$results$Rsquared)
en_mod$results[best_ix, ]
##     alpha   lambda     RMSE  Rsquared      MAE   RMSESD RsquaredSD
## 149   0.2 2485.382 29257.91 0.8655234 16441.26 7145.613 0.06469346
##        MAESD
## 149 1612.336

Elasticnet Model

plot(en_mod, pch="", xTrans=log)

Random Forest Model

set.seed(1)
m4 <- randomForest(SalePrice ~ ., ames, importance=TRUE)
m4
## 
## Call:
##  randomForest(formula = SalePrice ~ ., data = ames, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 24
## 
##           Mean of squared residuals: 607927012
##                     % Var explained: 90.37

Random Forest Model

plot(m4)

Feature Importance

varImpPlot(m4)