# setwd("G:/My Drive/CAPSTONE/working")
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyverse)
library(devtools)
library(readr)
library(caTools)
library(randomForest)
train_df <- read.csv("train_df.csv", header = T, stringsAsFactors = F)
train_iq <- read.csv("train_iq.csv", header = T, stringsAsFactors = F)
train_sj <- read.csv("train_sj.csv", header = T, stringsAsFactors = F)
test_df <- read.csv("test_df.csv", header = T, stringsAsFactors = F)
test_iq <- read.csv("test_iq.csv", header = T, stringsAsFactors = F)
test_sj <- read.csv("test_sj.csv", header = T, stringsAsFactors = F)
submissions <- read.csv("submission_format.csv", header = T, stringsAsFactors = F)
Load the split test and training sets.
load("dengue_test_train_split.RData")
Random forest is a bagging algorithm that builds many decision trees, each on a bootstrap sample of the data, considering only a random subset of the variables at each split.
The trees are grown independently and their predictions are averaged, which reduces variance and yields more stable results.
We will apply a random forest to the entire train_df dataset. This baseline model grows the default 500 trees and tries 6 randomly selected predictors at each split.
rf_df_regressor = randomForest(total_cases ~ ., data = subset(df_training, select = -c(1:4, 26:28)))
print(rf_df_regressor)
##
## Call:
## randomForest(formula = total_cases ~ ., data = subset(df_training, select = -c(1:4, 26:28)))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 1304.461
## % Var explained: 41.91
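As a quick illustration of the averaging described above, predict.all = TRUE returns every individual tree's prediction, and the forest's aggregate prediction is their mean; a minimal sketch using the first few training rows:
p <- predict(rf_df_regressor, newdata = head(df_training), predict.all = TRUE)
all.equal(as.numeric(p$aggregate), as.numeric(rowMeans(p$individual)))  # TRUE: forest = mean of trees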
Plotting the model shows how the out-of-bag error changes as trees are added; from this curve we identify the number of trees with the least mean squared error.
plot(rf_df_regressor)
n1 = which.min(rf_df_regressor$mse)
n1
## [1] 395
We use this optimal tree count to prune the model. Pruning is a technique in machine learning and search algorithms that reduces the size of decision trees by removing sections of the tree that provide little power to classify instances; it reduces the complexity of the final model and hence improves predictive accuracy by reducing overfitting. Here we achieve a similar effect by refitting the forest with fewer trees (ntree = n1).
If this does not bring a large improvement in model performance, we can try feature selection based on node purity. The variable importance plot (varImpPlot) shows which variables produce the largest increases in node purity.
rf_df_regressor2 = randomForest(total_cases ~ ., data = subset(df_training, select = -c(1:4, 26:28)), ntree = n1)
print(rf_df_regressor2)
##
## Call:
## randomForest(formula = total_cases ~ ., data = subset(df_training, select = -c(1:4, 26:28)), ntree = n1)
## Type of random forest: regression
## Number of trees: 395
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 1260.36
## % Var explained: 43.87
varImpPlot(rf_df_regressor2, main = "Variable Importance Plot - Train DF")
importance(rf_df_regressor2)
## IncNodePurity
## ndvi_ne 148935.76
## ndvi_nw 523355.63
## ndvi_se 133497.04
## ndvi_sw 116146.20
## precipitation_amt_mm 49181.35
## reanalysis_air_temp_c 94587.17
## reanalysis_avg_temp_c 65734.81
## reanalysis_dew_point_temp_c 126458.61
## reanalysis_max_air_temp_c 79548.85
## reanalysis_min_air_temp_c 149352.02
## reanalysis_precip_amt_kg_per_m2 113871.76
## reanalysis_relative_humidity_percent 86570.09
## reanalysis_sat_precip_amt_mm 45893.65
## reanalysis_specific_humidity_g_per_kg 134897.56
## reanalysis_tdtr_c 131353.64
## station_avg_temp_c 106852.30
## station_diur_temp_rng_c 98720.84
## station_max_temp_c 99586.24
## station_min_temp_c 78818.70
## station_precip_mm 67242.94
rf_df_imp <- importance(rf_df_regressor2)
Left at the default of 500 trees, the forest is fit closely to the training data and may overfit; more to the point, the out-of-bag MSE fluctuates as trees are added, so 500 is not necessarily the best stopping point. We therefore refit above with the tree count (n1) that produced the least OOB MSE.
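The fitted object stores exactly this information: rf_df_regressor$mse[k] is the out-of-bag MSE of the forest truncated to its first k trees, which is what which.min() searched earlier. A small sketch comparing the default endpoint with the best point on the curve:
c(mse_at_500 = tail(rf_df_regressor$mse, 1),  # OOB MSE with all 500 trees
  mse_at_n1 = min(rf_df_regressor$mse))       # lowest OOB MSE on the curve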
We then take the 6 variables with the highest node purity and pass them to a new regressor as the model formula.
rf_df_imp <- cbind(variables = rownames(rf_df_imp), as.data.frame(rf_df_imp))  # turn row names into a variables column
str(rf_df_imp)
## 'data.frame': 20 obs. of 2 variables:
## $ variables : Factor w/ 20 levels "ndvi_ne","ndvi_nw",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ IncNodePurity: num 148936 523356 133497 116146 49181 ...
a <- rf_df_imp[order(-rf_df_imp$IncNodePurity), ]  # sort by decreasing node purity
count <- 0
input <- a[1, 1]
for (i in seq(2, 6)) {  # append the next five variable names, separated by "+"
  input <- paste(input, a[i, 1], sep = "+")
  count <- count + 1
}
count
## [1] 5
input
## [1] "ndvi_nw+reanalysis_min_air_temp_c+ndvi_ne+reanalysis_specific_humidity_g_per_kg+ndvi_se+reanalysis_tdtr_c"
rf_df_imp_top6 <- as.formula(paste("total_cases~",input,sep=" "))
rf_df_imp_top6
## total_cases ~ ndvi_nw + reanalysis_min_air_temp_c + ndvi_ne +
## reanalysis_specific_humidity_g_per_kg + ndvi_se + reanalysis_tdtr_c
As the mean squared error does not improve much, we instead select important variables based on node purity (a measure of how cleanly splits on a variable separate the response values). We take the top 6 variables from the previous model. Note that the importance ranking can change from run to run because the forest is random, so we build the formula programmatically rather than typing the predictors in by hand:
rf_df_imp_top6
## total_cases ~ ndvi_nw + reanalysis_min_air_temp_c + ndvi_ne +
## reanalysis_specific_humidity_g_per_kg + ndvi_se + reanalysis_tdtr_c
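As an aside, base R's reformulate() builds the same formula more compactly; a sketch assuming the rf_df_imp data frame constructed above:
top6 <- head(rf_df_imp[order(-rf_df_imp$IncNodePurity), "variables"], 6)  # six highest-purity variables
rf_df_imp_top6_alt <- reformulate(as.character(top6), response = "total_cases")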
rf_df_regressor3 = randomForest(rf_df_imp_top6, data = subset(df_training, select = -c(1:4, 26:28)))
print(rf_df_regressor3)
##
## Call:
## randomForest(formula = rf_df_imp_top6, data = subset(df_training, select = -c(1:4, 26:28)))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 1092.494
## % Var explained: 51.35
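To summarize the train_df experiments, the final out-of-bag errors of the three forests can be put side by side; a small sketch (tail(rf$mse, 1) is the OOB MSE using all of a forest's trees):
sapply(list(baseline = rf_df_regressor, fewer_trees = rf_df_regressor2, top6_vars = rf_df_regressor3),
       function(rf) tail(rf$mse, 1))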
We now repeat the same procedure for the San Juan subset (sj_training).
rf_sj_regressor <- randomForest(total_cases ~ ., data = subset(sj_training, select = -c(1:4, 26:28)))
print(rf_sj_regressor)
##
## Call:
## randomForest(formula = total_cases ~ ., data = subset(sj_training, select = -c(1:4, 26:28)))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 1867.153
## % Var explained: 40.14
plot(rf_sj_regressor)
n2 = which.min(rf_sj_regressor$mse)
rf_sj_regressor2 = randomForest(total_cases ~ ., data = subset(sj_training, select = -c(1:4, 26:28)),ntree=n2)
print(rf_sj_regressor2)
##
## Call:
## randomForest(formula = total_cases ~ ., data = subset(sj_training, select = -c(1:4, 26:28)), ntree = n2)
## Type of random forest: regression
## Number of trees: 236
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 1938.588
## % Var explained: 37.85
Note that refitting San Juan with n2 trees gives a slightly higher OOB MSE than the 500-tree forest; the OOB curve is noisy, so the minimum found on one run need not generalize.
varImpPlot(rf_sj_regressor2, main = "Variable Importance Plot - San Juan")
importance(rf_sj_regressor2)
## IncNodePurity
## ndvi_ne 114662.64
## ndvi_nw 505805.51
## ndvi_se 155410.37
## ndvi_sw 110971.98
## precipitation_amt_mm 55139.23
## reanalysis_air_temp_c 65521.99
## reanalysis_avg_temp_c 54128.07
## reanalysis_dew_point_temp_c 147024.46
## reanalysis_max_air_temp_c 64920.16
## reanalysis_min_air_temp_c 54862.52
## reanalysis_precip_amt_kg_per_m2 104108.23
## reanalysis_relative_humidity_percent 107336.46
## reanalysis_sat_precip_amt_mm 58430.50
## reanalysis_specific_humidity_g_per_kg 162656.23
## reanalysis_tdtr_c 83144.81
## station_avg_temp_c 97217.64
## station_diur_temp_rng_c 65699.49
## station_max_temp_c 86958.45
## station_min_temp_c 40333.33
## station_precip_mm 72419.53
rf_sj_imp <- importance(rf_sj_regressor2)
We then take the 6 variables with the highest node purity and pass them to a new regressor as the model formula.
rf_sj_imp <- cbind(variables = rownames(rf_sj_imp), as.data.frame(rf_sj_imp))  # turn row names into a variables column
str(rf_sj_imp)
## 'data.frame': 20 obs. of 2 variables:
## $ variables : Factor w/ 20 levels "ndvi_ne","ndvi_nw",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ IncNodePurity: num 114663 505806 155410 110972 55139 ...
a <- rf_sj_imp[order(-rf_sj_imp$IncNodePurity), ]  # sort by decreasing node purity
count <- 0
input <- a[1, 1]
for (i in seq(2, 6)) {  # append the next five variable names, separated by "+"
  input <- paste(input, a[i, 1], sep = "+")
  count <- count + 1
}
count
## [1] 5
input
## [1] "ndvi_nw+reanalysis_specific_humidity_g_per_kg+ndvi_se+reanalysis_dew_point_temp_c+ndvi_ne+ndvi_sw"
rf_sj_imp_top6 <- as.formula(paste("total_cases~",input,sep=" "))
rf_sj_imp_top6
## total_cases ~ ndvi_nw + reanalysis_specific_humidity_g_per_kg +
## ndvi_se + reanalysis_dew_point_temp_c + ndvi_ne + ndvi_sw
rf_sj_regressor3 = randomForest(rf_sj_imp_top6, data = subset(sj_training, select = -c(1:4, 26:28)))
print(rf_sj_regressor3)
##
## Call:
## randomForest(formula = rf_sj_imp_top6, data = subset(sj_training, select = -c(1:4, 26:28)))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 1464.458
## % Var explained: 53.05
Finally, we repeat the procedure for the Iquitos subset (iq_training).
rf_iq_regressor <- randomForest(total_cases ~ ., data = subset(iq_training, select = -c(1:4, 26:28)))
print(rf_iq_regressor)
##
## Call:
## randomForest(formula = total_cases ~ ., data = subset(iq_training, select = -c(1:4, 26:28)))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 135.0008
## % Var explained: -3.23
Note that the % variance explained is negative here, meaning this forest predicts Iquitos case counts worse than simply using the mean of total_cases.
plot(rf_iq_regressor)
n3 = which.min(rf_iq_regressor$mse)
rf_iq_regressor2 <- randomForest(total_cases ~ ., data = subset(iq_training, select = -c(1:4, 26:28)), ntree = n3)
print(rf_iq_regressor2)
##
## Call:
## randomForest(formula = total_cases ~ ., data = subset(iq_training, select = -c(1:4, 26:28)), ntree = n3)
## Type of random forest: regression
## Number of trees: 203
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 137.1541
## % Var explained: -4.87
varImpPlot(rf_iq_regressor2, main = "Variable Importance Plot - Iquitos")
importance(rf_iq_regressor2)
## IncNodePurity
## ndvi_ne 1592.423
## ndvi_nw 2060.754
## ndvi_se 1870.771
## ndvi_sw 2189.477
## precipitation_amt_mm 1300.624
## reanalysis_air_temp_c 2333.803
## reanalysis_avg_temp_c 1899.155
## reanalysis_dew_point_temp_c 3028.794
## reanalysis_max_air_temp_c 3561.491
## reanalysis_min_air_temp_c 2523.404
## reanalysis_precip_amt_kg_per_m2 3027.565
## reanalysis_relative_humidity_percent 2501.286
## reanalysis_sat_precip_amt_mm 1604.454
## reanalysis_specific_humidity_g_per_kg 3280.563
## reanalysis_tdtr_c 2865.215
## station_avg_temp_c 3884.783
## station_diur_temp_rng_c 2781.366
## station_max_temp_c 3406.106
## station_min_temp_c 2142.312
## station_precip_mm 2464.653
rf_iq_imp <- importance(rf_iq_regressor2)
We then take the 6 variables with the highest node purity and pass them to a new regressor as the model formula.
rf_iq_imp <- cbind(variables = rownames(rf_iq_imp), as.data.frame(rf_iq_imp))  # turn row names into a variables column
str(rf_iq_imp)
## 'data.frame': 20 obs. of 2 variables:
## $ variables : Factor w/ 20 levels "ndvi_ne","ndvi_nw",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ IncNodePurity: num 1592 2061 1871 2189 1301 ...
a <- rf_iq_imp[order(-rf_iq_imp$IncNodePurity), ]  # sort by decreasing node purity
count <- 0
input <- a[1, 1]
for (i in seq(2, 6)) {  # append the next five variable names, separated by "+"
  input <- paste(input, a[i, 1], sep = "+")
  count <- count + 1
}
count
## [1] 5
input
## [1] "station_avg_temp_c+reanalysis_max_air_temp_c+station_max_temp_c+reanalysis_specific_humidity_g_per_kg+reanalysis_dew_point_temp_c+reanalysis_precip_amt_kg_per_m2"
rf_iq_imp_top6 <- as.formula(paste("total_cases~",input,sep=" "))
rf_iq_imp_top6
## total_cases ~ station_avg_temp_c + reanalysis_max_air_temp_c +
## station_max_temp_c + reanalysis_specific_humidity_g_per_kg +
## reanalysis_dew_point_temp_c + reanalysis_precip_amt_kg_per_m2
rf_iq_regressor3 = randomForest(rf_iq_imp_top6, data = subset(iq_training, select = -c(1:4, 26:28)))
print(rf_iq_regressor3)
##
## Call:
## randomForest(formula = rf_iq_imp_top6, data = subset(iq_training, select = -c(1:4, 26:28)))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 127.5276
## % Var explained: 2.49
Save all nine models to a file called dengue_random_forest.RData.
save(rf_df_regressor, rf_df_regressor2, rf_df_regressor3, rf_sj_regressor, rf_sj_regressor2, rf_sj_regressor3, rf_iq_regressor, rf_iq_regressor2, rf_iq_regressor3, file = "dengue_random_forest.RData")
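A later session can then reload these objects and generate predictions; a sketch, assuming the test sets carry the same predictor columns used in training:
load("dengue_random_forest.RData")
pred_sj <- predict(rf_sj_regressor3, newdata = test_sj)  # total_cases is a count, so these
pred_iq <- predict(rf_iq_regressor3, newdata = test_iq)  # predictions are typically rounded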