# setwd("G:/My Drive/CAPSTONE/working")
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyverse)
library(devtools)
library(readr)
library(caTools)
library(randomForest)
train_df <- read.csv("train_df.csv", header = T, stringsAsFactors = F)
train_iq <- read.csv("train_iq.csv", header = T, stringsAsFactors = F)
train_sj <- read.csv("train_sj.csv", header = T, stringsAsFactors = F)
test_df <- read.csv("test_df.csv", header = T, stringsAsFactors = F)
test_iq <- read.csv("test_iq.csv", header = T, stringsAsFactors = F)
test_sj <- read.csv("test_sj.csv", header = T, stringsAsFactors = F)
submissions = read.csv("submission_format.csv", header = T, stringsAsFactors = F)

Load the split test and training sets.

load("dengue_test_train_split.RData")

Random Forest

Random forest is a bagging (bootstrap aggregation) algorithm that builds many decision trees, each grown on a bootstrap sample of the data and allowed to try only a random subset of the variables at each split.

The trees are grown independently and their predictions are averaged, which reduces variance and usually yields better results than any single tree.
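
A quick toy comparison illustrates the variance reduction from averaging; this is only a sketch on simulated data (all names here are illustrative), not part of the dengue workflow.

set.seed(1)
toy <- data.frame(x = runif(300))
toy$y <- sin(2 * pi * toy$x) + rnorm(300, sd = 0.3)
toy_train <- toy[1:200, ]
toy_test <- toy[201:300, ]
rf_single <- randomForest(y ~ x, data = toy_train, ntree = 1)    # one deep tree: high variance
rf_forest <- randomForest(y ~ x, data = toy_train, ntree = 500)  # 500 averaged trees
mean((predict(rf_single, toy_test) - toy_test$y)^2)  # typically the larger test MSE
mean((predict(rf_forest, toy_test) - toy_test$y)^2)  # averaging shrinks the error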

We first apply random forest to the entire train_df dataset. This baseline model grows 500 trees (the package default) and tries 6 randomly selected independent variables at each split (the regression default of floor(p/3), with p = 20 predictors here).

# Drop columns 1:4 and 26:28 so the model sees total_cases plus the 20 environmental predictors
rf_df_regressor = randomForest(total_cases ~ ., data = subset(df_training, select = -c(1:4, 26:28)))
print(rf_df_regressor)
## 
## Call:
##  randomForest(formula = total_cases ~ ., data = subset(df_training,      select = -c(1:4, 26:28))) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 1304.461
##                     % Var explained: 41.91
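
The 6 variables tried per split is just the regression default. mtry could also be tuned, for example with the package's tuneRF() helper; a minimal sketch, not part of the original workflow, reusing the same column subset:

rf_X <- subset(df_training, select = -c(1:4, 26:28))
tuneRF(x = rf_X[, setdiff(names(rf_X), "total_cases")],
       y = rf_X$total_cases,
       ntreeTry = 200,    # trees grown per candidate mtry
       stepFactor = 1.5,  # factor by which mtry is inflated/deflated each step
       improve = 0.01)    # minimum relative OOB improvement required to continue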

Plotting the model shows how the out-of-bag error evolves as trees are added, which helps us identify the optimal number of trees: the forest size with the least mean squared error.

plot(rf_df_regressor)  # OOB mean squared error as a function of the number of trees

n1 = which.min(rf_df_regressor$mse)  # mse holds the OOB error after 1, 2, ..., ntree trees
n1
## [1] 395

We use this optimal tree count to prune the model. Pruning is a technique in machine learning and search algorithms that reduces the size of decision trees by removing sections that provide little classification power; it lowers the complexity of the final model and can improve predictive accuracy by reducing overfitting. (For a random forest, the analogue is simply refitting with fewer trees via ntree.)

If we do not see much improvement in model performance, we can try feature selection based on node purity. A variable importance plot (varImpPlot) highlights the variables with the highest node purity.

rf_df_regressor2 = randomForest(total_cases ~ ., data = subset(df_training, select = -c(1:4, 26:28)), ntree = n1)
print(rf_df_regressor2)
## 
## Call:
##  randomForest(formula = total_cases ~ ., data = subset(df_training,      select = -c(1:4, 26:28)), ntree = n1) 
##                Type of random forest: regression
##                      Number of trees: 395
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 1260.36
##                     % Var explained: 43.87
varImpPlot(rf_df_regressor2, main = "Variable Importance Plot - Train DF")

importance(rf_df_regressor2)
##                                       IncNodePurity
## ndvi_ne                                   148935.76
## ndvi_nw                                   523355.63
## ndvi_se                                   133497.04
## ndvi_sw                                   116146.20
## precipitation_amt_mm                       49181.35
## reanalysis_air_temp_c                      94587.17
## reanalysis_avg_temp_c                      65734.81
## reanalysis_dew_point_temp_c               126458.61
## reanalysis_max_air_temp_c                  79548.85
## reanalysis_min_air_temp_c                 149352.02
## reanalysis_precip_amt_kg_per_m2           113871.76
## reanalysis_relative_humidity_percent       86570.09
## reanalysis_sat_precip_amt_mm               45893.65
## reanalysis_specific_humidity_g_per_kg     134897.56
## reanalysis_tdtr_c                         131353.64
## station_avg_temp_c                        106852.30
## station_diur_temp_rng_c                    98720.84
## station_max_temp_c                         99586.24
## station_min_temp_c                         78818.70
## station_precip_mm                          67242.94
rf_df_imp <- importance(rf_df_regressor2)

A random forest fits the training data very closely, which can cause overfitting, and the default 500 trees do not necessarily yield the lowest out-of-bag MSE. That is why we refit the model with the number of trees (n1) at which the MSE was lowest.

Extract the 6 variables with the highest node purity and pass them to a new regressor model as a formula.

rf_df_imp<-cbind(variables=rownames(rf_df_imp),as.data.frame(rf_df_imp))
str(rf_df_imp)
## 'data.frame':    20 obs. of  2 variables:
##  $ variables    : Factor w/ 20 levels "ndvi_ne","ndvi_nw",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ IncNodePurity: num  148936 523356 133497 116146 49181 ...
a = rf_df_imp[order(-rf_df_imp$IncNodePurity), ]  # sort variables by decreasing node purity
count <- 0
input <- a[1, 1]                                  # start with the top-ranked variable
for (i in seq(2, 6)) {
  input <- paste(input, a[i, 1], sep = "+")       # append the next-ranked variable
  count = count + 1
}
count
## [1] 5
input
## [1] "ndvi_nw+reanalysis_min_air_temp_c+ndvi_ne+reanalysis_specific_humidity_g_per_kg+ndvi_se+reanalysis_tdtr_c"
rf_df_imp_top6 <- as.formula(paste("total_cases~",input,sep=" "))
rf_df_imp_top6
## total_cases ~ ndvi_nw + reanalysis_min_air_temp_c + ndvi_ne + 
##     reanalysis_specific_humidity_g_per_kg + ndvi_se + reanalysis_tdtr_c
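
For reference, the same formula can be built without the loop using base R's reformulate(); a short sketch reusing the sorted table a from above:

top6 <- as.character(a$variables[1:6])       # a is already sorted by decreasing node purity
reformulate(top6, response = "total_cases")  # identical to rf_df_imp_top6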

As the mean squared error does not improve much, we turn to the important variables ranked by node purity (IncNodePurity: the total decrease in residual sum of squares from splits on that variable, so higher means more useful). We use the top 6 variables from the previous model. Note that this ranking can change from run to run because the forest is randomized, so we build the formula programmatically rather than typing the variable names by hand:

rf_df_imp_top6
## total_cases ~ ndvi_nw + reanalysis_min_air_temp_c + ndvi_ne + 
##     reanalysis_specific_humidity_g_per_kg + ndvi_se + reanalysis_tdtr_c
rf_df_regressor3 = randomForest(rf_df_imp_top6, data = subset(df_training, select = -c(1:4, 26:28)))
print(rf_df_regressor3)
## 
## Call:
##  randomForest(formula = rf_df_imp_top6, data = subset(df_training,      select = -c(1:4, 26:28))) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 1092.494
##                     % Var explained: 51.35
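
Because the forest is randomized, the selected top-6 set and the MSE above can shift between runs. Setting a seed immediately before each fit makes the results reproducible; a minimal sketch (the seed value 123 is arbitrary):

set.seed(123)  # fix the RNG so bootstrap samples and split candidates repeat exactly
rf_df_regressor3 = randomForest(rf_df_imp_top6, data = subset(df_training, select = -c(1:4, 26:28)))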

In the same way, we apply our random forest technique to the training datasets split by city.

Fit the model on San Juan

rf_sj_regressor<- randomForest(total_cases ~ ., data = subset(sj_training, select = -c(1:4, 26:28)))
print(rf_sj_regressor)
## 
## Call:
##  randomForest(formula = total_cases ~ ., data = subset(sj_training,      select = -c(1:4, 26:28))) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 1867.153
##                     % Var explained: 40.14
plot(rf_sj_regressor)

n2 = which.min(rf_sj_regressor$mse)
rf_sj_regressor2 = randomForest(total_cases ~ ., data = subset(sj_training, select = -c(1:4, 26:28)),ntree=n2)
print(rf_sj_regressor2)
## 
## Call:
##  randomForest(formula = total_cases ~ ., data = subset(sj_training,      select = -c(1:4, 26:28)), ntree = n2) 
##                Type of random forest: regression
##                      Number of trees: 236
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 1938.588
##                     % Var explained: 37.85
varImpPlot(rf_sj_regressor2, main = "Variable Importance Plot - San Juan")

importance(rf_sj_regressor2)
##                                       IncNodePurity
## ndvi_ne                                   114662.64
## ndvi_nw                                   505805.51
## ndvi_se                                   155410.37
## ndvi_sw                                   110971.98
## precipitation_amt_mm                       55139.23
## reanalysis_air_temp_c                      65521.99
## reanalysis_avg_temp_c                      54128.07
## reanalysis_dew_point_temp_c               147024.46
## reanalysis_max_air_temp_c                  64920.16
## reanalysis_min_air_temp_c                  54862.52
## reanalysis_precip_amt_kg_per_m2           104108.23
## reanalysis_relative_humidity_percent      107336.46
## reanalysis_sat_precip_amt_mm               58430.50
## reanalysis_specific_humidity_g_per_kg     162656.23
## reanalysis_tdtr_c                          83144.81
## station_avg_temp_c                         97217.64
## station_diur_temp_rng_c                    65699.49
## station_max_temp_c                         86958.45
## station_min_temp_c                         40333.33
## station_precip_mm                          72419.53
rf_sj_imp <- importance(rf_sj_regressor2)

Extract the 6 variables with the highest node purity and pass them to a new regressor model as a formula.

rf_sj_imp<-cbind(variables=rownames(rf_sj_imp),as.data.frame(rf_sj_imp))
str(rf_sj_imp)
## 'data.frame':    20 obs. of  2 variables:
##  $ variables    : Factor w/ 20 levels "ndvi_ne","ndvi_nw",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ IncNodePurity: num  114663 505806 155410 110972 55139 ...
a=rf_sj_imp[order(-rf_sj_imp$IncNodePurity),]
count<-0
input<- a[1,1]
for (i in seq(2,6)){
  input<-paste(input,a[i,1],sep="+")
  count=count+1
}
count
## [1] 5
input
## [1] "ndvi_nw+reanalysis_specific_humidity_g_per_kg+ndvi_se+reanalysis_dew_point_temp_c+ndvi_ne+ndvi_sw"
rf_sj_imp_top6 <- as.formula(paste("total_cases~",input,sep=" "))
rf_sj_imp_top6
## total_cases ~ ndvi_nw + reanalysis_specific_humidity_g_per_kg + 
##     ndvi_se + reanalysis_dew_point_temp_c + ndvi_ne + ndvi_sw
rf_sj_regressor3 = randomForest(rf_sj_imp_top6, data = subset(sj_training, select = -c(1:4, 26:28)))
print(rf_sj_regressor3)
## 
## Call:
##  randomForest(formula = rf_sj_imp_top6, data = subset(sj_training,      select = -c(1:4, 26:28))) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 1464.458
##                     % Var explained: 53.05

Fit the model on Iquitos

Note in the output below that the % variance explained is negative: the out-of-bag MSE exceeds the variance of total_cases, so the forest does worse than always predicting the mean case count for Iquitos.

rf_iq_regressor<-  randomForest(total_cases ~ ., data = subset(iq_training, select = -c(1:4, 26:28)))
print(rf_iq_regressor)
## 
## Call:
##  randomForest(formula = total_cases ~ ., data = subset(iq_training,      select = -c(1:4, 26:28))) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 135.0008
##                     % Var explained: -3.23
plot(rf_iq_regressor)

n3 = which.min(rf_iq_regressor$mse)
rf_iq_regressor2 <- randomForest(total_cases ~ ., data = subset(iq_training, select = -c(1:4, 26:28)), ntree = n3)  # use the computed optimum n3 (203 in this run) rather than a hard-coded value
print(rf_iq_regressor2)
## 
## Call:
##  randomForest(formula = total_cases ~ ., data = subset(iq_training,      select = -c(1:4, 26:28)), ntree = n3) 
##                Type of random forest: regression
##                      Number of trees: 203
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 137.1541
##                     % Var explained: -4.87
varImpPlot(rf_iq_regressor2, main = "Variable Importance Plot - Iquitos")

importance(rf_iq_regressor2)
##                                       IncNodePurity
## ndvi_ne                                    1592.423
## ndvi_nw                                    2060.754
## ndvi_se                                    1870.771
## ndvi_sw                                    2189.477
## precipitation_amt_mm                       1300.624
## reanalysis_air_temp_c                      2333.803
## reanalysis_avg_temp_c                      1899.155
## reanalysis_dew_point_temp_c                3028.794
## reanalysis_max_air_temp_c                  3561.491
## reanalysis_min_air_temp_c                  2523.404
## reanalysis_precip_amt_kg_per_m2            3027.565
## reanalysis_relative_humidity_percent       2501.286
## reanalysis_sat_precip_amt_mm               1604.454
## reanalysis_specific_humidity_g_per_kg      3280.563
## reanalysis_tdtr_c                          2865.215
## station_avg_temp_c                         3884.783
## station_diur_temp_rng_c                    2781.366
## station_max_temp_c                         3406.106
## station_min_temp_c                         2142.312
## station_precip_mm                          2464.653
rf_iq_imp <- importance(rf_iq_regressor2)

Extract the 6 variables with the highest node purity and pass them to a new regressor model as a formula.

rf_iq_imp<-cbind(variables=rownames(rf_iq_imp),as.data.frame(rf_iq_imp))
str(rf_iq_imp)
## 'data.frame':    20 obs. of  2 variables:
##  $ variables    : Factor w/ 20 levels "ndvi_ne","ndvi_nw",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ IncNodePurity: num  1592 2061 1871 2189 1301 ...
a=rf_iq_imp[order(-rf_iq_imp$IncNodePurity),]
count<-0
input<- a[1,1]
for (i in seq(2,6)){
  input<-paste(input,a[i,1],sep="+")
  count=count+1
}
count
## [1] 5
input
## [1] "station_avg_temp_c+reanalysis_max_air_temp_c+station_max_temp_c+reanalysis_specific_humidity_g_per_kg+reanalysis_dew_point_temp_c+reanalysis_precip_amt_kg_per_m2"
rf_iq_imp_top6 <- as.formula(paste("total_cases~",input,sep=" "))
rf_iq_imp_top6
## total_cases ~ station_avg_temp_c + reanalysis_max_air_temp_c + 
##     station_max_temp_c + reanalysis_specific_humidity_g_per_kg + 
##     reanalysis_dew_point_temp_c + reanalysis_precip_amt_kg_per_m2
rf_iq_regressor3 = randomForest(rf_iq_imp_top6, data = subset(iq_training, select = -c(1:4, 26:28)))
print(rf_iq_regressor3)
## 
## Call:
##  randomForest(formula = rf_iq_imp_top6, data = subset(iq_training,      select = -c(1:4, 26:28))) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 127.5276
##                     % Var explained: 2.49

Save our models to a file called dengue_random_forest.RData.

save(rf_df_regressor, rf_df_regressor2, rf_df_regressor3, rf_sj_regressor, rf_sj_regressor2, rf_sj_regressor3, rf_iq_regressor, rf_iq_regressor2, rf_iq_regressor3, file = "dengue_random_forest.RData")

We validate our models in “DengAI Best Model Validation & Submission.RMD”.
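
For reference, the validation script can reload the forests and generate predictions along these lines. This is only a sketch: df_testing is an assumed name for the held-out part of the earlier split.

load("dengue_random_forest.RData")                          # restores all nine rf_* models
pred_df <- predict(rf_df_regressor3, newdata = df_testing)  # df_testing is hypothetical here
head(round(pred_df))                                        # case counts are integers, so round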