Introduction

train_labels<-read.csv("dengue_labels_train.csv")
test_features<-read.csv("dengue_features_test.csv")
train_features<-read.csv("dengue_features_train.csv")
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(Metrics)
## Warning: package 'Metrics' was built under R version 3.5.3
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall

Merging

Train<-merge.data.frame(train_features,train_labels)

Checking if the merging is done correctly.

paste("Number of Rows/columns in Train features:",nrow(train_features),ncol(train_features),sep=" ")
## [1] "Number of Rows/columns in Train features: 1456 24"
paste("Number of Rows/columns in Train labels:",nrow(train_labels),ncol(train_labels),sep=" ")
## [1] "Number of Rows/columns in Train labels: 1456 4"
paste("Number of Rows/columns in Train combined dataset:",nrow(Train),ncol(Train),sep=" ")
## [1] "Number of Rows/columns in Train combined dataset: 1456 25"

Combining the Train and Test data for better data cleaning.

paste("Dimensions of Train data",dim(Train),sep=" ")
## [1] "Dimensions of Train data 1456" "Dimensions of Train data 25"
paste("Dimensions of Test Data",dim(test_features),sep=" ")
## [1] "Dimensions of Test Data 416" "Dimensions of Test Data 24"

Looking at the names of Columns -Train

names(Train)
##  [1] "city"                                 
##  [2] "year"                                 
##  [3] "weekofyear"                           
##  [4] "week_start_date"                      
##  [5] "ndvi_ne"                              
##  [6] "ndvi_nw"                              
##  [7] "ndvi_se"                              
##  [8] "ndvi_sw"                              
##  [9] "precipitation_amt_mm"                 
## [10] "reanalysis_air_temp_k"                
## [11] "reanalysis_avg_temp_k"                
## [12] "reanalysis_dew_point_temp_k"          
## [13] "reanalysis_max_air_temp_k"            
## [14] "reanalysis_min_air_temp_k"            
## [15] "reanalysis_precip_amt_kg_per_m2"      
## [16] "reanalysis_relative_humidity_percent" 
## [17] "reanalysis_sat_precip_amt_mm"         
## [18] "reanalysis_specific_humidity_g_per_kg"
## [19] "reanalysis_tdtr_k"                    
## [20] "station_avg_temp_c"                   
## [21] "station_diur_temp_rng_c"              
## [22] "station_max_temp_c"                   
## [23] "station_min_temp_c"                   
## [24] "station_precip_mm"                    
## [25] "total_cases"

Looking at the names of Columns -Test

names(test_features)
##  [1] "city"                                 
##  [2] "year"                                 
##  [3] "weekofyear"                           
##  [4] "week_start_date"                      
##  [5] "ndvi_ne"                              
##  [6] "ndvi_nw"                              
##  [7] "ndvi_se"                              
##  [8] "ndvi_sw"                              
##  [9] "precipitation_amt_mm"                 
## [10] "reanalysis_air_temp_k"                
## [11] "reanalysis_avg_temp_k"                
## [12] "reanalysis_dew_point_temp_k"          
## [13] "reanalysis_max_air_temp_k"            
## [14] "reanalysis_min_air_temp_k"            
## [15] "reanalysis_precip_amt_kg_per_m2"      
## [16] "reanalysis_relative_humidity_percent" 
## [17] "reanalysis_sat_precip_amt_mm"         
## [18] "reanalysis_specific_humidity_g_per_kg"
## [19] "reanalysis_tdtr_k"                    
## [20] "station_avg_temp_c"                   
## [21] "station_diur_temp_rng_c"              
## [22] "station_max_temp_c"                   
## [23] "station_min_temp_c"                   
## [24] "station_precip_mm"

Adding the total cases to test and adding a identifier column

test_features$total_cases<-NA
test_features$Set<-"Test"
Train$Set<-"Train"

Combining both Train and Test Dataset.

data<-rbind(Train,test_features)
str(data)
## 'data.frame':    1872 obs. of  26 variables:
##  $ city                                 : Factor w/ 2 levels "iq","sj": 1 1 1 1 1 1 1 1 1 1 ...
##  $ year                                 : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ weekofyear                           : int  26 27 28 29 30 31 32 33 34 35 ...
##  $ week_start_date                      : Factor w/ 1205 levels "1990-04-30","1990-05-07",..: 530 531 532 533 534 535 536 537 538 539 ...
##  $ ndvi_ne                              : num  0.193 0.217 0.177 0.228 0.329 ...
##  $ ndvi_nw                              : num  0.132 0.276 0.173 0.145 0.322 ...
##  $ ndvi_se                              : num  0.341 0.289 0.204 0.254 0.254 ...
##  $ ndvi_sw                              : num  0.247 0.242 0.128 0.2 0.361 ...
##  $ precipitation_amt_mm                 : num  25.4 60.6 55.5 5.6 62.8 ...
##  $ reanalysis_air_temp_k                : num  297 297 296 295 296 ...
##  $ reanalysis_avg_temp_k                : num  298 298 297 296 298 ...
##  $ reanalysis_dew_point_temp_k          : num  295 295 296 293 294 ...
##  $ reanalysis_max_air_temp_k            : num  307 307 304 304 307 ...
##  $ reanalysis_min_air_temp_k            : num  293 291 293 289 292 ...
##  $ reanalysis_precip_amt_kg_per_m2      : num  43.2 46 64.8 24 31.8 ...
##  $ reanalysis_relative_humidity_percent : num  92.4 93.6 95.8 87.2 88.2 ...
##  $ reanalysis_sat_precip_amt_mm         : num  25.4 60.6 55.5 5.6 62.8 ...
##  $ reanalysis_specific_humidity_g_per_kg: num  16.7 16.9 17.1 14.4 15.4 ...
##  $ reanalysis_tdtr_k                    : num  8.93 10.31 7.39 9.11 9.5 ...
##  $ station_avg_temp_c                   : num  26.4 26.9 26.8 25.8 26.6 ...
##  $ station_diur_temp_rng_c              : num  10.8 11.6 11.5 10.5 11.5 ...
##  $ station_max_temp_c                   : num  32.5 34 33 31.5 33.3 32 34 33 34 34 ...
##  $ station_min_temp_c                   : num  20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
##  $ station_precip_mm                    : num  3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
##  $ total_cases                          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Set                                  : chr  "Train" "Train" "Train" "Train" ...

Data cleaning

Looking for Missing Values

missing_values<-summarise_all(data,funs(sum(is.na(.))/n()))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
##   # Before:
##   funs(name = f(.))
## 
##   # After: 
##   list(name = ~ f(.))
## This warning is displayed once per session.
missing_values<-gather(missing_values,key="Feature",value = "Missing_percentage")

g<-ggplot(data=missing_values)
g<-g+geom_bar(stat="identity",aes(x=reorder(Feature,-Missing_percentage),y=Missing_percentage))
g<-g+coord_flip()+ggtitle("Missing %-Features")+ylab("Features")+xlab("Missing Percentage")
g

Replaceing Missing Values with mean values

for(i in 5:24){
  data[,i]<-replace_na(data[,i],mean(data[,i],na.rm = T))
}

Checking of Missing Values have been removed

missing_values<-summarise_all(data,funs(sum(is.na(.))/n()))
missing_values<-gather(missing_values,key="Feature",value = "Missing_percentage")

g<-ggplot(data=missing_values)
g<-g+geom_bar(stat="identity",aes(x=reorder(Feature,-Missing_percentage),y=Missing_percentage))
g<-g+coord_flip()+ggtitle("Missin %-Features")+ylab("Features")+xlab("Missing Percentage")
g

Changing the data types to appropriate classes.

str(data)
## 'data.frame':    1872 obs. of  26 variables:
##  $ city                                 : Factor w/ 2 levels "iq","sj": 1 1 1 1 1 1 1 1 1 1 ...
##  $ year                                 : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ weekofyear                           : int  26 27 28 29 30 31 32 33 34 35 ...
##  $ week_start_date                      : Factor w/ 1205 levels "1990-04-30","1990-05-07",..: 530 531 532 533 534 535 536 537 538 539 ...
##  $ ndvi_ne                              : num  0.193 0.217 0.177 0.228 0.329 ...
##  $ ndvi_nw                              : num  0.132 0.276 0.173 0.145 0.322 ...
##  $ ndvi_se                              : num  0.341 0.289 0.204 0.254 0.254 ...
##  $ ndvi_sw                              : num  0.247 0.242 0.128 0.2 0.361 ...
##  $ precipitation_amt_mm                 : num  25.4 60.6 55.5 5.6 62.8 ...
##  $ reanalysis_air_temp_k                : num  297 297 296 295 296 ...
##  $ reanalysis_avg_temp_k                : num  298 298 297 296 298 ...
##  $ reanalysis_dew_point_temp_k          : num  295 295 296 293 294 ...
##  $ reanalysis_max_air_temp_k            : num  307 307 304 304 307 ...
##  $ reanalysis_min_air_temp_k            : num  293 291 293 289 292 ...
##  $ reanalysis_precip_amt_kg_per_m2      : num  43.2 46 64.8 24 31.8 ...
##  $ reanalysis_relative_humidity_percent : num  92.4 93.6 95.8 87.2 88.2 ...
##  $ reanalysis_sat_precip_amt_mm         : num  25.4 60.6 55.5 5.6 62.8 ...
##  $ reanalysis_specific_humidity_g_per_kg: num  16.7 16.9 17.1 14.4 15.4 ...
##  $ reanalysis_tdtr_k                    : num  8.93 10.31 7.39 9.11 9.5 ...
##  $ station_avg_temp_c                   : num  26.4 26.9 26.8 25.8 26.6 ...
##  $ station_diur_temp_rng_c              : num  10.8 11.6 11.5 10.5 11.5 ...
##  $ station_max_temp_c                   : num  32.5 34 33 31.5 33.3 32 34 33 34 34 ...
##  $ station_min_temp_c                   : num  20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
##  $ station_precip_mm                    : num  3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
##  $ total_cases                          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Set                                  : chr  "Train" "Train" "Train" "Train" ...
data$week_start_date<-as.POSIXct((strptime(data$week_start_date,"%Y-%m-%d")))
data$city<-as.factor(ifelse(data$city=="iq",0,1))
#names(data)[5:24]<-c(paste("F",5:24,sep=""))
str(data)
## 'data.frame':    1872 obs. of  26 variables:
##  $ city                                 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ year                                 : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ weekofyear                           : int  26 27 28 29 30 31 32 33 34 35 ...
##  $ week_start_date                      : POSIXct, format: "2000-07-01" "2000-07-08" ...
##  $ ndvi_ne                              : num  0.193 0.217 0.177 0.228 0.329 ...
##  $ ndvi_nw                              : num  0.132 0.276 0.173 0.145 0.322 ...
##  $ ndvi_se                              : num  0.341 0.289 0.204 0.254 0.254 ...
##  $ ndvi_sw                              : num  0.247 0.242 0.128 0.2 0.361 ...
##  $ precipitation_amt_mm                 : num  25.4 60.6 55.5 5.6 62.8 ...
##  $ reanalysis_air_temp_k                : num  297 297 296 295 296 ...
##  $ reanalysis_avg_temp_k                : num  298 298 297 296 298 ...
##  $ reanalysis_dew_point_temp_k          : num  295 295 296 293 294 ...
##  $ reanalysis_max_air_temp_k            : num  307 307 304 304 307 ...
##  $ reanalysis_min_air_temp_k            : num  293 291 293 289 292 ...
##  $ reanalysis_precip_amt_kg_per_m2      : num  43.2 46 64.8 24 31.8 ...
##  $ reanalysis_relative_humidity_percent : num  92.4 93.6 95.8 87.2 88.2 ...
##  $ reanalysis_sat_precip_amt_mm         : num  25.4 60.6 55.5 5.6 62.8 ...
##  $ reanalysis_specific_humidity_g_per_kg: num  16.7 16.9 17.1 14.4 15.4 ...
##  $ reanalysis_tdtr_k                    : num  8.93 10.31 7.39 9.11 9.5 ...
##  $ station_avg_temp_c                   : num  26.4 26.9 26.8 25.8 26.6 ...
##  $ station_diur_temp_rng_c              : num  10.8 11.6 11.5 10.5 11.5 ...
##  $ station_max_temp_c                   : num  32.5 34 33 31.5 33.3 32 34 33 34 34 ...
##  $ station_min_temp_c                   : num  20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
##  $ station_precip_mm                    : num  3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
##  $ total_cases                          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Set                                  : chr  "Train" "Train" "Train" "Train" ...

Checking for Correlation

co<-cor(data[,5:24],method=("spearman"))
co<-round(co,1)
corrplot(co,tl.cex = .5)

Month Column

data$Month<-format(data$week_start_date,"%m")
data$Month<-as.numeric(data$Month)

Exploratory Data Analysis.

Train<-filter(data,Set=="Train")
Test<-filter(data,Set=="Test")
levels(Train$city)<-c("IQ","SJ")

Month

g<-ggplot(data=Train,aes(x=Month,y=total_cases,fill=city))
g<-g+geom_bar(stat="identity")+facet_wrap(.~city)
g

Looking at the Total Cases of Dengue

g<-ggplot(data=Train,aes(x=Train$total_cases,fill=city))
g<-g+geom_bar(stat="count")+xlab("Total Dengue Cases")
g

In IQ

g<-ggplot(data=Train[Train$city=="IQ",],aes(x=total_cases,fill=city))
g<-g+geom_bar(stat="count")+xlab("Total Dengue Cases in IQ")
g

Looking closes for IQ, checking cases more than 35.

iqd<-Train[Train$city=="IQ" & Train$total_cases>35,]

In SJ

g<-ggplot(data=Train[Train$city=="SJ",],aes(x=total_cases,fill=city))
g<-g+geom_bar(stat="count")+xlab("Total Dengue Cases in SJ")
g

City

g<-ggplot(data=Train,aes(x=city,y=total_cases,fill=city))
g<-g+geom_bar(stat="identity")
g

Year

g<-ggplot(data=Train,aes(x=year,y=total_cases,fill=city))
g<-g+geom_bar(stat="identity")+facet_wrap(.~city)
g

Week of the year

g<-ggplot(data=Train,aes(x=weekofyear,y=total_cases,fill=city))
g<-g+geom_bar(stat="identity")+facet_wrap(.~city)
g

Correlation with Total Cases.

In SJ

Train<-filter(data,Set=="Train")
Test<-filter(data,Set=="Test")


sj<-filter(Train,city=="1")
sj<-sj[,5:25]
co_sj<-cor(as.matrix(sj[,-21]),as.matrix(sj[,21]))
co_sj<-as.data.frame(co_sj)
co_sj$Features<-rownames(co_sj)


g<-ggplot(data=co_sj)
g<-g+geom_bar(stat="identity",aes(y=V1,x=reorder(Features,V1)))+ylab("Correlation")+xlab("Features")
g<-g+coord_flip()+ggtitle("Correlation with Cases in SJ")
g

correlation within variables

co_v<-cor(sj)
corrplot(co_v,tl.cex=.7)

In IQ

iq<-filter(Train,city=="0")
iq<-iq[,5:25]
co_iq<-cor(as.matrix(iq[,-21]),as.matrix(iq[,21]))
co_iq<-as.data.frame(co_iq)
co_iq$Features<-rownames(co_iq)


g<-ggplot(data=co_iq)
g<-g+geom_bar(stat="identity",aes(y=V1,x=reorder(Features,V1)))+ylab("Correlation")+xlab("Features")
g<-g+coord_flip()+ggtitle("Correlation with Cases in IQ")
g

Correlation within the variables

co_v<-cor(iq)
corrplot(co_v,tl.cex=.7)

Prediction

Predictors for Iq and SJ and PreProcessing

#Filtering the cases in IQ with more than 50 cases- as they are out of ordinary and outliers.
a<-as.numeric(row.names(data[data$Set=="Train" & data$city=="0" & data$total_cases>35,]))
b<-as.numeric(row.names(data[data$Set=="Train" & data$city=="1" & data$total_cases>70,]))
data<-data[-c(a,b),]

Train<-filter(data,Set=="Train")
Test<-filter(data,Set=="Test")

Train$Set<-NULL
Test$Set<-NULL

split<-sample(nrow(Train),nrow(Train)*0.7)
Train<-Train[split,]
Train_validation<-Train[-split,]
Train_validation_iq<-filter(Train_validation,city=="0")
Train_validation_sj<-filter(Train_validation,city=="1")


Train_iq<-filter(Train,city=="0")
Train_sj<-filter(Train,city=="1")

Test_iq<-filter(Test,city=="0")
Test_sj<-filter(Test,city=="1")

Train_iq<-select(Train_iq,c(weekofyear,year,reanalysis_specific_humidity_g_per_kg,reanalysis_dew_point_temp_k,station_avg_temp_c,station_min_temp_c,total_cases))
Test_iq<-select(Test_iq,c(weekofyear,year,reanalysis_specific_humidity_g_per_kg,reanalysis_dew_point_temp_k,station_avg_temp_c,station_min_temp_c,total_cases))

Train_sj<-select(Train_sj,c(weekofyear,year,reanalysis_specific_humidity_g_per_kg,reanalysis_dew_point_temp_k,station_avg_temp_c,station_min_temp_c,total_cases))
Test_sj<-select(Test_sj,c(weekofyear,year,reanalysis_specific_humidity_g_per_kg,reanalysis_dew_point_temp_k,station_avg_temp_c,station_min_temp_c,total_cases))

Using Random Forest

tr<-trainControl(method = "repeatedcv",number = 10,repeats = 5)
rf_iq<-train(total_cases~.,data=Train_iq,ntree=1000,trControl=tr)
#rf_iq<-randomForest(total_cases~.,data=Train_iq)
predicted_rf_iq<-predict(rf_iq,data=Train_iq)

rf_sj<-train(total_cases~.,data=Train_sj,ntree=1000,trControl=tr)
#rf_sj<-randomForest(total_cases~.,data=Train_sj)
predicted_rf_sj<-predict(rf_sj,data=Train_sj)

Metrics of Random Forest

predicted_rf_iq<-ifelse(predicted_rf_iq<0, 0 , predicted_rf_iq)

errors<-as.data.frame(ae(Train_iq$total_cases,predicted_rf_iq))
names(errors)<-"Residual error"

g<-ggplot(data=errors)
g<-g+geom_histogram(aes(x=errors$`Residual error`))+xlab("Residual error in IQ")
g

predicted_rf_sj<-ifelse(predicted_rf_sj<0, 0 , predicted_rf_sj)

errors<-as.data.frame(ae(Train_sj$total_cases,predicted_rf_sj))
names(errors)<-"Residual error"

g<-ggplot(data=errors)
g<-g+geom_histogram(aes(x=errors$`Residual error`))+xlab("Residual error in SJ")
g

RMSE ,R2, MAE

paste("The Root Mean Squared Error in IQ is:",RMSE(Train_iq$total_cases,predicted_rf_iq),sep=" ")
## [1] "The Root Mean Squared Error in IQ is: 2.33830883911239"
paste("The R2 statistic in IQ is:",R2(predicted_rf_iq,Train_iq$total_cases))
## [1] "The R2 statistic in IQ is: 0.925784323899761"
paste("The Mean absolute error of IQ is:",mae(predicted_rf_iq,Train_iq$total_cases),sep=" ")
## [1] "The Mean absolute error of IQ is: 1.52910462700661"
paste("The Root Mean Squared Error in SJ is:",RMSE(Train_sj$total_cases,predicted_rf_sj),sep=" ")
## [1] "The Root Mean Squared Error in SJ is: 3.82629738625393"
paste("The R2 statistic in SJ is:",R2(predicted_rf_sj,Train_sj$total_cases))
## [1] "The R2 statistic in SJ is: 0.958922791276053"
paste("The Mean absolute error of SJ is:",mae(predicted_rf_sj,Train_sj$total_cases),sep=" ")
## [1] "The Mean absolute error of SJ is: 2.82575209394454"

Predicting for Test.

test_predict_iq<-predict(rf_iq,newdata = Test_iq)
test_predict_iq<-round(test_predict_iq,0)
test_predict_iq<-ifelse(test_predict_iq<0,0,test_predict_iq)

test_predict_sj<-predict(rf_sj,newdata = Test_sj)
test_predict_sj<-round(test_predict_sj,0)
test_predict_sj<-ifelse(test_predict_sj<0,0,test_predict_sj)

Creating a submission file

levels(Test$city)<-c("iq","sj")
test_predict<-c(test_predict_sj,test_predict_iq)
sub<-data.frame(city=Test$city,year=Test$year,weekofyear=Test$weekofyear,total_cases=test_predict)


write.csv(sub,"rf.csv",row.names = FALSE)

Visualizing the Prediction

In SJ

In IQ

Visualizing the Validations

pred_iq_valid<-predict(rf_iq,newdata=Train_validation_iq)
pred_iq_valid<-round(pred_iq_valid,0)
pred_iq_valid<-ifelse(pred_iq_valid<0,0,pred_iq_valid)

pred_sj_valid<-predict(rf_sj,newdata = Train_validation_sj)
pred_sj_valid<-round(pred_sj_valid,0)
pred_sj_valid<-ifelse(pred_sj_valid<0,0,pred_sj_valid)

In SJ

visual_valid<-data.frame(weekDateValidation=Train_validation_sj$week_start_date,Cases=Train_validation_sj$total_cases,Predicted=pred_sj_valid)

g<-ggplot(data=visual_valid)
g<-g+geom_line(aes(x=weekDateValidation,y=Cases,col="Train"))
g<-g+geom_line(aes(x=weekDateValidation,y=pred_sj_valid,col="Predicted"))
g<-g+ggtitle("Comparing the predicted and Train validated set in SJ.")
g

In IQ

visual_valid<-data.frame(weekDateValidation=Train_validation_iq$week_start_date,Cases=Train_validation_iq$total_cases,Predicted=pred_iq_valid)

g<-ggplot(data=visual_valid)
g<-g+geom_line(aes(x=weekDateValidation,y=Cases,col="Train"))
g<-g+geom_line(aes(x=weekDateValidation,y=pred_iq_valid,col="Predicted"))
g<-g+ggtitle("Comparing the predicted and Train validated set in IQ.")
g