train_labels<-read.csv("dengue_labels_train.csv")
test_features<-read.csv("dengue_features_test.csv")
train_features<-read.csv("dengue_features_train.csv")
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(Metrics)
## Warning: package 'Metrics' was built under R version 3.5.3
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
Train<-merge.data.frame(train_features,train_labels)
Checking if the merging is done correctly.
paste("Number of Rows/columns in Train features:",nrow(train_features),ncol(train_features),sep=" ")
## [1] "Number of Rows/columns in Train features: 1456 24"
paste("Number of Rows/columns in Train labels:",nrow(train_labels),ncol(train_labels),sep=" ")
## [1] "Number of Rows/columns in Train labels: 1456 4"
paste("Number of Rows/columns in Train combined dataset:",nrow(Train),ncol(Train),sep=" ")
## [1] "Number of Rows/columns in Train combined dataset: 1456 25"
Combining the Train and Test data for better data cleaning.
paste("Dimensions of Train data",dim(Train),sep=" ")
## [1] "Dimensions of Train data 1456" "Dimensions of Train data 25"
paste("Dimensions of Test Data",dim(test_features),sep=" ")
## [1] "Dimensions of Test Data 416" "Dimensions of Test Data 24"
Looking at the names of Columns -Train
names(Train)
## [1] "city"
## [2] "year"
## [3] "weekofyear"
## [4] "week_start_date"
## [5] "ndvi_ne"
## [6] "ndvi_nw"
## [7] "ndvi_se"
## [8] "ndvi_sw"
## [9] "precipitation_amt_mm"
## [10] "reanalysis_air_temp_k"
## [11] "reanalysis_avg_temp_k"
## [12] "reanalysis_dew_point_temp_k"
## [13] "reanalysis_max_air_temp_k"
## [14] "reanalysis_min_air_temp_k"
## [15] "reanalysis_precip_amt_kg_per_m2"
## [16] "reanalysis_relative_humidity_percent"
## [17] "reanalysis_sat_precip_amt_mm"
## [18] "reanalysis_specific_humidity_g_per_kg"
## [19] "reanalysis_tdtr_k"
## [20] "station_avg_temp_c"
## [21] "station_diur_temp_rng_c"
## [22] "station_max_temp_c"
## [23] "station_min_temp_c"
## [24] "station_precip_mm"
## [25] "total_cases"
Looking at the names of Columns -Test
names(test_features)
## [1] "city"
## [2] "year"
## [3] "weekofyear"
## [4] "week_start_date"
## [5] "ndvi_ne"
## [6] "ndvi_nw"
## [7] "ndvi_se"
## [8] "ndvi_sw"
## [9] "precipitation_amt_mm"
## [10] "reanalysis_air_temp_k"
## [11] "reanalysis_avg_temp_k"
## [12] "reanalysis_dew_point_temp_k"
## [13] "reanalysis_max_air_temp_k"
## [14] "reanalysis_min_air_temp_k"
## [15] "reanalysis_precip_amt_kg_per_m2"
## [16] "reanalysis_relative_humidity_percent"
## [17] "reanalysis_sat_precip_amt_mm"
## [18] "reanalysis_specific_humidity_g_per_kg"
## [19] "reanalysis_tdtr_k"
## [20] "station_avg_temp_c"
## [21] "station_diur_temp_rng_c"
## [22] "station_max_temp_c"
## [23] "station_min_temp_c"
## [24] "station_precip_mm"
Adding the total cases to test and adding a identifier column
test_features$total_cases<-NA
test_features$Set<-"Test"
Train$Set<-"Train"
Combining both Train and Test Dataset.
data<-rbind(Train,test_features)
str(data)
## 'data.frame': 1872 obs. of 26 variables:
## $ city : Factor w/ 2 levels "iq","sj": 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ weekofyear : int 26 27 28 29 30 31 32 33 34 35 ...
## $ week_start_date : Factor w/ 1205 levels "1990-04-30","1990-05-07",..: 530 531 532 533 534 535 536 537 538 539 ...
## $ ndvi_ne : num 0.193 0.217 0.177 0.228 0.329 ...
## $ ndvi_nw : num 0.132 0.276 0.173 0.145 0.322 ...
## $ ndvi_se : num 0.341 0.289 0.204 0.254 0.254 ...
## $ ndvi_sw : num 0.247 0.242 0.128 0.2 0.361 ...
## $ precipitation_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
## $ reanalysis_air_temp_k : num 297 297 296 295 296 ...
## $ reanalysis_avg_temp_k : num 298 298 297 296 298 ...
## $ reanalysis_dew_point_temp_k : num 295 295 296 293 294 ...
## $ reanalysis_max_air_temp_k : num 307 307 304 304 307 ...
## $ reanalysis_min_air_temp_k : num 293 291 293 289 292 ...
## $ reanalysis_precip_amt_kg_per_m2 : num 43.2 46 64.8 24 31.8 ...
## $ reanalysis_relative_humidity_percent : num 92.4 93.6 95.8 87.2 88.2 ...
## $ reanalysis_sat_precip_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
## $ reanalysis_specific_humidity_g_per_kg: num 16.7 16.9 17.1 14.4 15.4 ...
## $ reanalysis_tdtr_k : num 8.93 10.31 7.39 9.11 9.5 ...
## $ station_avg_temp_c : num 26.4 26.9 26.8 25.8 26.6 ...
## $ station_diur_temp_rng_c : num 10.8 11.6 11.5 10.5 11.5 ...
## $ station_max_temp_c : num 32.5 34 33 31.5 33.3 32 34 33 34 34 ...
## $ station_min_temp_c : num 20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
## $ station_precip_mm : num 3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
## $ total_cases : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Set : chr "Train" "Train" "Train" "Train" ...
missing_values<-summarise_all(data,funs(sum(is.na(.))/n()))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.))
##
## # After:
## list(name = ~ f(.))
## This warning is displayed once per session.
missing_values<-gather(missing_values,key="Feature",value = "Missing_percentage")
g<-ggplot(data=missing_values)
g<-g+geom_bar(stat="identity",aes(x=reorder(Feature,-Missing_percentage),y=Missing_percentage))
g<-g+coord_flip()+ggtitle("Missing %-Features")+ylab("Features")+xlab("Missing Percentage")
g
Replaceing Missing Values with mean values
for(i in 5:24){
data[,i]<-replace_na(data[,i],mean(data[,i],na.rm = T))
}
Checking of Missing Values have been removed
missing_values<-summarise_all(data,funs(sum(is.na(.))/n()))
missing_values<-gather(missing_values,key="Feature",value = "Missing_percentage")
g<-ggplot(data=missing_values)
g<-g+geom_bar(stat="identity",aes(x=reorder(Feature,-Missing_percentage),y=Missing_percentage))
g<-g+coord_flip()+ggtitle("Missin %-Features")+ylab("Features")+xlab("Missing Percentage")
g
Changing the data types to appropriate classes.
str(data)
## 'data.frame': 1872 obs. of 26 variables:
## $ city : Factor w/ 2 levels "iq","sj": 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ weekofyear : int 26 27 28 29 30 31 32 33 34 35 ...
## $ week_start_date : Factor w/ 1205 levels "1990-04-30","1990-05-07",..: 530 531 532 533 534 535 536 537 538 539 ...
## $ ndvi_ne : num 0.193 0.217 0.177 0.228 0.329 ...
## $ ndvi_nw : num 0.132 0.276 0.173 0.145 0.322 ...
## $ ndvi_se : num 0.341 0.289 0.204 0.254 0.254 ...
## $ ndvi_sw : num 0.247 0.242 0.128 0.2 0.361 ...
## $ precipitation_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
## $ reanalysis_air_temp_k : num 297 297 296 295 296 ...
## $ reanalysis_avg_temp_k : num 298 298 297 296 298 ...
## $ reanalysis_dew_point_temp_k : num 295 295 296 293 294 ...
## $ reanalysis_max_air_temp_k : num 307 307 304 304 307 ...
## $ reanalysis_min_air_temp_k : num 293 291 293 289 292 ...
## $ reanalysis_precip_amt_kg_per_m2 : num 43.2 46 64.8 24 31.8 ...
## $ reanalysis_relative_humidity_percent : num 92.4 93.6 95.8 87.2 88.2 ...
## $ reanalysis_sat_precip_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
## $ reanalysis_specific_humidity_g_per_kg: num 16.7 16.9 17.1 14.4 15.4 ...
## $ reanalysis_tdtr_k : num 8.93 10.31 7.39 9.11 9.5 ...
## $ station_avg_temp_c : num 26.4 26.9 26.8 25.8 26.6 ...
## $ station_diur_temp_rng_c : num 10.8 11.6 11.5 10.5 11.5 ...
## $ station_max_temp_c : num 32.5 34 33 31.5 33.3 32 34 33 34 34 ...
## $ station_min_temp_c : num 20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
## $ station_precip_mm : num 3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
## $ total_cases : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Set : chr "Train" "Train" "Train" "Train" ...
data$week_start_date<-as.POSIXct((strptime(data$week_start_date,"%Y-%m-%d")))
data$city<-as.factor(ifelse(data$city=="iq",0,1))
#names(data)[5:24]<-c(paste("F",5:24,sep=""))
str(data)
## 'data.frame': 1872 obs. of 26 variables:
## $ city : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ weekofyear : int 26 27 28 29 30 31 32 33 34 35 ...
## $ week_start_date : POSIXct, format: "2000-07-01" "2000-07-08" ...
## $ ndvi_ne : num 0.193 0.217 0.177 0.228 0.329 ...
## $ ndvi_nw : num 0.132 0.276 0.173 0.145 0.322 ...
## $ ndvi_se : num 0.341 0.289 0.204 0.254 0.254 ...
## $ ndvi_sw : num 0.247 0.242 0.128 0.2 0.361 ...
## $ precipitation_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
## $ reanalysis_air_temp_k : num 297 297 296 295 296 ...
## $ reanalysis_avg_temp_k : num 298 298 297 296 298 ...
## $ reanalysis_dew_point_temp_k : num 295 295 296 293 294 ...
## $ reanalysis_max_air_temp_k : num 307 307 304 304 307 ...
## $ reanalysis_min_air_temp_k : num 293 291 293 289 292 ...
## $ reanalysis_precip_amt_kg_per_m2 : num 43.2 46 64.8 24 31.8 ...
## $ reanalysis_relative_humidity_percent : num 92.4 93.6 95.8 87.2 88.2 ...
## $ reanalysis_sat_precip_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
## $ reanalysis_specific_humidity_g_per_kg: num 16.7 16.9 17.1 14.4 15.4 ...
## $ reanalysis_tdtr_k : num 8.93 10.31 7.39 9.11 9.5 ...
## $ station_avg_temp_c : num 26.4 26.9 26.8 25.8 26.6 ...
## $ station_diur_temp_rng_c : num 10.8 11.6 11.5 10.5 11.5 ...
## $ station_max_temp_c : num 32.5 34 33 31.5 33.3 32 34 33 34 34 ...
## $ station_min_temp_c : num 20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
## $ station_precip_mm : num 3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
## $ total_cases : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Set : chr "Train" "Train" "Train" "Train" ...
co<-cor(data[,5:24],method=("spearman"))
co<-round(co,1)
corrplot(co,tl.cex = .5)
data$Month<-format(data$week_start_date,"%m")
data$Month<-as.numeric(data$Month)
Train<-filter(data,Set=="Train")
Test<-filter(data,Set=="Test")
levels(Train$city)<-c("IQ","SJ")
g<-ggplot(data=Train,aes(x=Month,y=total_cases,fill=city))
g<-g+geom_bar(stat="identity")+facet_wrap(.~city)
g
g<-ggplot(data=Train,aes(x=Train$total_cases,fill=city))
g<-g+geom_bar(stat="count")+xlab("Total Dengue Cases")
g
g<-ggplot(data=Train[Train$city=="IQ",],aes(x=total_cases,fill=city))
g<-g+geom_bar(stat="count")+xlab("Total Dengue Cases in IQ")
g
Looking closes for IQ, checking cases more than 35.
iqd<-Train[Train$city=="IQ" & Train$total_cases>35,]
g<-ggplot(data=Train[Train$city=="SJ",],aes(x=total_cases,fill=city))
g<-g+geom_bar(stat="count")+xlab("Total Dengue Cases in SJ")
g
g<-ggplot(data=Train,aes(x=city,y=total_cases,fill=city))
g<-g+geom_bar(stat="identity")
g
g<-ggplot(data=Train,aes(x=year,y=total_cases,fill=city))
g<-g+geom_bar(stat="identity")+facet_wrap(.~city)
g
g<-ggplot(data=Train,aes(x=weekofyear,y=total_cases,fill=city))
g<-g+geom_bar(stat="identity")+facet_wrap(.~city)
g
Train<-filter(data,Set=="Train")
Test<-filter(data,Set=="Test")
sj<-filter(Train,city=="1")
sj<-sj[,5:25]
co_sj<-cor(as.matrix(sj[,-21]),as.matrix(sj[,21]))
co_sj<-as.data.frame(co_sj)
co_sj$Features<-rownames(co_sj)
g<-ggplot(data=co_sj)
g<-g+geom_bar(stat="identity",aes(y=V1,x=reorder(Features,V1)))+ylab("Correlation")+xlab("Features")
g<-g+coord_flip()+ggtitle("Correlation with Cases in SJ")
g
co_v<-cor(sj)
corrplot(co_v,tl.cex=.7)
iq<-filter(Train,city=="0")
iq<-iq[,5:25]
co_iq<-cor(as.matrix(iq[,-21]),as.matrix(iq[,21]))
co_iq<-as.data.frame(co_iq)
co_iq$Features<-rownames(co_iq)
g<-ggplot(data=co_iq)
g<-g+geom_bar(stat="identity",aes(y=V1,x=reorder(Features,V1)))+ylab("Correlation")+xlab("Features")
g<-g+coord_flip()+ggtitle("Correlation with Cases in IQ")
g
co_v<-cor(iq)
corrplot(co_v,tl.cex=.7)
Predictors for Iq and SJ and PreProcessing
#Filtering the cases in IQ with more than 50 cases- as they are out of ordinary and outliers.
a<-as.numeric(row.names(data[data$Set=="Train" & data$city=="0" & data$total_cases>35,]))
b<-as.numeric(row.names(data[data$Set=="Train" & data$city=="1" & data$total_cases>70,]))
data<-data[-c(a,b),]
Train<-filter(data,Set=="Train")
Test<-filter(data,Set=="Test")
Train$Set<-NULL
Test$Set<-NULL
split<-sample(nrow(Train),nrow(Train)*0.7)
Train<-Train[split,]
Train_validation<-Train[-split,]
Train_validation_iq<-filter(Train_validation,city=="0")
Train_validation_sj<-filter(Train_validation,city=="1")
Train_iq<-filter(Train,city=="0")
Train_sj<-filter(Train,city=="1")
Test_iq<-filter(Test,city=="0")
Test_sj<-filter(Test,city=="1")
Train_iq<-select(Train_iq,c(weekofyear,year,reanalysis_specific_humidity_g_per_kg,reanalysis_dew_point_temp_k,station_avg_temp_c,station_min_temp_c,total_cases))
Test_iq<-select(Test_iq,c(weekofyear,year,reanalysis_specific_humidity_g_per_kg,reanalysis_dew_point_temp_k,station_avg_temp_c,station_min_temp_c,total_cases))
Train_sj<-select(Train_sj,c(weekofyear,year,reanalysis_specific_humidity_g_per_kg,reanalysis_dew_point_temp_k,station_avg_temp_c,station_min_temp_c,total_cases))
Test_sj<-select(Test_sj,c(weekofyear,year,reanalysis_specific_humidity_g_per_kg,reanalysis_dew_point_temp_k,station_avg_temp_c,station_min_temp_c,total_cases))
tr<-trainControl(method = "repeatedcv",number = 10,repeats = 5)
rf_iq<-train(total_cases~.,data=Train_iq,ntree=1000,trControl=tr)
#rf_iq<-randomForest(total_cases~.,data=Train_iq)
predicted_rf_iq<-predict(rf_iq,data=Train_iq)
rf_sj<-train(total_cases~.,data=Train_sj,ntree=1000,trControl=tr)
#rf_sj<-randomForest(total_cases~.,data=Train_sj)
predicted_rf_sj<-predict(rf_sj,data=Train_sj)
predicted_rf_iq<-ifelse(predicted_rf_iq<0, 0 , predicted_rf_iq)
errors<-as.data.frame(ae(Train_iq$total_cases,predicted_rf_iq))
names(errors)<-"Residual error"
g<-ggplot(data=errors)
g<-g+geom_histogram(aes(x=errors$`Residual error`))+xlab("Residual error in IQ")
g
predicted_rf_sj<-ifelse(predicted_rf_sj<0, 0 , predicted_rf_sj)
errors<-as.data.frame(ae(Train_sj$total_cases,predicted_rf_sj))
names(errors)<-"Residual error"
g<-ggplot(data=errors)
g<-g+geom_histogram(aes(x=errors$`Residual error`))+xlab("Residual error in SJ")
g
paste("The Root Mean Squared Error in IQ is:",RMSE(Train_iq$total_cases,predicted_rf_iq),sep=" ")
## [1] "The Root Mean Squared Error in IQ is: 2.33830883911239"
paste("The R2 statistic in IQ is:",R2(predicted_rf_iq,Train_iq$total_cases))
## [1] "The R2 statistic in IQ is: 0.925784323899761"
paste("The Mean absolute error of IQ is:",mae(predicted_rf_iq,Train_iq$total_cases),sep=" ")
## [1] "The Mean absolute error of IQ is: 1.52910462700661"
paste("The Root Mean Squared Error in SJ is:",RMSE(Train_sj$total_cases,predicted_rf_sj),sep=" ")
## [1] "The Root Mean Squared Error in SJ is: 3.82629738625393"
paste("The R2 statistic in SJ is:",R2(predicted_rf_sj,Train_sj$total_cases))
## [1] "The R2 statistic in SJ is: 0.958922791276053"
paste("The Mean absolute error of SJ is:",mae(predicted_rf_sj,Train_sj$total_cases),sep=" ")
## [1] "The Mean absolute error of SJ is: 2.82575209394454"
Predicting for Test.
test_predict_iq<-predict(rf_iq,newdata = Test_iq)
test_predict_iq<-round(test_predict_iq,0)
test_predict_iq<-ifelse(test_predict_iq<0,0,test_predict_iq)
test_predict_sj<-predict(rf_sj,newdata = Test_sj)
test_predict_sj<-round(test_predict_sj,0)
test_predict_sj<-ifelse(test_predict_sj<0,0,test_predict_sj)
levels(Test$city)<-c("iq","sj")
test_predict<-c(test_predict_sj,test_predict_iq)
sub<-data.frame(city=Test$city,year=Test$year,weekofyear=Test$weekofyear,total_cases=test_predict)
write.csv(sub,"rf.csv",row.names = FALSE)
pred_iq_valid<-predict(rf_iq,newdata=Train_validation_iq)
pred_iq_valid<-round(pred_iq_valid,0)
pred_iq_valid<-ifelse(pred_iq_valid<0,0,pred_iq_valid)
pred_sj_valid<-predict(rf_sj,newdata = Train_validation_sj)
pred_sj_valid<-round(pred_sj_valid,0)
pred_sj_valid<-ifelse(pred_sj_valid<0,0,pred_sj_valid)
visual_valid<-data.frame(weekDateValidation=Train_validation_sj$week_start_date,Cases=Train_validation_sj$total_cases,Predicted=pred_sj_valid)
g<-ggplot(data=visual_valid)
g<-g+geom_line(aes(x=weekDateValidation,y=Cases,col="Train"))
g<-g+geom_line(aes(x=weekDateValidation,y=pred_sj_valid,col="Predicted"))
g<-g+ggtitle("Comparing the predicted and Train validated set in SJ.")
g
visual_valid<-data.frame(weekDateValidation=Train_validation_iq$week_start_date,Cases=Train_validation_iq$total_cases,Predicted=pred_iq_valid)
g<-ggplot(data=visual_valid)
g<-g+geom_line(aes(x=weekDateValidation,y=Cases,col="Train"))
g<-g+geom_line(aes(x=weekDateValidation,y=pred_iq_valid,col="Predicted"))
g<-g+ggtitle("Comparing the predicted and Train validated set in IQ.")
g