Predicting cases of dengue

Model SJ
Model IQ
Submission
Prophet try

### Import and preprocessing

# We first import the packages that we need to do the task:
library(readr)
library(Metrics)
library(ggplot2)
library(markdown)
library(corrplot)
library(dplyr)
library(caret)
# Seed the RNG once, up front, so the random steps that follow are repeatable.
set.seed(203)
# Change to the parent directory; the relative "./Datasets" paths below are
# resolved from there.
setwd("..")
# Read the training features and the training labels, then join the two
# tables on the columns they have in common (merge()'s default).
dengue <- read_csv(file = "./Datasets/dengue_features_train.csv")
dengue_label <- read_csv(file = "./Datasets/dengue_labels_train.csv")
total_dengue <- merge(x = dengue, y = dengue_label)
# Does the joined table contain at least one missing value?
any(is.na(total_dengue))

[1] TRUE

# Working copy of the merged data. NOTE: nothing is removed here -- this is a
# plain copy, so any missing values in total_dengue are still present.
dengue2 <- total_dengue
# Show the column names and types of the working copy.
str(dengue2)

'data.frame':   1456 obs. of  25 variables:
 $ city                                 : chr  "iq" "iq" "iq" "iq" ...
 $ year                                 : num  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
 $ weekofyear                           : num  26 27 28 29 30 31 32 33 34 35 ...
 $ week_start_date                      : Date, format: "2000-07-01" "2000-07-08" ...
 $ ndvi_ne                              : num  0.193 0.217 0.177 0.228 0.329 ...
 $ ndvi_nw                              : num  0.132 0.276 0.173 0.145 0.322 ...
 $ ndvi_se                              : num  0.341 0.289 0.204 0.254 0.254 ...
 $ ndvi_sw                              : num  0.247 0.242 0.128 0.2 0.361 ...
 $ precipitation_amt_mm                 : num  25.4 60.6 55.5 5.6 62.8 ...
 $ reanalysis_air_temp_k                : num  297 297 296 295 296 ...
 $ reanalysis_avg_temp_k                : num  298 298 297 296 298 ...
 $ reanalysis_dew_point_temp_k          : num  295 295 296 293 294 ...
 $ reanalysis_max_air_temp_k            : num  307 307 304 304 307 ...
 $ reanalysis_min_air_temp_k            : num  293 291 293 289 292 ...
 $ reanalysis_precip_amt_kg_per_m2      : num  43.2 46 64.8 24 31.8 ...
 $ reanalysis_relative_humidity_percent : num  92.4 93.6 95.8 87.2 88.2 ...
 $ reanalysis_sat_precip_amt_mm         : num  25.4 60.6 55.5 5.6 62.8 ...
 $ reanalysis_specific_humidity_g_per_kg: num  16.7 16.9 17.1 14.4 15.4 ...
 $ reanalysis_tdtr_k                    : num  8.93 10.31 7.39 9.11 9.5 ...
 $ station_avg_temp_c                   : num  26.4 26.9 26.8 25.8 26.6 ...
 $ station_diur_temp_rng_c              : num  10.8 11.6 11.5 10.5 11.5 ...
 $ station_max_temp_c                   : num  32.5 34 33 31.5 33.3 32 34 33 34 34 ...
 $ station_min_temp_c                   : num  20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
 $ station_precip_mm                    : num  3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
 $ total_cases                          : num  0 0 0 0 0 0 0 0 0 0 ...

# One table per city. `city` is the first column and is constant inside each
# subset, so it is dropped right after filtering.
dengue_iq <- dengue2 %>%
  filter(city == "iq") %>%
  select(-city)
# dengue_iq <- dengue_iq[-3]
dengue_sj <- dengue2 %>%
  filter(city == "sj") %>%
  select(-city)

# cor_iq <- round(digits = 2 , cor(dengue_iq, method = c('pearson',
# 'kendall', 'spearman'))) No correlations found

Model SJ

# Modelling frame for "sj": the three calendar fields plus the target.
ndengue_sj <- select(dengue_sj, year, weekofyear, week_start_date, total_cases)

# Random split on the target: 75% of the rows for training, the rest held out.
sj_training_index <- createDataPartition(
  y = ndengue_sj$total_cases,
  p = 0.75,
  list = FALSE
)
sj_trainSet <- ndengue_sj[sj_training_index, ]
sj_testSet <- ndengue_sj[-sj_training_index, ]

# We'll try tree-based models.
# Resampling scheme used for both candidates: 10-fold cross-validation,
# repeated 10 times.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# Candidate 1: gradient boosting, tuned on MAE. `verbose = FALSE` is not a
# train() option; it is an argument of gbm() that passes through.
sj_gbm_model <- train(
  total_cases ~ .,
  data = sj_trainSet,
  method = "gbm",
  trControl = fitControl,
  metric = "MAE",
  verbose = FALSE
)

# Candidate 2: random forest with the same formula, resampling and metric.
# `verbose = FALSE` is carried over from the gbm call.
# NOTE(review): confirm the random forest backend actually uses it.
sj_rf_model <- train(
  total_cases ~ .,
  data = sj_trainSet,
  method = "rf",
  trControl = fitControl,
  metric = "MAE",
  verbose = FALSE
)

note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .

# Models: print the resampling summary of the random forest fit.

sj_rf_model

Random Forest 

704 samples
  3 predictor

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times) 
Summary of sample sizes: 633, 633, 633, 634, 633, 634, ... 
Resampling results across tuning parameters:

  mtry  RMSE      Rsquared   MAE     
  2     14.69925  0.8933123  7.823636
  3     15.93092  0.8734293  8.422975

MAE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 2.

# Tuning-parameter combination selected for the boosted model.
sj_gbm_model$bestTune

  n.trees interaction.depth shrinkage n.minobsinnode
9     150                 3       0.1             10

# rf_model gives better results, so it is the one scored on the held-out rows.

sj_predictions <- predict(sj_rf_model, newdata = sj_testSet)
# Per-row absolute difference between prediction and observed case count.
abs_error <- abs(sj_predictions - sj_testSet[["total_cases"]])
print(abs_error)

           2            4            7           11           12 
  0.60536667   1.40536667   0.25726667   0.53183333   6.04376667 
          22           31           33           34           35 
 11.78633333   2.63833333   7.99356667  29.05596667  27.98630000 
          40           41           50           53           57 
  1.33523333   1.06786667   1.22973333   4.22040000   0.32973333 
          60           63           65           69           70 
  7.54326667  14.54550000   3.72100000   7.20933333  43.55733333 
          74           76           80           92          102 
  7.07240000   7.98086667  11.85700000   6.44883333   2.79326667 
         105          106          109          116          117 
  4.08930000   1.50220000   3.62713333   8.62720000  11.37263333 
         118          126          128          144          151 
  2.00500000   1.81170000  16.94873333   0.16693333   6.36120000 
         156          159          164          165          167 
  7.39626667   0.49953333   8.22746667   3.65356667  14.68580000 
         172          181          185          191          199 
  0.19553333  11.08946667   6.16160000   4.77193333   3.43493333 
         206          209          214          215          218 
  4.19116667   3.30716667   1.91086667  25.54643333  21.17106667 
         219          226          227          231          238 
 38.84203333  74.19703333  96.68523333  59.21083333  71.29823333 
         241          248          249          253          255 
  1.88280000   5.00926667  10.28136667   0.20226667   4.31993333 
         257          260          261          262          263 
  2.24886667   2.75240000   0.17466667   4.02240000   3.06150000 
         269          278          281          283          287 
 21.43520000   3.51320000  12.29910000   7.29880000   3.44216667 
         289          295          299          305          307 
  7.18146667   1.62430000   7.74153333   3.24176667   8.42760000 
         308          312          313          314          316 
  0.33966667   0.11653333   1.56063333   0.76933333   1.25433333 
         325          333          335          336          338 
  4.76886667   7.13136667   0.66590000   1.13306667   0.06956667 
         340          343          346          347          352 
  3.70780000   2.19630000   1.52453333   5.65936667   2.82920000 
         358          360          362          363          364 
  0.38730000   1.93146667   0.12036667   6.54600000   5.01570000 
         365          379          386          388          394 
  0.74963333   7.75193333  48.14550000  14.32256667   0.51706667 
         402          404          415          417          419 
  8.40123333   6.49553333   1.89220000   0.74286667  20.32950000 
         421          423          424          427          428 
 35.89686667  70.80640000 110.69096667  18.52476667  26.89603333 
         429          430          437          439          440 
 81.13006667  19.64186667   7.26120000  18.45156667   5.87920000 
         441          445          450          453          455 
  8.06096667   0.97703333   6.80856667   3.34896667   4.35503333 
         457          458          459          468          469 
  8.98723333   1.41703333   5.72526667   3.14796667   7.47660000 
         470          478          479          480          483 
  2.29556667  29.44936667  41.98690000  35.77770000   9.53136667 
         485          486          487          492          497 
 16.25150000   6.15876667  16.77796667   4.51270000   7.37310000 
         499          513          514          515          521 
  0.66563333   0.37730000   1.35730000   4.37256667   2.77523333 
         523          529          536          537          540 
  3.80756667  21.52903333   3.22023333   2.98833333   5.88230000 
         543          544          546          558          560 
  7.63163333  12.85850000   4.66853333   3.85750000  10.22626667 
 [ reached getOption("max.print") -- omitted 82 entries ]

# Mean absolute error over the held-out rows.
mean(abs_error)

[1] 8.960396

# Cross-check the hand-computed value with Metrics::mae(). Its signature is
# mae(actual, predicted), so the observed counts go first; arguments are named
# to keep the roles explicit.
mae(actual = sj_testSet$total_cases, predicted = sj_predictions)

[1] 8.960396

Model IQ

# Modelling frame for "iq": the three calendar fields plus the target.
ndengue_iq <- select(dengue_iq, year, weekofyear, week_start_date, total_cases)

# Random split on the target: 75% of the rows for training, the rest held out.
iq_training_index <- createDataPartition(
  y = ndengue_iq$total_cases,
  p = 0.75,
  list = FALSE
)

iq_trainSet <- ndengue_iq[iq_training_index, ]
iq_testSet <- ndengue_iq[-iq_training_index, ]

# We'll try tree-based models.
# Resampling scheme used for both candidates: 10-fold cross-validation,
# repeated 10 times.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# Candidate 1: gradient boosting, tuned on MAE. `verbose = FALSE` is not a
# train() option; it is an argument of gbm() that passes through.
iq_gbm_model <- train(
  total_cases ~ .,
  data = iq_trainSet,
  method = "gbm",
  trControl = fitControl,
  metric = "MAE",
  verbose = FALSE
)

# Candidate 2: random forest with the same formula, resampling and metric.
# `verbose = FALSE` is carried over from the gbm call.
# NOTE(review): confirm the random forest backend actually uses it.
iq_rf_model <- train(
  total_cases ~ .,
  data = iq_trainSet,
  method = "rf",
  trControl = fitControl,
  metric = "MAE",
  verbose = FALSE
)

note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .

# Models: print the resampling summary of the random forest fit.
iq_rf_model

Random Forest 

392 samples
  3 predictor

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times) 
Summary of sample sizes: 353, 352, 353, 352, 354, 353, ... 
Resampling results across tuning parameters:

  mtry  RMSE      Rsquared   MAE     
  2     6.799687  0.6436208  3.674138
  3     6.900326  0.6278193  3.799385

MAE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 2.

# Print the resampling summary of the boosted model for comparison.
iq_gbm_model

Stochastic Gradient Boosting 

392 samples
  3 predictor

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times) 
Summary of sample sizes: 352, 353, 355, 353, 353, 352, ... 
Resampling results across tuning parameters:

  interaction.depth  n.trees  RMSE      Rsquared   MAE     
  1                   50      9.554108  0.1791399  5.896704
  1                  100      9.374815  0.2124445  5.784036
  1                  150      9.240234  0.2393229  5.694098
  2                   50      8.687365  0.3447116  5.236434
  2                  100      8.138817  0.4352410  4.924732
  2                  150      7.817397  0.4851849  4.733001
  3                   50      8.131750  0.4411319  4.832819
  3                  100      7.609319  0.5178272  4.561258
  3                  150      7.262895  0.5657509  4.347620

Tuning parameter 'shrinkage' was held constant at a value of 0.1

Tuning parameter 'n.minobsinnode' was held constant at a value of 10
MAE was used to select the optimal model using the smallest value.
The final values used for the model were n.trees = 150,
 interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.

# rf_model gives better results, so it is the one scored on the held-out rows.

iq_predictions <- predict(iq_rf_model, newdata = iq_testSet)
# Per-row absolute difference between prediction and observed case count.
iq_abs_error <- abs(iq_predictions - iq_testSet[["total_cases"]])
print(iq_abs_error)

           2            4            5            7           12 
 0.026133333  0.024133333  0.024133333  0.024133333  0.415133333 
          19           26           41           42           47 
 0.108833333  1.485633333  0.464133333  0.450400000  0.017800000 
          48           51           53           56           59 
 0.007466667  0.002800000  0.000800000  0.049000000  0.009600000 
          60           63           65           66           81 
 0.037866667  0.140333333  0.085000000  0.181633333  2.237600000 
          86           87           88           91           95 
 8.285700000  1.692066667  5.117333333  4.002500000  4.891333333 
         103          106          116          117          135 
 0.135433333  3.467000000  0.887200000  4.320466667  2.202333333 
         138          144          148          151          153 
 0.365000000  0.047100000  0.587233333  1.621300000  0.453266667 
         164          172          175          176          178 
 2.602200000  3.356433333  1.942666667  4.474866667  8.621033333 
         182          187          189          202          203 
 1.845400000  0.824166667  4.932266667  3.698166667  6.237666667 
         205          206          208          218          220 
 3.293600000  2.419100000  1.286733333  7.587300000  5.232933333 
         232          233          236          248          249 
 0.111233333  3.196666667  5.490833333  0.469900000  2.059966667 
         252          253          259          262          264 
 1.286533333  0.097866667  0.088833333  3.407133333  1.032566667 
         268          275          281          282          286 
 1.406933333  7.877600000  1.115700000  5.642966667  1.162433333 
         287          297          299          300          304 
 5.894400000  1.360300000  1.336766667  1.067333333  1.481766667 
         308          309          318          321          322 
 0.278333333 11.380600000  2.415900000  3.580366667  0.597566667 
         326          331          334          340          342 
 6.656100000 15.374833333  3.368800000  6.348933333  0.144266667 
         344          347          349          351          352 
 2.375266667  3.055433333  0.443466667  6.557300000  0.048733333 
         354          362          364          369          381 
 0.223000000  5.795400000  0.589166667  2.433833333  0.099233333 
         385          391          392          399          402 
 9.048233333  5.862866667 12.272800000  0.967133333 32.980666667 
         406          408          409          410          412 
 0.155733333  0.984233333  0.140266667  1.953933333  0.954900000 
         415          416          421          424          425 
 2.109400000  2.396833333  8.381900000  8.191366667 21.373366667 
         433          436          437          442          443 
 3.156666667  3.982266667  2.121133333  3.180800000 11.052366667 
         444          446          457          461          463 
 2.573500000  0.712733333  0.199633333  1.104666667  0.701700000 
         469          478          482          487          491 
 0.350333333  3.502466667  3.473033333 12.205733333  8.860766667 
         494          499          501          503          513 
 6.050633333  5.478100000  1.511033333  1.929166667  5.540433333 
         516          517          520 
 1.800033333  3.416166667  3.167200000

# Mean absolute error over the held-out rows.
mean(iq_abs_error)

[1] 3.22985

# Cross-check the hand-computed value with Metrics::mae(). Its signature is
# mae(actual, predicted), so the observed counts go first; arguments are named
# to keep the roles explicit.
mae(actual = iq_testSet$total_cases, predicted = iq_predictions)

[1] 3.22985

Submission

# Change to the parent directory; the relative paths below resolve from there.
setwd("..")

# Score the test features per city with the random forest fitted for that city.
dengue_test <- read_csv("./Datasets/dengue_features_test.csv")
dengue_test_iq <- filter(dengue_test, city == "iq")
dengue_test_sj <- filter(dengue_test, city == "sj")
final_iq_predictions <- predict(iq_rf_model, dengue_test_iq)
final_sj_predictions <- predict(sj_rf_model, dengue_test_sj)
dengue_test_iq$prediction <- final_iq_predictions
dengue_test_sj$prediction <- final_sj_predictions

# Stack "sj" before "iq" and round the predictions to whole case counts.
result_dengue_test <- rbind(dengue_test_sj, dengue_test_iq)
result_dengue_test$prediction <- round(result_dengue_test$prediction)

# The predictions are copied into the submission template by row position.
submission_format <- read_csv("./Datasets/submission_format.csv")
submission_format$total_cases <- result_dengue_test$prediction
# NOTE(review): results_dengue_test is not referenced again in the visible
# code; kept in case something downstream relies on it.
results_dengue_test <- as.data.frame(result_dengue_test$prediction)
# row.names = FALSE stops write.csv() from prepending an unnamed index column,
# so the file has the same columns as the template. Both options match the
# later export to prophet_try.csv.
write.csv(submission_format, file = "./Results/first_try.csv", quote = FALSE,
          row.names = FALSE)

Prophet try

library(prophet)

# --- iq -------------------------------------------------------------------
# History frame for prophet: `ds` holds the week start date, `y` the observed
# case count. Built directly rather than via the deprecated as_data_frame(),
# which also left a stray `value` column in the frame.
prophet_iq <- data.frame(
  ds = ndengue_iq$week_start_date,
  y = ndengue_iq$total_cases
)
iq_prophet_model <- prophet(prophet_iq, weekly.seasonality = FALSE,
                            daily.seasonality = FALSE)

# Dates to forecast: the week start dates of the iq test rows.
iq_future <- data.frame(ds = dengue_test_iq$week_start_date)
iq_prophet_predictions <- predict(iq_prophet_model, iq_future)

# --- sj -------------------------------------------------------------------
prophet_sj <- data.frame(
  ds = ndengue_sj$week_start_date,
  y = ndengue_sj$total_cases
)
sj_prophet_model <- prophet(prophet_sj, weekly.seasonality = FALSE,
                            daily.seasonality = FALSE)

sj_future <- data.frame(ds = dengue_test_sj$week_start_date)
sj_prophet_predictions <- predict(sj_prophet_model, sj_future)
# NOTE(review): sj_pro_preds is not referenced again in the visible code.
sj_pro_preds <- sj_prophet_predictions$yhat

# Replace the earlier predictions with the prophet forecasts, stacking "sj"
# before "iq" as in the first submission.
dengue_test_iq$prediction <- iq_prophet_predictions$yhat
dengue_test_sj$prediction <- sj_prophet_predictions$yhat
result_dengue_test <- rbind(dengue_test_sj, dengue_test_iq)
# Round to whole cases and clamp at zero: the forecast is not constrained to
# be non-negative, and a negative case count is meaningless.
result_dengue_test$prediction <- pmax(round(result_dengue_test$prediction), 0)


# Change to the parent directory; the relative paths below resolve from there.
setwd("..")

# The predictions are copied into the submission template by row position.
submission_format <- read_csv("./Datasets/submission_format.csv")
submission_format$total_cases <- result_dengue_test$prediction
# NOTE(review): results_dengue_test is not referenced again in the visible code.
results_dengue_test <- as.data.frame(result_dengue_test$prediction)

write.csv(submission_format, file = "./Results/prophet_try.csv", quote = FALSE,
          row.names = FALSE)