Predicting cases of dengue
### Import and preprocessing
# We first import the packages that we need to do the task:
library(readr)
library(Metrics)
library(ggplot2)
library(markdown)
library(corrplot)
library(dplyr)
library(caret)
# We import our data and rename the columns:
# Fix the RNG seed so the random partitions and resampling folds drawn later
# are reproducible.
set.seed(203)

# The CSV files are read from the Datasets folder of the parent directory.
# The paths are built relative to that directory instead of calling setwd(),
# so the working directory is not changed as a side effect.
dengue <- read_csv(
  file.path("..", "Datasets", "dengue_features_train.csv")
)
dengue_label <- read_csv(
  file.path("..", "Datasets", "dengue_labels_train.csv")
)

# Attach the labels to the features; merge() joins on the columns that the
# two tables have in common.
total_dengue <- merge(dengue, dengue_label)

# Check whether the merged table contains any missing value.
any(is.na(total_dengue))
# [1] TRUE
'data.frame': 1456 obs. of 25 variables:
$ city : chr "iq" "iq" "iq" "iq" ...
$ year : num 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
$ weekofyear : num 26 27 28 29 30 31 32 33 34 35 ...
$ week_start_date : Date, format: "2000-07-01" "2000-07-08" ...
$ ndvi_ne : num 0.193 0.217 0.177 0.228 0.329 ...
$ ndvi_nw : num 0.132 0.276 0.173 0.145 0.322 ...
$ ndvi_se : num 0.341 0.289 0.204 0.254 0.254 ...
$ ndvi_sw : num 0.247 0.242 0.128 0.2 0.361 ...
$ precipitation_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
$ reanalysis_air_temp_k : num 297 297 296 295 296 ...
$ reanalysis_avg_temp_k : num 298 298 297 296 298 ...
$ reanalysis_dew_point_temp_k : num 295 295 296 293 294 ...
$ reanalysis_max_air_temp_k : num 307 307 304 304 307 ...
$ reanalysis_min_air_temp_k : num 293 291 293 289 292 ...
$ reanalysis_precip_amt_kg_per_m2 : num 43.2 46 64.8 24 31.8 ...
$ reanalysis_relative_humidity_percent : num 92.4 93.6 95.8 87.2 88.2 ...
$ reanalysis_sat_precip_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
$ reanalysis_specific_humidity_g_per_kg: num 16.7 16.9 17.1 14.4 15.4 ...
$ reanalysis_tdtr_k : num 8.93 10.31 7.39 9.11 9.5 ...
$ station_avg_temp_c : num 26.4 26.9 26.8 25.8 26.6 ...
$ station_diur_temp_rng_c : num 10.8 11.6 11.5 10.5 11.5 ...
$ station_max_temp_c : num 32.5 34 33 31.5 33.3 32 34 33 34 34 ...
$ station_min_temp_c : num 20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
$ station_precip_mm : num 3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
$ total_cases : num 0 0 0 0 0 0 0 0 0 0 ...
# Split the data by city and drop the first column of each subset.
# NOTE(review): dengue2 is defined outside this excerpt; the dropped first
# column is presumably `city` (it is the first column in the structure
# printed above) -- confirm.
dengue_iq <- dengue2 %>%
  filter(city == "iq") %>%
  select(-1)
# dengue_iq <- dengue_iq[-3]
dengue_sj <- dengue2 %>%
  filter(city == "sj") %>%
  select(-1)
# cor_iq <- round(digits = 2 , cor(dengue_iq, method = c('pearson',
# 'kendall', 'spearman')))
No correlations found
Model SJ
# Modelling frame for the "sj" city: the three calendar columns are the only
# predictors, total_cases is the target.
ndengue_sj <- data.frame(
  year = dengue_sj$year,
  weekofyear = dengue_sj$weekofyear,
  week_start_date = dengue_sj$week_start_date,
  total_cases = dengue_sj$total_cases
)

# createDataPartition() returns the row indices of the 75% training share;
# the remaining rows form the hold-out test set.
sj_training_index <- createDataPartition(
  y = ndengue_sj$total_cases,
  p = 0.75,
  list = FALSE
)
sj_trainSet <- ndengue_sj[sj_training_index, ]
sj_testSet <- ndengue_sj[-sj_training_index, ]
# We'll try a tree model
# Resampling scheme used for both models: 10-fold cross-validation,
# repeated ten times.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
)

# Gradient boosting on the sj training set; candidate tunings are compared
# on MAE. verbose = FALSE is not a train() option: it is passed through to
# gbm().
sj_gbm_model <- train(
  total_cases ~ .,
  data = sj_trainSet,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE,
  metric = "MAE"
)

# Random forest with the same resampling scheme and selection metric.
# The gbm-only `verbose` argument is not passed here.
sj_rf_model <- train(
  total_cases ~ .,
  data = sj_trainSet,
  method = "rf",
  trControl = fitControl,
  metric = "MAE"
)
# note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
Random Forest
704 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 633, 633, 633, 634, 633, 634, ...
Resampling results across tuning parameters:
mtry RMSE Rsquared MAE
2 14.69925 0.8933123 7.823636
3 15.93092 0.8734293 8.422975
MAE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 2.
n.trees interaction.depth shrinkage n.minobsinnode
9 150 3 0.1 10
# Score the random-forest model on the held-out sj rows: one absolute error
# per test observation.
sj_predictions <- predict(sj_rf_model, newdata = sj_testSet)
abs_error <- with(sj_testSet, abs(sj_predictions - total_cases))
abs_error
2 4 7 11 12
0.60536667 1.40536667 0.25726667 0.53183333 6.04376667
22 31 33 34 35
11.78633333 2.63833333 7.99356667 29.05596667 27.98630000
40 41 50 53 57
1.33523333 1.06786667 1.22973333 4.22040000 0.32973333
60 63 65 69 70
7.54326667 14.54550000 3.72100000 7.20933333 43.55733333
74 76 80 92 102
7.07240000 7.98086667 11.85700000 6.44883333 2.79326667
105 106 109 116 117
4.08930000 1.50220000 3.62713333 8.62720000 11.37263333
118 126 128 144 151
2.00500000 1.81170000 16.94873333 0.16693333 6.36120000
156 159 164 165 167
7.39626667 0.49953333 8.22746667 3.65356667 14.68580000
172 181 185 191 199
0.19553333 11.08946667 6.16160000 4.77193333 3.43493333
206 209 214 215 218
4.19116667 3.30716667 1.91086667 25.54643333 21.17106667
219 226 227 231 238
38.84203333 74.19703333 96.68523333 59.21083333 71.29823333
241 248 249 253 255
1.88280000 5.00926667 10.28136667 0.20226667 4.31993333
257 260 261 262 263
2.24886667 2.75240000 0.17466667 4.02240000 3.06150000
269 278 281 283 287
21.43520000 3.51320000 12.29910000 7.29880000 3.44216667
289 295 299 305 307
7.18146667 1.62430000 7.74153333 3.24176667 8.42760000
308 312 313 314 316
0.33966667 0.11653333 1.56063333 0.76933333 1.25433333
325 333 335 336 338
4.76886667 7.13136667 0.66590000 1.13306667 0.06956667
340 343 346 347 352
3.70780000 2.19630000 1.52453333 5.65936667 2.82920000
358 360 362 363 364
0.38730000 1.93146667 0.12036667 6.54600000 5.01570000
365 379 386 388 394
0.74963333 7.75193333 48.14550000 14.32256667 0.51706667
402 404 415 417 419
8.40123333 6.49553333 1.89220000 0.74286667 20.32950000
421 423 424 427 428
35.89686667 70.80640000 110.69096667 18.52476667 26.89603333
429 430 437 439 440
81.13006667 19.64186667 7.26120000 18.45156667 5.87920000
441 445 450 453 455
8.06096667 0.97703333 6.80856667 3.34896667 4.35503333
457 458 459 468 469
8.98723333 1.41703333 5.72526667 3.14796667 7.47660000
470 478 479 480 483
2.29556667 29.44936667 41.98690000 35.77770000 9.53136667
485 486 487 492 497
16.25150000 6.15876667 16.77796667 4.51270000 7.37310000
499 513 514 515 521
0.66563333 0.37730000 1.35730000 4.37256667 2.77523333
523 529 536 537 540
3.80756667 21.52903333 3.22023333 2.98833333 5.88230000
543 544 546 558 560
7.63163333 12.85850000 4.66853333 3.85750000 10.22626667
[ reached getOption("max.print") -- omitted 82 entries ]
[1] 8.960396
[1] 8.960396
Model IQ
# Modelling frame for the "iq" city: the three calendar columns are the only
# predictors, total_cases is the target.
ndengue_iq <- data.frame(
  year = dengue_iq$year,
  weekofyear = dengue_iq$weekofyear,
  week_start_date = dengue_iq$week_start_date,
  total_cases = dengue_iq$total_cases
)

# createDataPartition() returns the row indices of the 75% training share;
# the remaining rows form the hold-out test set.
iq_training_index <- createDataPartition(
  y = ndengue_iq$total_cases,
  p = 0.75,
  list = FALSE
)
iq_trainSet <- ndengue_iq[iq_training_index, ]
iq_testSet <- ndengue_iq[-iq_training_index, ]
# We'll try a tree model
# Resampling scheme used for both models: 10-fold cross-validation,
# repeated ten times.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
)

# Gradient boosting on the iq training set; candidate tunings are compared
# on MAE. verbose = FALSE is not a train() option: it is passed through to
# gbm().
iq_gbm_model <- train(
  total_cases ~ .,
  data = iq_trainSet,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE,
  metric = "MAE"
)

# Random forest with the same resampling scheme and selection metric.
# The gbm-only `verbose` argument is not passed here.
iq_rf_model <- train(
  total_cases ~ .,
  data = iq_trainSet,
  method = "rf",
  trControl = fitControl,
  metric = "MAE"
)
# note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
Random Forest
392 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 353, 352, 353, 352, 354, 353, ...
Resampling results across tuning parameters:
mtry RMSE Rsquared MAE
2 6.799687 0.6436208 3.674138
3 6.900326 0.6278193 3.799385
MAE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 2.
Stochastic Gradient Boosting
392 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 352, 353, 355, 353, 353, 352, ...
Resampling results across tuning parameters:
interaction.depth n.trees RMSE Rsquared MAE
1 50 9.554108 0.1791399 5.896704
1 100 9.374815 0.2124445 5.784036
1 150 9.240234 0.2393229 5.694098
2 50 8.687365 0.3447116 5.236434
2 100 8.138817 0.4352410 4.924732
2 150 7.817397 0.4851849 4.733001
3 50 8.131750 0.4411319 4.832819
3 100 7.609319 0.5178272 4.561258
3 150 7.262895 0.5657509 4.347620
Tuning parameter 'shrinkage' was held constant at a value of 0.1
Tuning parameter 'n.minobsinnode' was held constant at a value of 10
MAE was used to select the optimal model using the smallest value.
The final values used for the model were n.trees = 150,
interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Score the random-forest model on the held-out iq rows: one absolute error
# per test observation.
iq_predictions <- predict(iq_rf_model, newdata = iq_testSet)
iq_abs_error <- with(iq_testSet, abs(iq_predictions - total_cases))
iq_abs_error
2 4 5 7 12
0.026133333 0.024133333 0.024133333 0.024133333 0.415133333
19 26 41 42 47
0.108833333 1.485633333 0.464133333 0.450400000 0.017800000
48 51 53 56 59
0.007466667 0.002800000 0.000800000 0.049000000 0.009600000
60 63 65 66 81
0.037866667 0.140333333 0.085000000 0.181633333 2.237600000
86 87 88 91 95
8.285700000 1.692066667 5.117333333 4.002500000 4.891333333
103 106 116 117 135
0.135433333 3.467000000 0.887200000 4.320466667 2.202333333
138 144 148 151 153
0.365000000 0.047100000 0.587233333 1.621300000 0.453266667
164 172 175 176 178
2.602200000 3.356433333 1.942666667 4.474866667 8.621033333
182 187 189 202 203
1.845400000 0.824166667 4.932266667 3.698166667 6.237666667
205 206 208 218 220
3.293600000 2.419100000 1.286733333 7.587300000 5.232933333
232 233 236 248 249
0.111233333 3.196666667 5.490833333 0.469900000 2.059966667
252 253 259 262 264
1.286533333 0.097866667 0.088833333 3.407133333 1.032566667
268 275 281 282 286
1.406933333 7.877600000 1.115700000 5.642966667 1.162433333
287 297 299 300 304
5.894400000 1.360300000 1.336766667 1.067333333 1.481766667
308 309 318 321 322
0.278333333 11.380600000 2.415900000 3.580366667 0.597566667
326 331 334 340 342
6.656100000 15.374833333 3.368800000 6.348933333 0.144266667
344 347 349 351 352
2.375266667 3.055433333 0.443466667 6.557300000 0.048733333
354 362 364 369 381
0.223000000 5.795400000 0.589166667 2.433833333 0.099233333
385 391 392 399 402
9.048233333 5.862866667 12.272800000 0.967133333 32.980666667
406 408 409 410 412
0.155733333 0.984233333 0.140266667 1.953933333 0.954900000
415 416 421 424 425
2.109400000 2.396833333 8.381900000 8.191366667 21.373366667
433 436 437 442 443
3.156666667 3.982266667 2.121133333 3.180800000 11.052366667
444 446 457 461 463
2.573500000 0.712733333 0.199633333 1.104666667 0.701700000
469 478 482 487 491
0.350333333 3.502466667 3.473033333 12.205733333 8.860766667
494 499 501 503 513
6.050633333 5.478100000 1.511033333 1.929166667 5.540433333
516 517 520
1.800033333 3.416166667 3.167200000
[1] 3.22985
[1] 3.22985
Submission
# Build the first submission from the two random-forest models.
# Files are addressed relative to the parent directory instead of calling
# setwd(), so the working directory is not changed as a side effect.
dengue_test <- read_csv(
  file.path("..", "Datasets", "dengue_features_test.csv")
)

# Each city is predicted with its own model.
dengue_test_iq <- filter(dengue_test, city == "iq")
dengue_test_sj <- filter(dengue_test, city == "sj")
final_iq_predictions <- predict(iq_rf_model, dengue_test_iq)
final_sj_predictions <- predict(sj_rf_model, dengue_test_sj)
dengue_test_iq$prediction <- final_iq_predictions
dengue_test_sj$prediction <- final_sj_predictions

# Stack sj before iq and round the predictions to whole case counts.
# NOTE(review): the predictions are copied into submission_format by
# position, which assumes that file lists sj rows first, then iq -- confirm.
result_dengue_test <- rbind(dengue_test_sj, dengue_test_iq)
result_dengue_test$prediction <- round(result_dengue_test$prediction)

submission_format <- read_csv(
  file.path("..", "Datasets", "submission_format.csv")
)
submission_format$total_cases <- result_dengue_test$prediction
results_dengue_test <- as.data.frame(result_dengue_test$prediction)

# row.names = FALSE keeps R's row index from being written as an extra
# leading column; quote = FALSE writes bare values. Both match the options
# used for the prophet submission.
write.csv(
  submission_format,
  file = file.path("..", "Results", "first_try.csv"),
  quote = FALSE,
  row.names = FALSE
)

# Prophet try
library(prophet)
# Prophet model for the iq city.
# prophet() reads the dates from a column named `ds` and the target from a
# column named `y`, so the training frame is built with exactly those two.
prophet_iq <- data.frame(
  ds = ndengue_iq$week_start_date,
  y = ndengue_iq$total_cases
)

# Weekly and daily seasonal components are switched off.
iq_prophet_model <- prophet(
  prophet_iq,
  weekly.seasonality = FALSE,
  daily.seasonality = FALSE
)

# Forecast for the dates of the iq test rows.
iq_future <- data.frame(ds = dengue_test_iq$week_start_date)
iq_prophet_predictions <- predict(iq_prophet_model, iq_future)
# Prophet model for the sj city.
# prophet() reads the dates from a column named `ds` and the target from a
# column named `y`, so the training frame is built with exactly those two.
prophet_sj <- data.frame(
  ds = ndengue_sj$week_start_date,
  y = ndengue_sj$total_cases
)

# Weekly and daily seasonal components are switched off.
sj_prophet_model <- prophet(
  prophet_sj,
  weekly.seasonality = FALSE,
  daily.seasonality = FALSE
)

# Forecast for the dates of the sj test rows.
sj_future <- data.frame(ds = dengue_test_sj$week_start_date)
sj_prophet_predictions <- predict(sj_prophet_model, sj_future)
sj_pro_preds <- sj_prophet_predictions$yhat
# Build the prophet submission: the point forecasts (yhat) replace the
# prediction column of the per-city test frames.
dengue_test_iq$prediction <- iq_prophet_predictions$yhat
dengue_test_sj$prediction <- sj_prophet_predictions$yhat

# Stack sj before iq and round to whole case counts. The forecast is not
# bounded below, so negative values are clamped to zero: a case count cannot
# be negative.
# NOTE(review): the predictions are copied into submission_format by
# position, which assumes that file lists sj rows first, then iq -- confirm.
result_dengue_test <- rbind(dengue_test_sj, dengue_test_iq)
result_dengue_test$prediction <- pmax(
  round(result_dengue_test$prediction),
  0
)

# Files are addressed relative to the parent directory instead of calling
# setwd(), so the working directory is not changed as a side effect.
submission_format <- read_csv(
  file.path("..", "Datasets", "submission_format.csv")
)
submission_format$total_cases <- result_dengue_test$prediction
results_dengue_test <- as.data.frame(result_dengue_test$prediction)

# Write bare values without R's row index column.
write.csv(
  submission_format,
  file = file.path("..", "Results", "prophet_try.csv"),
  quote = FALSE,
  row.names = FALSE
)