DengAI-Predicting-Disease-Spread
DengAI-Predicting-Disease-Spread
loading packages and data
pacman:: p_load(readr, h2o, dplyr,ggplot2, caret, imputTS, data.table, lubridate, corrplot, quantreg, Ecfun, randomForest, forecast)
features_tr <- read_csv("./input/dengue_features_train.csv",
col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))
labels_tr <- read_csv("./input/dengue_labels_train.csv")
features_te <- read_csv("./input/dengue_features_test.csv",
col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))
submission_format <- read_csv("./input/submission_format.csv")
options(scipen=999)create two dataset, train and test
train <- left_join(x = features_tr, y = labels_tr, by = c("year", "weekofyear", "city"))
test <- left_join(x = features_te, y = submission_format, by = c("year", "weekofyear", "city"))
train$type <- "train"
test$type <- "test"
all <- rbind(train, test)
summary(all[ , c("station_avg_temp_c", "reanalysis_avg_temp_k", "reanalysis_air_temp_k")])## station_avg_temp_c reanalysis_avg_temp_k reanalysis_air_temp_k
## Min. :21.40 Min. :294.9 Min. :294.6
## 1st Qu.:26.33 1st Qu.:298.3 1st Qu.:297.7
## Median :27.43 Median :299.3 Median :298.6
## Mean :27.23 Mean :299.3 Mean :298.7
## 3rd Qu.:28.20 3rd Qu.:300.3 3rd Qu.:299.9
## Max. :30.80 Max. :303.3 Max. :302.2
## NA's :55 NA's :12 NA's :12
## station_precip_mm precipitation_amt_mm
## Min. : 0.0 Min. : 0.00
## 1st Qu.: 8.8 1st Qu.: 9.43
## Median : 23.8 Median : 36.64
## Mean : 38.2 Mean : 44.11
## 3rd Qu.: 51.5 3rd Qu.: 67.50
## Max. :543.3 Max. :390.60
## NA's :27 NA's :15
Inital investigation
Checking distributions
the plots below confirm that most of the predictors haves similar distributions across train and test
treat NAs
Split data by city
sj <- all %>% filter(city == "sj" & type == "train")
# ggplot(data = sj, aes(week_start_date, station_precip_mm)) + geom_line()
#
# ggplot(data = sj, aes(x = week_start_date, imputeTS::na.kalman(sj$station_precip_mm))) + geom_line()
tail(sj)## # A tibble: 6 x 23
## # Groups: city, year, week_start_date [6]
## city year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
## <chr> <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl>
## 1 sj 2008 12 2008-03-18 0.0449 0.0244 0.102 0.088
## 2 sj 2008 13 2008-03-25 0.0778 -0.0399 0.310 0.296
## 3 sj 2008 14 2008-04-01 -0.038 -0.0168 0.119 0.0664
## 4 sj 2008 15 2008-04-08 -0.155 -0.0528 0.138 0.141
## 5 sj 2008 16 2008-04-15 0.0018 NA 0.204 0.210
## 6 sj 2008 17 2008-04-22 -0.037 -0.0104 0.0773 0.0906
## # ... with 15 more variables: reanalysis_dew_point_temp_k <dbl>,
## # reanalysis_max_air_temp_k <dbl>, reanalysis_min_air_temp_k <dbl>,
## # reanalysis_precip_amt_kg_per_m2 <dbl>,
## # reanalysis_relative_humidity_percent <dbl>,
## # reanalysis_sat_precip_amt_mm <dbl>,
## # reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## # station_diur_temp_rng_c <dbl>, station_max_temp_c <dbl>,
## # station_min_temp_c <dbl>, total_cases <dbl>, type <chr>,
## # avg_temp <dbl>, avg_precip <dbl>
The missing values will be replaced by a forecast
Iquitos
iq <- all %>% filter(city == "iq" & type == "train")
# ggplot(data = iq, aes(week_start_date, station_diur_temp_rng_c)) + geom_line()
#
# ggplot(data = iq, aes(x = week_start_date, imputeTS::na.kalman(iq$station_diur_temp_rng_c))) + geom_line()
iq <- cbind(iq[ , c("city", "type", "week_start_date")] , apply(X = iq[, which(sapply(iq,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))total cases across cities
## city type week_start_date year weekofyear ndvi_ne ndvi_nw
## 931 sj train 2008-03-18 2008 12 0.04490 0.02445000
## 932 sj train 2008-03-25 2008 13 0.07785 -0.03990000
## 933 sj train 2008-04-01 2008 14 -0.03800 -0.01683333
## 934 sj train 2008-04-08 2008 15 -0.15520 -0.05275000
## 935 sj train 2008-04-15 2008 16 0.00180 -0.04492786
## 936 sj train 2008-04-22 2008 17 -0.03700 -0.01036667
## ndvi_se ndvi_sw reanalysis_dew_point_temp_k
## 931 0.10162860 0.08800000 292.2057
## 932 0.31047140 0.29624290 292.0957
## 933 0.11937140 0.06638571 293.2357
## 934 0.13775710 0.14121430 292.7329
## 935 0.20390000 0.20984290 292.2743
## 936 0.07731429 0.09058571 294.2800
## reanalysis_max_air_temp_k reanalysis_min_air_temp_k
## 931 299.8 294.9
## 932 299.7 294.4
## 933 299.8 296.5
## 934 299.4 295.8
## 935 299.7 295.9
## 936 300.9 295.9
## reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent
## 931 0.90 72.91571
## 932 7.55 74.24714
## 933 3.67 74.60000
## 934 35.00 75.02714
## 935 4.82 72.28571
## 936 2.17 76.96000
## reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg
## 931 0.00 13.73714
## 932 27.19 13.64429
## 933 3.82 14.66286
## 934 16.96 14.18429
## 935 0.00 13.85857
## 936 0.00 15.67143
## reanalysis_tdtr_k station_diur_temp_rng_c station_max_temp_c
## 931 3.871429 7.042857 30.0
## 932 2.885714 5.785714 30.0
## 933 2.714286 6.814286 30.6
## 934 2.185714 5.714286 29.4
## 935 2.785714 6.242857 29.4
## 936 3.957143 7.514286 31.7
## station_min_temp_c total_cases avg_temp avg_precip
## 931 20.6 3 297.4357 0.5
## 932 21.1 4 296.9571 1.8
## 933 22.2 3 298.2286 0.5
## 934 21.7 1 297.5643 30.7
## 935 21.7 3 297.7786 11.2
## 936 23.3 5 298.6929 0.3
Correlation analysis (with lags)
melt data to plot variables against total_cases
m <- reshape::melt(data = sj, id.vars = c("city", "type", "week_start_date", "year", "weekofyear")) # create a melted dataframe
# ggplot(filter(m, variable %in% c("ndvi_ne", "total_cases")), aes(week_start_date, y = value))+ geom_line() + facet_grid(variable~., scales = "free")
#
# m %>% filter(variable %in% c("station_min_temp_c", "total_cases")) %>% ggplot(aes(week_start_date, y = value))+ geom_line() + facet_grid(variable~., scales = "free")Pairwise correlation
function for pairwise correlation
pairwiseCor <- function(dataframe){
pairs <- combn(names(dataframe), 2, simplify=FALSE)
df <- data.frame(Vairable1=rep(0,length(pairs)), Variable2=rep(0,length(pairs)),
AbsCor=rep(0,length(pairs)), Cor=rep(0,length(pairs)))
for(i in 1:length(pairs)){
df[i,1] <- pairs[[i]][1]
df[i,2] <- pairs[[i]][2]
df[i,3] <- round(abs(cor(dataframe[,pairs[[i]][1]], dataframe[,pairs[[i]][2]], method = "spearman")),4)
df[i,4] <- round(cor(dataframe[,pairs[[i]][1]], dataframe[,pairs[[i]][2]], method = "spearman"),4)
}
pairwiseCorDF <- df
pairwiseCorDF <- pairwiseCorDF[order(pairwiseCorDF$AbsCor, decreasing=TRUE),]
row.names(pairwiseCorDF) <- 1:length(pairs)
pairwiseCorDF <<- pairwiseCorDF
pairwiseCorDF
}corr_all <- cor(sj[, which(sapply(sj,is.numeric))])
pairwise <- pairwiseCor(sj[, which(sapply(sj,is.numeric))])
pairwise_total_cases <- filter(pairwise, Variable2 == "total_cases")
pairwise_total_cases## Vairable1 Variable2 AbsCor Cor
## 1 weekofyear total_cases 0.3587 0.3587
## 2 year total_cases 0.3332 -0.3332
## 3 reanalysis_specific_humidity_g_per_kg total_cases 0.2593 0.2593
## 4 reanalysis_dew_point_temp_k total_cases 0.2538 0.2538
## 5 reanalysis_min_air_temp_k total_cases 0.2311 0.2311
## 6 reanalysis_precip_amt_kg_per_m2 total_cases 0.2229 0.2229
## 7 reanalysis_relative_humidity_percent total_cases 0.1981 0.1981
## 8 reanalysis_max_air_temp_k total_cases 0.1976 0.1976
## 9 station_min_temp_c total_cases 0.1871 0.1871
## 10 reanalysis_tdtr_k total_cases 0.1643 -0.1643
## 11 station_max_temp_c total_cases 0.1503 0.1503
## 12 ndvi_nw total_cases 0.1335 0.1335
## 13 reanalysis_sat_precip_amt_mm total_cases 0.1295 0.1295
## 14 ndvi_ne total_cases 0.0835 0.0835
## 15 station_diur_temp_rng_c total_cases 0.0269 0.0269
## 16 ndvi_sw total_cases 0.0082 -0.0082
## 17 ndvi_se total_cases 0.0005 0.0005
## [1] "reanalysis_specific_humidity_g_per_kg"
## [2] "avg_temp"
## Vairable1
## 1 reanalysis_dew_point_temp_k
## 2 reanalysis_min_air_temp_k
## 3 reanalysis_max_air_temp_k
## 4 reanalysis_dew_point_temp_k
## 5 reanalysis_dew_point_temp_k
## 6 reanalysis_specific_humidity_g_per_kg
## Variable2 AbsCor Cor
## 1 reanalysis_specific_humidity_g_per_kg 0.9996 0.9996
## 2 avg_temp 0.9459 0.9459
## 3 avg_temp 0.9433 0.9433
## 4 avg_temp 0.8910 0.8910
## 5 reanalysis_min_air_temp_k 0.8905 0.8905
## 6 avg_temp 0.8890 0.8890
Compare correlation by lags
compare_cor <- c()
# for(i in 1:20) {
#
# a <- sj %>% select(station_avg_temp_c,total_cases)
#
# b <- stats::cor(cbind(a$total_cases, lag(a$station_avg_temp_c,i)), method = "spearman" , use = "complete.obs")
#
# compare_cor <- rbind(compare_cor, b[1,])
#
# }
# which(abs(compare_cor[, 2]) == max(abs(compare_cor[,2])))Modelling San Juan
train model to predict Test (All train)
# Get the best mtry train a random forest using that mtry
sj$year <- as.character(sj$year)
preprocessparams <- preProcess(x = select(sj[ , which(sapply(sj,is.numeric))], -total_cases) ,method = c("center", "scale", "pca"))
sj <- predict(preprocessparams, sj)
sj$type <- NULL
#sj$week_start_date <- NULL
opt_lambda <- BoxCox.lambda(sj$total_cases)
sj$total_cases <- BoxCox(sj$total_cases, lambda = opt_lambda)
rf_reg <- tuneRF(x = sj[ , -which(names(sj) == "total_cases")], y = sj$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=TRUE, doBest = T) ## mtry = 4 OOB error = 3.544785
## Searching left ...
## mtry = 2 OOB error = 5.414495
## -0.5274537 0.05
## Searching right ...
## mtry = 8 OOB error = 2.649467
## 0.2525734 0.05
## mtry = 13 OOB error = 2.616229
## 0.01254497 0.05
## RMSE Rsquared MAE
## 0.6511216 0.9760122 0.4836689
Make final prediction for San Juan (test)
The missing values will be replaced by a forecast
Apply parameters and predict
sj_te <- predict(preprocessparams, sj_te)
sj_te$total_cases <- predict(rf_reg , sj_te)
#sj_te$total_cases <- sj_te$total_cases^2
ggplot(sj_te, aes(week_start_date, total_cases)) + geom_line()sj$year <- as.numeric(sj$year)
sj$weekofyear <- as.numeric(sj$weekofyear)
sj$total_cases <- InvBoxCox(sj$total_cases, opt_lambda)
sj_te$total_cases <- InvBoxCox(sj_te$total_cases, lambda = opt_lambda)
ggplot(data = union_all(sj, sj_te), aes(x = week_start_date, y = total_cases, color = type)) + geom_line() Modelling Iquito
Iquito
train model to predict Test (All train)
## mtry = 4 OOB error = 3.276995
## Searching left ...
## mtry = 2 OOB error = 3.663842
## -0.1180494 0.05
## Searching right ...
## mtry = 8 OOB error = 2.880987
## 0.1208449 0.05
## mtry = 13 OOB error = 3.059203
## -0.06185953 0.05
Make final prediction for Iquito (test)
The missing values will be replaced by a forecast
#iq_te$type <- NULL
iq_te$city <- NULL
#iq_te$year <- NULL
#iq_te$week_start_date <- NULL
iq_te$weekofyear <- as.character(iq_te$weekofyear)
iq_te <- predict(preprocessparams, iq_te)iq_te$total_cases <- predict(rf_reg, iq_te)
iq$year <- as.numeric(iq$year)
iq$weekofyear <- as.numeric(iq$weekofyear)
iq_te$weekofyear <- as.numeric(iq_te$weekofyear)
iq$type <- "train"
iq$total_cases <- InvBoxCox(iq$total_cases, opt_lambda)
iq_te$total_cases <- InvBoxCox(iq_te$total_cases, lambda = opt_lambda)
ggplot(data = union_all(iq, iq_te), aes(x = week_start_date, y = total_cases, color = type)) + geom_line() Final Submission using RF
iq_te$city <- "iq"
iq_te$type <- "test"
rf_final_df <- rbind(iq_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")],
sj_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])
submission_format$total_cases <- 10
#submission_format$year <- as.character(submission_format$year)
#submission_format$weekofyear <- as.character(submission_format$weekofyear)
submission_rf <- full_join(x = submission_format, y = rf_final_df, by = c("year","weekofyear", "city"))
data.table::setnames(submission_rf, old = "total_cases.y", "total_cases")
submission_rf$total_cases <- base::round(submission_rf$total_cases,0) # Round
ggplot(submission_rf, aes(week_start_date, total_cases)) + geom_line() + facet_wrap(city~.)submission_rf <- submission_rf %>% select(city, year, weekofyear, total_cases)
write.csv(submission_rf, "submission_rf.csv", quote = F, row.names = F)
summary(submission_format)## city year weekofyear total_cases
## Length:416 Min. :2008 Min. : 1.00 Min. :10
## Class :character 1st Qu.:2010 1st Qu.:13.75 1st Qu.:10
## Mode :character Median :2011 Median :26.00 Median :10
## Mean :2011 Mean :26.44 Mean :10
## 3rd Qu.:2012 3rd Qu.:39.00 3rd Qu.:10
## Max. :2013 Max. :53.00 Max. :10
## city year weekofyear total_cases
## Length:416 Min. :2008 Min. : 1.00 Min. : 1.00
## Class :character 1st Qu.:2010 1st Qu.:13.75 1st Qu.: 3.00
## Mode :character Median :2011 Median :26.00 Median : 8.00
## Mean :2011 Mean :26.44 Mean :18.26
## 3rd Qu.:2012 3rd Qu.:39.00 3rd Qu.:24.00
## Max. :2013 Max. :53.00 Max. :84.00
Prophet
Prophet San Juan
## Loading required package: Rcpp
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following object is masked from 'package:data.table':
##
## :=
sj_prohet_model <- prophet(sj_prophet, daily.seasonality = F, weekly.seasonality = F, yearly.seasonality = T, seasonality.mode = "multiplicative")
future <- make_future_dataframe(sj_prohet_model, 500, freq = "week")
forecast <- predict(sj_prohet_model, future)
forecast$year <- year(forecast$ds)
forecast$weekofyear <- week(forecast$ds)
dyplot.prophet(x = sj_prohet_model, fcst = forecast)Prophet Iquito
iq_prophet <- setnames(iq, c("week_start_date", "total_cases"),new = c("ds","y"))
iq_prohet_model <- prophet(iq_prophet)## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future <- make_future_dataframe(iq_prohet_model, 500, freq = "week")
forecast <- predict(iq_prohet_model, future)
forecast$year <- year(forecast$ds)
forecast$weekofyear <- week(forecast$ds)
dyplot.prophet(x = iq_prohet_model, fcst = forecast)Match Prophet and submission
forecast$ds <- as.Date(forecast$ds)
forecast$city <- "iq"
forecast <- rbind(forecast_sj, forecast)
submission <- left_join(submission_format, y = forecast[ , c("ds","weekofyear","year", "yhat", "city")], by = c("year","weekofyear", "city"))
submission$yhat <- imputeTS::na.kalman(submission$yhat) # fill NAs due to 53 week yearChange negative values to 0
Final final submission with the mean of both previous
submission_mean <- submission
a <- cbind(submission$total_cases, submission_rf$total_cases)
submission_mean$total_cases <- round(apply(X = a, MARGIN = 1, function(x) mean(x)),0)
write.csv(submission_mean, "submission_mean.csv", quote = F, row.names = F)–> –>