DengAI-Predicting-Disease-Spread
DengAI-Predicting-Disease-Spread
loading packages and data
pacman:: p_load(readr, h2o, dplyr,ggplot2, caret, imputTS, data.table, lubridate,
corrplot, quantreg, Ecfun, randomForest, forecast, PMmisc, kableExtra)
features_tr <- read_csv("../input/dengue_features_train.csv",
col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))
labels_tr <- read_csv("../input/dengue_labels_train.csv")
features_te <- read_csv("../input/dengue_features_test.csv",
col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))
submission_format <- read_csv("../input/submission_format.csv")
options(scipen=999)create two dataset, train and test
train <- left_join(x = features_tr, y = labels_tr, by = c("year", "weekofyear", "city"))
test <- left_join(x = features_te, y = submission_format, by = c("year", "weekofyear", "city"))
train$type <- "train"
test$type <- "test"
all <- rbind(train, test)
summary(all[ , c("station_avg_temp_c", "reanalysis_avg_temp_k", "reanalysis_air_temp_k")])## station_avg_temp_c reanalysis_avg_temp_k reanalysis_air_temp_k
## Min. :21.40 Min. :294.9 Min. :294.6
## 1st Qu.:26.33 1st Qu.:298.3 1st Qu.:297.7
## Median :27.43 Median :299.3 Median :298.6
## Mean :27.23 Mean :299.3 Mean :298.7
## 3rd Qu.:28.20 3rd Qu.:300.3 3rd Qu.:299.9
## Max. :30.80 Max. :303.3 Max. :302.2
## NA's :55 NA's :12 NA's :12
## station_precip_mm precipitation_amt_mm
## Min. : 0.0 Min. : 0.00
## 1st Qu.: 8.8 1st Qu.: 9.43
## Median : 23.8 Median : 36.64
## Mean : 38.2 Mean : 44.11
## 3rd Qu.: 51.5 3rd Qu.: 67.50
## Max. :543.3 Max. :390.60
## NA's :27 NA's :15
Checking distributions
the plots below confirm that most of the predictors haves similar distributions across train and test
for (i in names(all)) {
x <- ggplot(data = all,aes_string(i,color="type")) + geom_density()
print(x)
}Split data by city
Missing value analysis
The missing values will be replaced by a forecast
| x | |
|---|---|
| city | 0.0000000 |
| year | 0.0000000 |
| weekofyear | 0.0000000 |
| week_start_date | 0.0000000 |
| ndvi_ne | 0.2040598 |
| ndvi_nw | 0.0523504 |
| ndvi_se | 0.0202991 |
| ndvi_sw | 0.0202991 |
| reanalysis_dew_point_temp_k | 0.0064103 |
| reanalysis_max_air_temp_k | 0.0064103 |
| reanalysis_min_air_temp_k | 0.0064103 |
| reanalysis_precip_amt_kg_per_m2 | 0.0064103 |
| reanalysis_relative_humidity_percent | 0.0064103 |
| reanalysis_sat_precip_amt_mm | 0.0096154 |
| reanalysis_specific_humidity_g_per_kg | 0.0064103 |
| reanalysis_tdtr_k | 0.0064103 |
| station_diur_temp_rng_c | 0.0064103 |
| station_max_temp_c | 0.0064103 |
| station_min_temp_c | 0.0064103 |
| total_cases | 0.0000000 |
| type | 0.0000000 |
| avg_temp | 0.0064103 |
| avg_precip | 0.0064103 |
sj$ndvi_ne <- NULL # ndvi_ne has 20% of missing values so will not be considered
sj <- cbind(sj[ , c("city", "type", "week_start_date")] , apply(X = sj[, which(sapply(sj,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))Iquitos
iq <- all %>% filter(city == "iq" & type == "train")
apply(X = iq, 2, function(x) sum(is.na(x))/nrow(iq)) %>% knitr:: kable() %>% kable_styling() # Get percentage of missing value by column| x | |
|---|---|
| city | 0.0000000 |
| year | 0.0000000 |
| weekofyear | 0.0000000 |
| week_start_date | 0.0000000 |
| ndvi_ne | 0.0057692 |
| ndvi_nw | 0.0057692 |
| ndvi_se | 0.0057692 |
| ndvi_sw | 0.0057692 |
| reanalysis_dew_point_temp_k | 0.0076923 |
| reanalysis_max_air_temp_k | 0.0076923 |
| reanalysis_min_air_temp_k | 0.0076923 |
| reanalysis_precip_amt_kg_per_m2 | 0.0076923 |
| reanalysis_relative_humidity_percent | 0.0076923 |
| reanalysis_sat_precip_amt_mm | 0.0076923 |
| reanalysis_specific_humidity_g_per_kg | 0.0076923 |
| reanalysis_tdtr_k | 0.0076923 |
| station_diur_temp_rng_c | 0.0711538 |
| station_max_temp_c | 0.0269231 |
| station_min_temp_c | 0.0153846 |
| total_cases | 0.0000000 |
| type | 0.0000000 |
| avg_temp | 0.0076923 |
| avg_precip | 0.0307692 |
total cases across cities
Correlation analysis (with lags)
melt data to plot variables against total_cases
m <- reshape::melt(data = sj, id.vars = c("city", "type", "week_start_date", "year", "weekofyear")) # create a melted dataframe
# ggplot(filter(m, variable %in% c("ndvi_ne", "total_cases")), aes(week_start_date, y = value))+ geom_line() + facet_grid(variable~., scales = "free")
#
# m %>% filter(variable %in% c("station_min_temp_c", "total_cases")) %>% ggplot(aes(week_start_date, y = value))+ geom_line() + facet_grid(variable~., scales = "free")Pairwise correlation
function for pairwise correlation
pairwiseCor <- function(dataframe){
pairs <- combn(names(dataframe), 2, simplify=FALSE)
df <- data.frame(Vairable1=rep(0,length(pairs)), Variable2=rep(0,length(pairs)),
AbsCor=rep(0,length(pairs)), Cor=rep(0,length(pairs)))
for(i in 1:length(pairs)){
df[i,1] <- pairs[[i]][1]
df[i,2] <- pairs[[i]][2]
df[i,3] <- round(abs(cor(dataframe[,pairs[[i]][1]], dataframe[,pairs[[i]][2]], method = "spearman")),4)
df[i,4] <- round(cor(dataframe[,pairs[[i]][1]], dataframe[,pairs[[i]][2]], method = "spearman"),4)
}
pairwiseCorDF <- df
pairwiseCorDF <- pairwiseCorDF[order(pairwiseCorDF$AbsCor, decreasing=TRUE),]
row.names(pairwiseCorDF) <- 1:length(pairs)
pairwiseCorDF <<- pairwiseCorDF
pairwiseCorDF
}corr_all <- cor(sj[, which(sapply(sj,is.numeric))])
pairwise <- pairwiseCor(sj[, which(sapply(sj,is.numeric))])
pairwise_total_cases <- filter(pairwise, Variable2 == "total_cases")
pairwise_total_cases %>% knitr:: kable() %>% kable_styling()| Vairable1 | Variable2 | AbsCor | Cor |
|---|---|---|---|
| weekofyear | total_cases | 0.3587 | 0.3587 |
| year | total_cases | 0.3332 | -0.3332 |
| reanalysis_specific_humidity_g_per_kg | total_cases | 0.2593 | 0.2593 |
| reanalysis_dew_point_temp_k | total_cases | 0.2538 | 0.2538 |
| reanalysis_min_air_temp_k | total_cases | 0.2311 | 0.2311 |
| reanalysis_precip_amt_kg_per_m2 | total_cases | 0.2229 | 0.2229 |
| reanalysis_relative_humidity_percent | total_cases | 0.1981 | 0.1981 |
| reanalysis_max_air_temp_k | total_cases | 0.1976 | 0.1976 |
| station_min_temp_c | total_cases | 0.1871 | 0.1871 |
| reanalysis_tdtr_k | total_cases | 0.1643 | -0.1643 |
| station_max_temp_c | total_cases | 0.1503 | 0.1503 |
| ndvi_nw | total_cases | 0.1335 | 0.1335 |
| reanalysis_sat_precip_amt_mm | total_cases | 0.1295 | 0.1295 |
| station_diur_temp_rng_c | total_cases | 0.0269 | 0.0269 |
| ndvi_sw | total_cases | 0.0082 | -0.0082 |
| ndvi_se | total_cases | 0.0005 | 0.0005 |
Compare correlation by lags
x <- c()
cor_cons <- c()
for(i in names(sj[, which(sapply(sj,is.numeric))])) {
x <- cor.lag(x = sj[ , i], y = sj$total_cases, lag = 10, lead = 10)
cor_cons <- rbind(cor_cons, x)
}
cor_cons$var <- names(sj[, which(sapply(sj,is.numeric))])
cor_cons$best <- colnames(cor_cons)[apply(cor_cons,1,which.max)]
cor_cons %>% select(var, best, everything()) %>% head() %>% kable() %>%
kable_styling()| var | best | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 | lag8 | lag9 | lag10 | 0 | lead1 | lead2 | lead3 | lead4 | lead5 | lead6 | lead7 | lead8 | lead9 | lead10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| year | lead10 | -0.2142424 | -0.2155294 | -0.2168781 | -0.2180879 | -0.2189452 | -0.2197756 | -0.2203656 | -0.2207159 | -0.2208690 | -0.2211777 | -0.2126898 | -0.2121132 | -0.2112093 | -0.2095755 | -0.2074285 | -0.2048120 | -0.2020628 | -0.1992569 | -0.1962078 | -0.1930860 | -0.1901623 |
| weekofyear | lead2 | 0.2847285 | 0.2827232 | 0.2791242 | 0.2700510 | 0.2593818 | 0.2426145 | 0.2241239 | 0.2052275 | 0.1849555 | 0.1618064 | 0.2871342 | 0.2871300 | 0.2904272 | 0.2807499 | 0.2629795 | 0.2339214 | 0.1998923 | 0.1661635 | 0.1239205 | 0.0828083 | 0.0447503 |
| ndvi_nw | lag10 | 0.0902937 | 0.0851271 | 0.0858996 | 0.0845695 | 0.0874551 | 0.0902990 | 0.0931194 | 0.1011780 | 0.1096144 | 0.1160942 | 0.0869550 | 0.0816922 | 0.0747573 | 0.0601907 | 0.0648398 | 0.0573947 | 0.0541021 | 0.0545117 | 0.0458445 | 0.0383855 | 0.0355512 |
| ndvi_se | lead10 | -0.0022149 | -0.0160184 | -0.0198933 | -0.0175861 | -0.0165110 | -0.0201982 | -0.0330849 | -0.0332932 | -0.0344726 | -0.0409569 | 0.0006669 | -0.0035401 | -0.0117113 | -0.0149307 | -0.0129014 | -0.0007484 | 0.0109015 | 0.0153835 | 0.0070941 | 0.0059332 | 0.0164549 |
| ndvi_sw | lag1 | 0.0160225 | 0.0030702 | -0.0010090 | 0.0054755 | 0.0068771 | 0.0134313 | 0.0050646 | 0.0059238 | 0.0028990 | -0.0064268 | 0.0089708 | 0.0072888 | -0.0036255 | -0.0069402 | -0.0114939 | -0.0045615 | 0.0071190 | 0.0127540 | 0.0107402 | 0.0015031 | 0.0041353 |
| reanalysis_dew_point_temp_k | lag8 | 0.2238978 | 0.2456851 | 0.2643471 | 0.2782623 | 0.2885197 | 0.2950183 | 0.2979067 | 0.3024205 | 0.2991529 | 0.2902735 | 0.2028006 | 0.1722485 | 0.1361254 | 0.1024487 | 0.0707769 | 0.0364153 | -0.0011604 | -0.0425603 | -0.0884030 | -0.1197374 | -0.1496668 |
Modelling San Juan
train model to predict Test (All train)
# Get the best mtry train a random forest using that mtry
sj$year <- as.character(sj$year)
preprocessparams <- preProcess(x = select(sj[ , which(sapply(sj,is.numeric))], -total_cases) ,method = c("center", "scale"))
sj <- predict(preprocessparams, sj)
sj$type <- NULL
#sj$week_start_date <- NULL
#opt_lambda <- BoxCox.lambda(sj$total_cases)
#sj$total_cases <- BoxCox(sj$total_cases, lambda = opt_lambda)
sj$ndvi_sw <- lag(sj$ndvi_nw, 1)
sj <- sj[complete.cases(sj), ]
rf_reg <- tuneRF(x = sj[ , -which(names(sj) == "total_cases")], y = sj$total_cases, ntreeTry=100,stepFactor=2,improve=0.05, doBest = T) ## mtry = 6 OOB error = 628.5642
## Searching left ...
## mtry = 3 OOB error = 1141.221
## -0.8156001 0.05
## Searching right ...
## mtry = 12 OOB error = 521.7531
## 0.1699288 0.05
## mtry = 19 OOB error = 451.9597
## 0.133767 0.05
## RMSE Rsquared MAE
## 9.2855402 0.9740188 5.0704260
Make final prediction for San Juan (test)
The missing values will be replaced by a forecast
Apply parameters and predict
sj_te <- predict(preprocessparams, sj_te)
sj_te$total_cases <- predict(rf_reg , sj_te)
#sj_te$total_cases <- sj_te$total_cases^2
ggplot(sj_te, aes(week_start_date, total_cases)) + geom_line()sj$year <- as.numeric(sj$year)
sj$weekofyear <- as.numeric(sj$weekofyear)
#sj$total_cases <- InvBoxCox(sj$total_cases, opt_lambda)
#sj_te$total_cases <- InvBoxCox(sj_te$total_cases, lambda = opt_lambda)
ggplot(data = union_all(sj, sj_te), aes(x = week_start_date, y = total_cases, color = type)) + geom_line() Modelling Iquito
Iquito
train model to predict Test (All train)
iq$city <- NULL
iq$type <- NULL
iq$year <- as.character(iq$year)
preprocessparams <- preProcess(x = select(iq[ , which(sapply(iq,is.numeric))], -total_cases),method = c("center", "scale","pca"))
iq <- predict(preprocessparams, iq)
#opt_lambda <- BoxCox.lambda(iq$total_cases)
#iq$total_cases <- BoxCox(iq$total_cases, lambda = opt_lambda)
rf_reg <- tuneRF(x = iq[ , -which(colnames(iq)=="total_cases")], iq$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=FALSE, doBest = T) ## -0.09242143 0.05
## -0.09343546 0.05
iq$pred <- predict(rf_reg, iq)
ggplot(data = iq, aes(x = week_start_date)) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" )) Make final prediction for Iquito (test)
The missing values will be replaced by a forecast
#iq_te$type <- NULL
iq_te$city <- NULL
#iq_te$year <- NULL
#iq_te$week_start_date <- NULL
iq_te$weekofyear <- as.character(iq_te$weekofyear)
iq_te <- predict(preprocessparams, iq_te)iq_te$total_cases <- predict(rf_reg, iq_te)
iq$year <- as.numeric(iq$year)
iq$weekofyear <- as.numeric(iq$weekofyear)
iq_te$weekofyear <- as.numeric(iq_te$weekofyear)
iq$type <- "train"
#iq$total_cases <- InvBoxCox(iq$total_cases, opt_lambda)
#iq_te$total_cases <- InvBoxCox(iq_te$total_cases, lambda = opt_lambda)
ggplot(data = union_all(iq, iq_te), aes(x = week_start_date, y = total_cases, color = type)) + geom_line() Final Submission using RF
iq_te$city <- "iq"
iq_te$type <- "test"
rf_final_df <- rbind(iq_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")],
sj_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])
submission_format$total_cases <- 10
#submission_format$year <- as.character(submission_format$year)
#submission_format$weekofyear <- as.character(submission_format$weekofyear)
submission_rf <- full_join(x = submission_format, y = rf_final_df, by = c("year","weekofyear", "city"))
data.table::setnames(submission_rf, old = "total_cases.y", "total_cases")
submission_rf$total_cases <- base::round(submission_rf$total_cases,0) # Round
ggplot(submission_rf, aes(week_start_date, total_cases)) + geom_line() + facet_wrap(city~.)submission_rf <- submission_rf %>% select(city, year, weekofyear, total_cases)
write.csv(submission_rf, "submission_rf.csv", quote = F, row.names = F)
summary(submission_format)## city year weekofyear total_cases
## Length:416 Min. :2008 Min. : 1.00 Min. :10
## Class :character 1st Qu.:2010 1st Qu.:13.75 1st Qu.:10
## Mode :character Median :2011 Median :26.00 Median :10
## Mean :2011 Mean :26.44 Mean :10
## 3rd Qu.:2012 3rd Qu.:39.00 3rd Qu.:10
## Max. :2013 Max. :53.00 Max. :10
## city year weekofyear total_cases
## Length:416 Min. :2008 Min. : 1.00 Min. : 3.00
## Class :character 1st Qu.:2010 1st Qu.:13.75 1st Qu.: 7.00
## Mode :character Median :2011 Median :26.00 Median : 13.00
## Mean :2011 Mean :26.44 Mean : 24.62
## 3rd Qu.:2012 3rd Qu.:39.00 3rd Qu.: 32.00
## Max. :2013 Max. :53.00 Max. :137.00
Prophet
Prophet San Juan
## Loading required package: Rcpp
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following object is masked from 'package:data.table':
##
## :=
sj_prohet_model <- prophet(sj_prophet, daily.seasonality = F, weekly.seasonality = F, yearly.seasonality = T, seasonality.mode = "multiplicative")
future <- make_future_dataframe(sj_prohet_model, 500, freq = "week")
forecast <- predict(sj_prohet_model, future)
forecast$year <- year(forecast$ds)
forecast$weekofyear <- week(forecast$ds)
dyplot.prophet(x = sj_prohet_model, fcst = forecast)Prophet Iquito
iq_prophet <- setnames(iq, c("week_start_date", "total_cases"),new = c("ds","y"))
iq_prohet_model <- prophet(iq_prophet)## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future <- make_future_dataframe(iq_prohet_model, 500, freq = "week")
forecast <- predict(iq_prohet_model, future)
forecast$year <- year(forecast$ds)
forecast$weekofyear <- week(forecast$ds)
dyplot.prophet(x = iq_prohet_model, fcst = forecast)Match Prophet and submission
forecast$ds <- as.Date(forecast$ds)
forecast$city <- "iq"
forecast <- rbind(forecast_sj, forecast)
submission <- left_join(submission_format, y = forecast[ , c("ds","weekofyear","year", "yhat", "city")], by = c("year","weekofyear", "city"))
submission$yhat <- imputeTS::na.kalman(submission$yhat) # fill NAs due to 53 week yearChange negative values to 0
Final final submission with the mean of both previous
submission_mean <- submission
a <- cbind(submission$total_cases, submission_rf$total_cases)
submission_mean$total_cases <- round(apply(X = a, MARGIN = 1, function(x) mean(x)),0)
write.csv(submission_mean, "submission_mean.csv", quote = F, row.names = F)–> –>