DengAI-Predicting-Disease-Spread

loading packages and data
Split data by city
Correlation analysis (with lags)
Modelling San Juan
- Make final prediction for San Juan (test)
- Apply parameters and predict
Modelling Iquito
- Make final prediction for Iquito (test)
Final Submission using RF
Prophet
- Prophet San Juan
  - Match Prophet and submission
Prophet Iquito

loading packages and data

pacman:: p_load(readr, h2o, dplyr,ggplot2, caret, imputTS,  data.table, lubridate, corrplot, quantreg, Ecfun, randomForest, forecast)


features_tr <- read_csv("./input/dengue_features_train.csv", 
    col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))

labels_tr <- read_csv("./input/dengue_labels_train.csv")

features_te <- read_csv("./input/dengue_features_test.csv", 
    col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))

submission_format <- read_csv("./input/submission_format.csv")


options(scipen=999)

create two dataset, train and test

train <- left_join(x = features_tr, y = labels_tr, by = c("year", "weekofyear", "city"))

test <- left_join(x = features_te, y = submission_format, by = c("year", "weekofyear", "city"))

train$type <- "train"

test$type <- "test"

all <- rbind(train, test)

summary(all[ , c("station_avg_temp_c", "reanalysis_avg_temp_k", "reanalysis_air_temp_k")])

##  station_avg_temp_c reanalysis_avg_temp_k reanalysis_air_temp_k
##  Min.   :21.40      Min.   :294.9         Min.   :294.6        
##  1st Qu.:26.33      1st Qu.:298.3         1st Qu.:297.7        
##  Median :27.43      Median :299.3         Median :298.6        
##  Mean   :27.23      Mean   :299.3         Mean   :298.7        
##  3rd Qu.:28.20      3rd Qu.:300.3         3rd Qu.:299.9        
##  Max.   :30.80      Max.   :303.3         Max.   :302.2        
##  NA's   :55         NA's   :12            NA's   :12

summary(all[ , c("station_precip_mm", "precipitation_amt_mm")])

##  station_precip_mm precipitation_amt_mm
##  Min.   :  0.0     Min.   :  0.00      
##  1st Qu.:  8.8     1st Qu.:  9.43      
##  Median : 23.8     Median : 36.64      
##  Mean   : 38.2     Mean   : 44.11      
##  3rd Qu.: 51.5     3rd Qu.: 67.50      
##  Max.   :543.3     Max.   :390.60      
##  NA's   :27        NA's   :15

all <- all %>% group_by(city,year, week_start_date) %>%  mutate(avg_temp = mean(reanalysis_avg_temp_k, reanalysis_air_temp_k, na.rm = T, trim = 0), avg_precip = mean(station_precip_mm, precipitation_amt_mm, na.rm = T, trim = 0),  station_avg_temp_c = NULL, reanalysis_avg_temp_k = NULL, reanalysis_air_temp_k = NULL, station_precip_mm = NULL, precipitation_amt_mm = NULL) # Create avg of tempearature and preciptation

Inital investigation

#summary(train)

#summary(test)

Checking distributions

the plots below confirm that most of the predictors haves similar distributions across train and test

# for (i in names(all)) {
# x <- ggplot(data = all,aes_string(i,color="type")) + geom_density()
# print(x)
# }

treat NAs

Split data by city

sj <-  all %>% filter(city == "sj" & type == "train") 

# ggplot(data = sj, aes(week_start_date, station_precip_mm)) + geom_line()
# 
# ggplot(data = sj, aes(x = week_start_date, imputeTS::na.kalman(sj$station_precip_mm))) + geom_line()

tail(sj)

## # A tibble: 6 x 23
## # Groups:   city, year, week_start_date [6]
##   city   year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
##   <chr> <dbl>      <dbl> <date>            <dbl>   <dbl>   <dbl>   <dbl>
## 1 sj     2008         12 2008-03-18       0.0449  0.0244  0.102   0.088 
## 2 sj     2008         13 2008-03-25       0.0778 -0.0399  0.310   0.296 
## 3 sj     2008         14 2008-04-01      -0.038  -0.0168  0.119   0.0664
## 4 sj     2008         15 2008-04-08      -0.155  -0.0528  0.138   0.141 
## 5 sj     2008         16 2008-04-15       0.0018 NA       0.204   0.210 
## 6 sj     2008         17 2008-04-22      -0.037  -0.0104  0.0773  0.0906
## # ... with 15 more variables: reanalysis_dew_point_temp_k <dbl>,
## #   reanalysis_max_air_temp_k <dbl>, reanalysis_min_air_temp_k <dbl>,
## #   reanalysis_precip_amt_kg_per_m2 <dbl>,
## #   reanalysis_relative_humidity_percent <dbl>,
## #   reanalysis_sat_precip_amt_mm <dbl>,
## #   reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## #   station_diur_temp_rng_c <dbl>, station_max_temp_c <dbl>,
## #   station_min_temp_c <dbl>, total_cases <dbl>, type <chr>,
## #   avg_temp <dbl>, avg_precip <dbl>

The missing values will be replaced by a forecast

sj <- cbind(sj[ , c("city", "type", "week_start_date")] ,  apply(X = sj[, which(sapply(sj,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))

Iquitos

iq <- all %>% filter(city == "iq" & type == "train")

# ggplot(data = iq, aes(week_start_date, station_diur_temp_rng_c)) + geom_line()
# 
# ggplot(data = iq, aes(x = week_start_date, imputeTS::na.kalman(iq$station_diur_temp_rng_c))) + geom_line()

iq <- cbind(iq[ , c("city", "type", "week_start_date")] ,  apply(X = iq[, which(sapply(iq,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))

total cases across cities

ggplot(sj, aes(week_start_date, y = total_cases)) + geom_line()

ggplot(iq, aes(week_start_date, y = total_cases)) + geom_line()

tail(sj)

##     city  type week_start_date year weekofyear  ndvi_ne     ndvi_nw
## 931   sj train      2008-03-18 2008         12  0.04490  0.02445000
## 932   sj train      2008-03-25 2008         13  0.07785 -0.03990000
## 933   sj train      2008-04-01 2008         14 -0.03800 -0.01683333
## 934   sj train      2008-04-08 2008         15 -0.15520 -0.05275000
## 935   sj train      2008-04-15 2008         16  0.00180 -0.04492786
## 936   sj train      2008-04-22 2008         17 -0.03700 -0.01036667
##        ndvi_se    ndvi_sw reanalysis_dew_point_temp_k
## 931 0.10162860 0.08800000                    292.2057
## 932 0.31047140 0.29624290                    292.0957
## 933 0.11937140 0.06638571                    293.2357
## 934 0.13775710 0.14121430                    292.7329
## 935 0.20390000 0.20984290                    292.2743
## 936 0.07731429 0.09058571                    294.2800
##     reanalysis_max_air_temp_k reanalysis_min_air_temp_k
## 931                     299.8                     294.9
## 932                     299.7                     294.4
## 933                     299.8                     296.5
## 934                     299.4                     295.8
## 935                     299.7                     295.9
## 936                     300.9                     295.9
##     reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent
## 931                            0.90                             72.91571
## 932                            7.55                             74.24714
## 933                            3.67                             74.60000
## 934                           35.00                             75.02714
## 935                            4.82                             72.28571
## 936                            2.17                             76.96000
##     reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg
## 931                         0.00                              13.73714
## 932                        27.19                              13.64429
## 933                         3.82                              14.66286
## 934                        16.96                              14.18429
## 935                         0.00                              13.85857
## 936                         0.00                              15.67143
##     reanalysis_tdtr_k station_diur_temp_rng_c station_max_temp_c
## 931          3.871429                7.042857               30.0
## 932          2.885714                5.785714               30.0
## 933          2.714286                6.814286               30.6
## 934          2.185714                5.714286               29.4
## 935          2.785714                6.242857               29.4
## 936          3.957143                7.514286               31.7
##     station_min_temp_c total_cases avg_temp avg_precip
## 931               20.6           3 297.4357        0.5
## 932               21.1           4 296.9571        1.8
## 933               22.2           3 298.2286        0.5
## 934               21.7           1 297.5643       30.7
## 935               21.7           3 297.7786       11.2
## 936               23.3           5 298.6929        0.3

ggplot(filter(all, type == "train"), aes(week_start_date, total_cases, color = c(city))) + geom_line()

Correlation analysis (with lags)

melt data to plot variables against total_cases

m <- reshape::melt(data = sj, id.vars = c("city", "type", "week_start_date", "year", "weekofyear")) # create a melted dataframe

# ggplot(filter(m, variable %in% c("ndvi_ne", "total_cases")), aes(week_start_date, y = value))+ geom_line() + facet_grid(variable~., scales = "free")
# 
# m %>% filter(variable %in% c("station_min_temp_c", "total_cases")) %>%  ggplot(aes(week_start_date, y = value))+ geom_line() + facet_grid(variable~., scales = "free")

Pairwise correlation

function for pairwise correlation

pairwiseCor <- function(dataframe){
  pairs <- combn(names(dataframe), 2, simplify=FALSE)
  df <- data.frame(Vairable1=rep(0,length(pairs)), Variable2=rep(0,length(pairs)), 
                   AbsCor=rep(0,length(pairs)), Cor=rep(0,length(pairs)))
  for(i in 1:length(pairs)){
    df[i,1] <- pairs[[i]][1]
    df[i,2] <- pairs[[i]][2]
    df[i,3] <- round(abs(cor(dataframe[,pairs[[i]][1]], dataframe[,pairs[[i]][2]], method = "spearman")),4)
    df[i,4] <- round(cor(dataframe[,pairs[[i]][1]], dataframe[,pairs[[i]][2]], method = "spearman"),4)
  }
  pairwiseCorDF <- df
  pairwiseCorDF <- pairwiseCorDF[order(pairwiseCorDF$AbsCor, decreasing=TRUE),]
  row.names(pairwiseCorDF) <- 1:length(pairs)
  pairwiseCorDF <<- pairwiseCorDF
  pairwiseCorDF
}

corr_all <- cor(sj[, which(sapply(sj,is.numeric))])

pairwise <- pairwiseCor(sj[, which(sapply(sj,is.numeric))])

pairwise_total_cases <- filter(pairwise, Variable2 == "total_cases")

pairwise_total_cases

##                                Vairable1   Variable2 AbsCor     Cor
## 1                             weekofyear total_cases 0.3587  0.3587
## 2                                   year total_cases 0.3332 -0.3332
## 3  reanalysis_specific_humidity_g_per_kg total_cases 0.2593  0.2593
## 4            reanalysis_dew_point_temp_k total_cases 0.2538  0.2538
## 5              reanalysis_min_air_temp_k total_cases 0.2311  0.2311
## 6        reanalysis_precip_amt_kg_per_m2 total_cases 0.2229  0.2229
## 7   reanalysis_relative_humidity_percent total_cases 0.1981  0.1981
## 8              reanalysis_max_air_temp_k total_cases 0.1976  0.1976
## 9                     station_min_temp_c total_cases 0.1871  0.1871
## 10                     reanalysis_tdtr_k total_cases 0.1643 -0.1643
## 11                    station_max_temp_c total_cases 0.1503  0.1503
## 12                               ndvi_nw total_cases 0.1335  0.1335
## 13          reanalysis_sat_precip_amt_mm total_cases 0.1295  0.1295
## 14                               ndvi_ne total_cases 0.0835  0.0835
## 15               station_diur_temp_rng_c total_cases 0.0269  0.0269
## 16                               ndvi_sw total_cases 0.0082 -0.0082
## 17                               ndvi_se total_cases 0.0005  0.0005

findCorrelation(x = corr_all, names = T)

## [1] "reanalysis_specific_humidity_g_per_kg"
## [2] "avg_temp"

head(pairwise)

##                               Vairable1
## 1           reanalysis_dew_point_temp_k
## 2             reanalysis_min_air_temp_k
## 3             reanalysis_max_air_temp_k
## 4           reanalysis_dew_point_temp_k
## 5           reanalysis_dew_point_temp_k
## 6 reanalysis_specific_humidity_g_per_kg
##                               Variable2 AbsCor    Cor
## 1 reanalysis_specific_humidity_g_per_kg 0.9996 0.9996
## 2                              avg_temp 0.9459 0.9459
## 3                              avg_temp 0.9433 0.9433
## 4                              avg_temp 0.8910 0.8910
## 5             reanalysis_min_air_temp_k 0.8905 0.8905
## 6                              avg_temp 0.8890 0.8890

Compare correlation by lags

compare_cor <- c()

# for(i in 1:20) {
# 
# a <- sj %>% select(station_avg_temp_c,total_cases)  
#     
# b <- stats::cor(cbind(a$total_cases, lag(a$station_avg_temp_c,i)), method = "spearman" , use = "complete.obs")
# 
# compare_cor <- rbind(compare_cor, b[1,])
# 
# }

# which(abs(compare_cor[, 2]) == max(abs(compare_cor[,2])))

Modelling San Juan

sj$city <- NULL

sj$weekofyear <- as.character(sj$weekofyear)

train model to predict Test (All train)

# Get the best mtry train a random forest using that mtry

sj$year <- as.character(sj$year)

preprocessparams <- preProcess(x = select(sj[ , which(sapply(sj,is.numeric))], -total_cases) ,method = c("center", "scale", "pca"))


sj <- predict(preprocessparams, sj)

sj$type <- NULL
#sj$week_start_date <- NULL

opt_lambda <- BoxCox.lambda(sj$total_cases)

sj$total_cases <- BoxCox(sj$total_cases, lambda = opt_lambda)

rf_reg <- tuneRF(x = sj[ , -which(names(sj) == "total_cases")], y = sj$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=TRUE, doBest = T)

## mtry = 4  OOB error = 3.544785 
## Searching left ...
## mtry = 2     OOB error = 5.414495 
## -0.5274537 0.05 
## Searching right ...
## mtry = 8     OOB error = 2.649467 
## 0.2525734 0.05 
## mtry = 13    OOB error = 2.616229 
## 0.01254497 0.05

sj$pred <- predict(rf_reg, sj)

postResample(sj$pred, sj$total_cases)

##      RMSE  Rsquared       MAE 
## 0.6511216 0.9760122 0.4836689

ggplot(data = sj, aes(x = week_start_date)) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))

sj$pred <- NULL
sj$type <- "train"

Make final prediction for San Juan (test)

sj_te <- all %>% filter(city == "sj" & type == "test")

#sj_te$year <- as.character(sj_te$year)

The missing values will be replaced by a forecast

sj_te <- cbind(sj_te[ , c("city", "type", "week_start_date")] ,  apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))

Apply parameters and predict

sj_te <- predict(preprocessparams, sj_te)

sj_te$total_cases <- predict(rf_reg , sj_te)

#sj_te$total_cases <- sj_te$total_cases^2

ggplot(sj_te, aes(week_start_date, total_cases)) + geom_line()

sj$year <- as.numeric(sj$year)
sj$weekofyear <- as.numeric(sj$weekofyear)

sj$total_cases <- InvBoxCox(sj$total_cases, opt_lambda)

sj_te$total_cases <- InvBoxCox(sj_te$total_cases, lambda = opt_lambda)

ggplot(data = union_all(sj, sj_te), aes(x = week_start_date, y = total_cases, color = type)) + geom_line()

Modelling Iquito

Iquito

iq$weekofyear <- as.character(iq$weekofyear)

train model to predict Test (All train)

## mtry = 4  OOB error = 3.276995 
## Searching left ...
## mtry = 2     OOB error = 3.663842 
## -0.1180494 0.05 
## Searching right ...
## mtry = 8     OOB error = 2.880987 
## 0.1208449 0.05 
## mtry = 13    OOB error = 3.059203 
## -0.06185953 0.05

Make final prediction for Iquito (test)

iq_te <- all %>% filter(city == "iq" & type == "test")

The missing values will be replaced by a forecast

iq_te <- cbind(iq_te[ , c("city", "type", "week_start_date", "weekofyear")] ,  apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))

#iq_te$type <- NULL
iq_te$city <- NULL
#iq_te$year <- NULL
#iq_te$week_start_date <- NULL

iq_te$weekofyear <- as.character(iq_te$weekofyear)


iq_te <- predict(preprocessparams, iq_te)

iq_te$total_cases <- predict(rf_reg, iq_te)


iq$year <- as.numeric(iq$year)
iq$weekofyear <- as.numeric(iq$weekofyear)
iq_te$weekofyear <- as.numeric(iq_te$weekofyear)

iq$type <- "train"

iq$total_cases <- InvBoxCox(iq$total_cases, opt_lambda)

iq_te$total_cases <- InvBoxCox(iq_te$total_cases, lambda = opt_lambda)

ggplot(data = union_all(iq, iq_te), aes(x = week_start_date, y = total_cases, color = type)) + geom_line()

Final Submission using RF

iq_te$city <- "iq" 

iq_te$type <- "test"

rf_final_df <- rbind(iq_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")], 
                     sj_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])

submission_format$total_cases <- 10


#submission_format$year <- as.character(submission_format$year)

#submission_format$weekofyear <- as.character(submission_format$weekofyear)

submission_rf <- full_join(x = submission_format, y = rf_final_df, by = c("year","weekofyear", "city"))

data.table::setnames(submission_rf, old = "total_cases.y", "total_cases")

submission_rf$total_cases <- base::round(submission_rf$total_cases,0) # Round

ggplot(submission_rf, aes(week_start_date, total_cases)) + geom_line() + facet_wrap(city~.)

ggplot(rf_final_df, aes(week_start_date, total_cases)) + geom_line() + facet_wrap(city~.)

submission_rf <- submission_rf %>% select(city, year, weekofyear, total_cases) 

write.csv(submission_rf, "submission_rf.csv", quote = F, row.names = F)

summary(submission_format)

##      city                year        weekofyear     total_cases
##  Length:416         Min.   :2008   Min.   : 1.00   Min.   :10  
##  Class :character   1st Qu.:2010   1st Qu.:13.75   1st Qu.:10  
##  Mode  :character   Median :2011   Median :26.00   Median :10  
##                     Mean   :2011   Mean   :26.44   Mean   :10  
##                     3rd Qu.:2012   3rd Qu.:39.00   3rd Qu.:10  
##                     Max.   :2013   Max.   :53.00   Max.   :10

summary(submission_rf)

##      city                year        weekofyear     total_cases   
##  Length:416         Min.   :2008   Min.   : 1.00   Min.   : 1.00  
##  Class :character   1st Qu.:2010   1st Qu.:13.75   1st Qu.: 3.00  
##  Mode  :character   Median :2011   Median :26.00   Median : 8.00  
##                     Mean   :2011   Mean   :26.44   Mean   :18.26  
##                     3rd Qu.:2012   3rd Qu.:39.00   3rd Qu.:24.00  
##                     Max.   :2013   Max.   :53.00   Max.   :84.00

Prophet

Prophet San Juan

sj_prophet <- setnames(sj, c("week_start_date", "total_cases"),new = c("ds","y"))

library(prophet)

## Loading required package: Rcpp

## Loading required package: rlang

## 
## Attaching package: 'rlang'

## The following object is masked from 'package:data.table':
## 
##     :=

sj_prohet_model <- prophet(sj_prophet, daily.seasonality = F, weekly.seasonality = F, yearly.seasonality = T, seasonality.mode = "multiplicative")

future <- make_future_dataframe(sj_prohet_model, 500, freq = "week")

forecast <- predict(sj_prohet_model, future)

forecast$year <- year(forecast$ds)

forecast$weekofyear <- week(forecast$ds)


dyplot.prophet(x = sj_prohet_model, fcst = forecast)

Match Prophet and submission

forecast$ds <- as.Date(forecast$ds)

forecast$city <- "sj"

forecast_sj <- forecast

submission <- left_join(submission_format, y = forecast[ , c("ds","weekofyear","year", "yhat", "city")], by = c("year","weekofyear", "city"))

Prophet Iquito

iq_prophet <- setnames(iq, c("week_start_date", "total_cases"),new = c("ds","y"))

iq_prohet_model <- prophet(iq_prophet)

## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

future <- make_future_dataframe(iq_prohet_model, 500, freq = "week")

forecast <- predict(iq_prohet_model, future)

forecast$year <- year(forecast$ds)

forecast$weekofyear <- week(forecast$ds)

dyplot.prophet(x = iq_prohet_model, fcst = forecast)

Match Prophet and submission

forecast$ds <- as.Date(forecast$ds)

forecast$city <- "iq"

forecast <- rbind(forecast_sj, forecast)

submission <- left_join(submission_format, y = forecast[ , c("ds","weekofyear","year", "yhat", "city")], by = c("year","weekofyear", "city"))

submission$yhat <- imputeTS::na.kalman(submission$yhat) # fill NAs due to 53 week year

Change negative values to 0

submission$yhat[submission$yhat < 0 ] <- 0

submission$total_cases <- submission$yhat

submission <- submission %>% select(city, year, weekofyear, total_cases)

submission$total_cases <- base::round(submission$total_cases,0)

write.csv(submission, "submission.csv", quote = F, row.names = F)

Final final submission with the mean of both previous

submission_mean <- submission

a <- cbind(submission$total_cases, submission_rf$total_cases)

submission_mean$total_cases <- round(apply(X = a, MARGIN = 1, function(x) mean(x)),0)

write.csv(submission_mean, "submission_mean.csv", quote = F, row.names = F)

–> */

–> –>