DengAI-Predicting-Disease-Spread

loading packages and data
- Checking distributions
Split data by city
- Missing value analysis
Correlation analysis (with lags)
Modelling San Juan
- Make final prediction for San Juan (test)
- Apply parameters and predict
Modelling Iquito
- Make final prediction for Iquito (test)
Final Submission using RF
Prophet
- Prophet San Juan
  - Match Prophet and submission
Prophet Iquito

loading packages and data

pacman:: p_load(readr, h2o, dplyr,ggplot2, caret, imputTS,  data.table, lubridate,
                corrplot, quantreg, Ecfun, randomForest, forecast, PMmisc, kableExtra)


features_tr <- read_csv("../input/dengue_features_train.csv", 
    col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))

labels_tr <- read_csv("../input/dengue_labels_train.csv")

features_te <- read_csv("../input/dengue_features_test.csv", 
    col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))

submission_format <- read_csv("../input/submission_format.csv")


options(scipen=999)

create two dataset, train and test

train <- left_join(x = features_tr, y = labels_tr, by = c("year", "weekofyear", "city"))

test <- left_join(x = features_te, y = submission_format, by = c("year", "weekofyear", "city"))

train$type <- "train"

test$type <- "test"

all <- rbind(train, test)

summary(all[ , c("station_avg_temp_c", "reanalysis_avg_temp_k", "reanalysis_air_temp_k")])

##  station_avg_temp_c reanalysis_avg_temp_k reanalysis_air_temp_k
##  Min.   :21.40      Min.   :294.9         Min.   :294.6        
##  1st Qu.:26.33      1st Qu.:298.3         1st Qu.:297.7        
##  Median :27.43      Median :299.3         Median :298.6        
##  Mean   :27.23      Mean   :299.3         Mean   :298.7        
##  3rd Qu.:28.20      3rd Qu.:300.3         3rd Qu.:299.9        
##  Max.   :30.80      Max.   :303.3         Max.   :302.2        
##  NA's   :55         NA's   :12            NA's   :12

summary(all[ , c("station_precip_mm", "precipitation_amt_mm")])

##  station_precip_mm precipitation_amt_mm
##  Min.   :  0.0     Min.   :  0.00      
##  1st Qu.:  8.8     1st Qu.:  9.43      
##  Median : 23.8     Median : 36.64      
##  Mean   : 38.2     Mean   : 44.11      
##  3rd Qu.: 51.5     3rd Qu.: 67.50      
##  Max.   :543.3     Max.   :390.60      
##  NA's   :27        NA's   :15

all <- all %>% group_by(city,year, week_start_date) %>%  mutate(avg_temp = mean(reanalysis_avg_temp_k, reanalysis_air_temp_k, na.rm = T, trim = 0), avg_precip = mean(station_precip_mm, precipitation_amt_mm, na.rm = T, trim = 0),  station_avg_temp_c = NULL, reanalysis_avg_temp_k = NULL, reanalysis_air_temp_k = NULL, station_precip_mm = NULL, precipitation_amt_mm = NULL) # Create avg of tempearature and preciptation

Checking distributions

the plots below confirm that most of the predictors haves similar distributions across train and test

for (i in names(all)) {
x <- ggplot(data = all,aes_string(i,color="type")) + geom_density()
print(x)
}

Split data by city

sj <-  all %>% filter(city == "sj" & type == "train")

Missing value analysis

The missing values will be replaced by a forecast

apply(X = sj, 2, function(x) sum(is.na(x))/nrow(sj)) %>% knitr:: kable() %>%  kable_styling() # Get percentage of missing value by column

	x
city	0.0000000
year	0.0000000
weekofyear	0.0000000
week_start_date	0.0000000
ndvi_ne	0.2040598
ndvi_nw	0.0523504
ndvi_se	0.0202991
ndvi_sw	0.0202991
reanalysis_dew_point_temp_k	0.0064103
reanalysis_max_air_temp_k	0.0064103
reanalysis_min_air_temp_k	0.0064103
reanalysis_precip_amt_kg_per_m2	0.0064103
reanalysis_relative_humidity_percent	0.0064103
reanalysis_sat_precip_amt_mm	0.0096154
reanalysis_specific_humidity_g_per_kg	0.0064103
reanalysis_tdtr_k	0.0064103
station_diur_temp_rng_c	0.0064103
station_max_temp_c	0.0064103
station_min_temp_c	0.0064103
total_cases	0.0000000
type	0.0000000
avg_temp	0.0064103
avg_precip	0.0064103

sj$ndvi_ne <- NULL # ndvi_ne has 20% of missing values so will not be considered 

sj <- cbind(sj[ , c("city", "type", "week_start_date")] ,  apply(X = sj[, which(sapply(sj,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))

Iquitos

iq <- all %>% filter(city == "iq" & type == "train")

apply(X = iq, 2, function(x) sum(is.na(x))/nrow(iq)) %>% knitr:: kable() %>%  kable_styling() # Get percentage of missing value by column

	x
city	0.0000000
year	0.0000000
weekofyear	0.0000000
week_start_date	0.0000000
ndvi_ne	0.0057692
ndvi_nw	0.0057692
ndvi_se	0.0057692
ndvi_sw	0.0057692
reanalysis_dew_point_temp_k	0.0076923
reanalysis_max_air_temp_k	0.0076923
reanalysis_min_air_temp_k	0.0076923
reanalysis_precip_amt_kg_per_m2	0.0076923
reanalysis_relative_humidity_percent	0.0076923
reanalysis_sat_precip_amt_mm	0.0076923
reanalysis_specific_humidity_g_per_kg	0.0076923
reanalysis_tdtr_k	0.0076923
station_diur_temp_rng_c	0.0711538
station_max_temp_c	0.0269231
station_min_temp_c	0.0153846
total_cases	0.0000000
type	0.0000000
avg_temp	0.0076923
avg_precip	0.0307692

iq <- cbind(iq[ , c("city", "type", "week_start_date")] ,  apply(X = iq[, which(sapply(iq,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))

total cases across cities

ggplot(sj, aes(week_start_date, y = total_cases)) + geom_line()

ggplot(iq, aes(week_start_date, y = total_cases)) + geom_line()

ggplot(filter(all, type == "train"), aes(week_start_date, total_cases, color = c(city))) + geom_line()

Correlation analysis (with lags)

melt data to plot variables against total_cases

m <- reshape::melt(data = sj, id.vars = c("city", "type", "week_start_date", "year", "weekofyear")) # create a melted dataframe

# ggplot(filter(m, variable %in% c("ndvi_ne", "total_cases")), aes(week_start_date, y = value))+ geom_line() + facet_grid(variable~., scales = "free")
# 
# m %>% filter(variable %in% c("station_min_temp_c", "total_cases")) %>%  ggplot(aes(week_start_date, y = value))+ geom_line() + facet_grid(variable~., scales = "free")

Pairwise correlation

function for pairwise correlation

pairwiseCor <- function(dataframe){
  pairs <- combn(names(dataframe), 2, simplify=FALSE)
  df <- data.frame(Vairable1=rep(0,length(pairs)), Variable2=rep(0,length(pairs)), 
                   AbsCor=rep(0,length(pairs)), Cor=rep(0,length(pairs)))
  for(i in 1:length(pairs)){
    df[i,1] <- pairs[[i]][1]
    df[i,2] <- pairs[[i]][2]
    df[i,3] <- round(abs(cor(dataframe[,pairs[[i]][1]], dataframe[,pairs[[i]][2]], method = "spearman")),4)
    df[i,4] <- round(cor(dataframe[,pairs[[i]][1]], dataframe[,pairs[[i]][2]], method = "spearman"),4)
  }
  pairwiseCorDF <- df
  pairwiseCorDF <- pairwiseCorDF[order(pairwiseCorDF$AbsCor, decreasing=TRUE),]
  row.names(pairwiseCorDF) <- 1:length(pairs)
  pairwiseCorDF <<- pairwiseCorDF
  pairwiseCorDF
}

corr_all <- cor(sj[, which(sapply(sj,is.numeric))])

pairwise <- pairwiseCor(sj[, which(sapply(sj,is.numeric))])

pairwise_total_cases <- filter(pairwise, Variable2 == "total_cases")

pairwise_total_cases %>% knitr:: kable() %>%  kable_styling()

Vairable1	Variable2	AbsCor	Cor
weekofyear	total_cases	0.3587	0.3587
year	total_cases	0.3332	-0.3332
reanalysis_specific_humidity_g_per_kg	total_cases	0.2593	0.2593
reanalysis_dew_point_temp_k	total_cases	0.2538	0.2538
reanalysis_min_air_temp_k	total_cases	0.2311	0.2311
reanalysis_precip_amt_kg_per_m2	total_cases	0.2229	0.2229
reanalysis_relative_humidity_percent	total_cases	0.1981	0.1981
reanalysis_max_air_temp_k	total_cases	0.1976	0.1976
station_min_temp_c	total_cases	0.1871	0.1871
reanalysis_tdtr_k	total_cases	0.1643	-0.1643
station_max_temp_c	total_cases	0.1503	0.1503
ndvi_nw	total_cases	0.1335	0.1335
reanalysis_sat_precip_amt_mm	total_cases	0.1295	0.1295
station_diur_temp_rng_c	total_cases	0.0269	0.0269
ndvi_sw	total_cases	0.0082	-0.0082
ndvi_se	total_cases	0.0005	0.0005

# findCorrelation(x = corr_all, names = T)

Compare correlation by lags

x <- c()
cor_cons <- c()
for(i in names(sj[, which(sapply(sj,is.numeric))])) {
x <- cor.lag(x = sj[ , i], y = sj$total_cases, lag = 10, lead = 10)

cor_cons <- rbind(cor_cons, x)
}

cor_cons$var <- names(sj[, which(sapply(sj,is.numeric))])

cor_cons$best <- colnames(cor_cons)[apply(cor_cons,1,which.max)]

cor_cons %>% select(var, best, everything()) %>% head() %>% kable() %>%
kable_styling()

var	best	lag1	lag2	lag3	lag4	lag5	lag6	lag7	lag8	lag9	lag10	0	lead1	lead2	lead3	lead4	lead5	lead6	lead7	lead8	lead9	lead10
year	lead10	-0.2142424	-0.2155294	-0.2168781	-0.2180879	-0.2189452	-0.2197756	-0.2203656	-0.2207159	-0.2208690	-0.2211777	-0.2126898	-0.2121132	-0.2112093	-0.2095755	-0.2074285	-0.2048120	-0.2020628	-0.1992569	-0.1962078	-0.1930860	-0.1901623
weekofyear	lead2	0.2847285	0.2827232	0.2791242	0.2700510	0.2593818	0.2426145	0.2241239	0.2052275	0.1849555	0.1618064	0.2871342	0.2871300	0.2904272	0.2807499	0.2629795	0.2339214	0.1998923	0.1661635	0.1239205	0.0828083	0.0447503
ndvi_nw	lag10	0.0902937	0.0851271	0.0858996	0.0845695	0.0874551	0.0902990	0.0931194	0.1011780	0.1096144	0.1160942	0.0869550	0.0816922	0.0747573	0.0601907	0.0648398	0.0573947	0.0541021	0.0545117	0.0458445	0.0383855	0.0355512
ndvi_se	lead10	-0.0022149	-0.0160184	-0.0198933	-0.0175861	-0.0165110	-0.0201982	-0.0330849	-0.0332932	-0.0344726	-0.0409569	0.0006669	-0.0035401	-0.0117113	-0.0149307	-0.0129014	-0.0007484	0.0109015	0.0153835	0.0070941	0.0059332	0.0164549
ndvi_sw	lag1	0.0160225	0.0030702	-0.0010090	0.0054755	0.0068771	0.0134313	0.0050646	0.0059238	0.0028990	-0.0064268	0.0089708	0.0072888	-0.0036255	-0.0069402	-0.0114939	-0.0045615	0.0071190	0.0127540	0.0107402	0.0015031	0.0041353
reanalysis_dew_point_temp_k	lag8	0.2238978	0.2456851	0.2643471	0.2782623	0.2885197	0.2950183	0.2979067	0.3024205	0.2991529	0.2902735	0.2028006	0.1722485	0.1361254	0.1024487	0.0707769	0.0364153	-0.0011604	-0.0425603	-0.0884030	-0.1197374	-0.1496668

Modelling San Juan

sj$city <- NULL

sj$weekofyear <- as.character(sj$weekofyear)

train model to predict Test (All train)

# Get the best mtry train a random forest using that mtry

sj$year <- as.character(sj$year)

preprocessparams <- preProcess(x = select(sj[ , which(sapply(sj,is.numeric))], -total_cases) ,method = c("center", "scale"))

sj <- predict(preprocessparams, sj)

sj$type <- NULL
#sj$week_start_date <- NULL

#opt_lambda <- BoxCox.lambda(sj$total_cases)

#sj$total_cases <- BoxCox(sj$total_cases, lambda = opt_lambda)

sj$ndvi_sw <- lag(sj$ndvi_nw, 1)

sj <- sj[complete.cases(sj), ]

rf_reg <- tuneRF(x = sj[ , -which(names(sj) == "total_cases")], y = sj$total_cases, ntreeTry=100,stepFactor=2,improve=0.05, doBest = T)

## mtry = 6  OOB error = 628.5642 
## Searching left ...
## mtry = 3     OOB error = 1141.221 
## -0.8156001 0.05 
## Searching right ...
## mtry = 12    OOB error = 521.7531 
## 0.1699288 0.05 
## mtry = 19    OOB error = 451.9597 
## 0.133767 0.05

sj$pred <- predict(rf_reg, sj)

postResample(sj$pred, sj$total_cases)

##      RMSE  Rsquared       MAE 
## 9.2855402 0.9740188 5.0704260

ggplot(data = sj, aes(x = week_start_date)) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))

sj$pred <- NULL
sj$type <- "train"

Make final prediction for San Juan (test)

sj_te <- all %>% filter(city == "sj" & type == "test")

#sj_te$year <- as.character(sj_te$year)

The missing values will be replaced by a forecast

sj_te <- cbind(sj_te[ , c("city", "type", "week_start_date")] ,  apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))

Apply parameters and predict

sj_te <- predict(preprocessparams, sj_te)

sj_te$total_cases <- predict(rf_reg , sj_te)

#sj_te$total_cases <- sj_te$total_cases^2

ggplot(sj_te, aes(week_start_date, total_cases)) + geom_line()

sj$year <- as.numeric(sj$year)
sj$weekofyear <- as.numeric(sj$weekofyear)

#sj$total_cases <- InvBoxCox(sj$total_cases, opt_lambda)

#sj_te$total_cases <- InvBoxCox(sj_te$total_cases, lambda = opt_lambda)

ggplot(data = union_all(sj, sj_te), aes(x = week_start_date, y = total_cases, color = type)) + geom_line()

Modelling Iquito

Iquito

iq$weekofyear <- as.character(iq$weekofyear)

train model to predict Test (All train)

iq$city <- NULL

iq$type <- NULL

iq$year <- as.character(iq$year)

preprocessparams <- preProcess(x = select(iq[ , which(sapply(iq,is.numeric))], -total_cases),method = c("center", "scale","pca"))

iq <- predict(preprocessparams, iq)

#opt_lambda <- BoxCox.lambda(iq$total_cases)

#iq$total_cases <- BoxCox(iq$total_cases, lambda = opt_lambda)

rf_reg <- tuneRF(x = iq[ , -which(colnames(iq)=="total_cases")], iq$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=FALSE, doBest = T)

## -0.09242143 0.05 
## -0.09343546 0.05

iq$pred <- predict(rf_reg, iq)

ggplot(data = iq, aes(x = week_start_date)) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))

combined_iq <- c()

combined_iq <- cbind(combined_iq, postResample(iq$total_cases, iq$pred))

Make final prediction for Iquito (test)

iq_te <- all %>% filter(city == "iq" & type == "test")

The missing values will be replaced by a forecast

iq_te <- cbind(iq_te[ , c("city", "type", "week_start_date", "weekofyear")] ,  apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) imputeTS::na.kalman(x)))

#iq_te$type <- NULL
iq_te$city <- NULL
#iq_te$year <- NULL
#iq_te$week_start_date <- NULL

iq_te$weekofyear <- as.character(iq_te$weekofyear)


iq_te <- predict(preprocessparams, iq_te)

iq_te$total_cases <- predict(rf_reg, iq_te)


iq$year <- as.numeric(iq$year)
iq$weekofyear <- as.numeric(iq$weekofyear)
iq_te$weekofyear <- as.numeric(iq_te$weekofyear)

iq$type <- "train"

#iq$total_cases <- InvBoxCox(iq$total_cases, opt_lambda)

#iq_te$total_cases <- InvBoxCox(iq_te$total_cases, lambda = opt_lambda)

ggplot(data = union_all(iq, iq_te), aes(x = week_start_date, y = total_cases, color = type)) + geom_line()

Final Submission using RF

iq_te$city <- "iq" 

iq_te$type <- "test"

rf_final_df <- rbind(iq_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")], 
                     sj_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])

submission_format$total_cases <- 10


#submission_format$year <- as.character(submission_format$year)

#submission_format$weekofyear <- as.character(submission_format$weekofyear)

submission_rf <- full_join(x = submission_format, y = rf_final_df, by = c("year","weekofyear", "city"))

data.table::setnames(submission_rf, old = "total_cases.y", "total_cases")

submission_rf$total_cases <- base::round(submission_rf$total_cases,0) # Round

ggplot(submission_rf, aes(week_start_date, total_cases)) + geom_line() + facet_wrap(city~.)

#ggplot(rf_final_df, aes(week_start_date, total_cases)) + geom_line() + facet_wrap(city~.)

submission_rf <- submission_rf %>% select(city, year, weekofyear, total_cases) 

write.csv(submission_rf, "submission_rf.csv", quote = F, row.names = F)

summary(submission_format)

##      city                year        weekofyear     total_cases
##  Length:416         Min.   :2008   Min.   : 1.00   Min.   :10  
##  Class :character   1st Qu.:2010   1st Qu.:13.75   1st Qu.:10  
##  Mode  :character   Median :2011   Median :26.00   Median :10  
##                     Mean   :2011   Mean   :26.44   Mean   :10  
##                     3rd Qu.:2012   3rd Qu.:39.00   3rd Qu.:10  
##                     Max.   :2013   Max.   :53.00   Max.   :10

summary(submission_rf)

##      city                year        weekofyear     total_cases    
##  Length:416         Min.   :2008   Min.   : 1.00   Min.   :  3.00  
##  Class :character   1st Qu.:2010   1st Qu.:13.75   1st Qu.:  7.00  
##  Mode  :character   Median :2011   Median :26.00   Median : 13.00  
##                     Mean   :2011   Mean   :26.44   Mean   : 24.62  
##                     3rd Qu.:2012   3rd Qu.:39.00   3rd Qu.: 32.00  
##                     Max.   :2013   Max.   :53.00   Max.   :137.00

Prophet

Prophet San Juan

sj_prophet <- setnames(sj, c("week_start_date", "total_cases"),new = c("ds","y"))

library(prophet)

## Loading required package: Rcpp

## Loading required package: rlang

## 
## Attaching package: 'rlang'

## The following object is masked from 'package:data.table':
## 
##     :=

sj_prohet_model <- prophet(sj_prophet, daily.seasonality = F, weekly.seasonality = F, yearly.seasonality = T, seasonality.mode = "multiplicative")

future <- make_future_dataframe(sj_prohet_model, 500, freq = "week")

forecast <- predict(sj_prohet_model, future)

forecast$year <- year(forecast$ds)

forecast$weekofyear <- week(forecast$ds)


dyplot.prophet(x = sj_prohet_model, fcst = forecast)

Match Prophet and submission

forecast$ds <- as.Date(forecast$ds)

forecast$city <- "sj"

forecast_sj <- forecast

submission <- left_join(submission_format, y = forecast[ , c("ds","weekofyear","year", "yhat", "city")], by = c("year","weekofyear", "city"))

Prophet Iquito

iq_prophet <- setnames(iq, c("week_start_date", "total_cases"),new = c("ds","y"))

iq_prohet_model <- prophet(iq_prophet)

## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

future <- make_future_dataframe(iq_prohet_model, 500, freq = "week")

forecast <- predict(iq_prohet_model, future)

forecast$year <- year(forecast$ds)

forecast$weekofyear <- week(forecast$ds)

dyplot.prophet(x = iq_prohet_model, fcst = forecast)

Match Prophet and submission

forecast$ds <- as.Date(forecast$ds)

forecast$city <- "iq"

forecast <- rbind(forecast_sj, forecast)

submission <- left_join(submission_format, y = forecast[ , c("ds","weekofyear","year", "yhat", "city")], by = c("year","weekofyear", "city"))

submission$yhat <- imputeTS::na.kalman(submission$yhat) # fill NAs due to 53 week year

Change negative values to 0

submission$yhat[submission$yhat < 0 ] <- 0

submission$total_cases <- submission$yhat

submission <- submission %>% select(city, year, weekofyear, total_cases)

submission$total_cases <- base::round(submission$total_cases,0)

write.csv(submission, "submission.csv", quote = F, row.names = F)

Final final submission with the mean of both previous

submission_mean <- submission

a <- cbind(submission$total_cases, submission_rf$total_cases)

submission_mean$total_cases <- round(apply(X = a, MARGIN = 1, function(x) mean(x)),0)

write.csv(submission_mean, "submission_mean.csv", quote = F, row.names = F)

–> */

–> –>