DengAI_Predicting Disease Spread Data Mining for Business Intelligence
RADHIKA Sood, REKHA RAJ, PRATHIBA Swamykannu, RUPANJALI Chattopadhyay, AJINKYA Dalvi, ACHAL Khullar, AMAN Rastogi, UMESH Singh
March 20, 2020
Background of the Project
For more details on the competition please visit https://www.drivendata.org/competitions/44/dengai-predicting-disease-spread/.
#install.packages("knitr",repos = "http://cran.us.r-project.org")
#install.packages("RCurl",repos = "http://cran.us.r-project.org")
library(RCurl)
#install.packages("e1071",repos = "http://cran.us.r-project.org")
library(e1071)
#install.packages("caret",repos = "http://cran.us.r-project.org")
library(caret)
#install.packages("doSNOW",repos = "http://cran.us.r-project.org")
library(doSNOW)
#install.packages("ipred",repos = "http://cran.us.r-project.org")
library(ipred)
#install.packages("xgboost",repos = "http://cran.us.r-project.org")
library(xgboost)
#library(devtools)
#devtools::install_github('topepo/caret/pkg/caret')
trfeat <- getURL("https://s3.amazonaws.com/drivendata/data/44/public/dengue_features_train.csv")
trfeat <-read.csv(text = trfeat)
tr <- trfeat[, -c(4)]
trlabel <- getURL("https://s3.amazonaws.com/drivendata/data/44/public/dengue_labels_train.csv")
trlabel <- read.csv(text = trlabel)
trfinal <- merge(tr, trlabel, by=c("city", "year", "weekofyear"))
train <- trfinal
str(trfinal)
'data.frame': 1456 obs. of 24 variables:
$ city : Factor w/ 2 levels "iq","sj": 1 1 1 1 1 1 1 1 1 1 ...
$ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
$ weekofyear : int 26 27 28 29 30 31 32 33 34 35 ...
$ ndvi_ne : num 0.193 0.217 0.177 0.228 0.329 ...
$ ndvi_nw : num 0.132 0.276 0.173 0.145 0.322 ...
$ ndvi_se : num 0.341 0.289 0.204 0.254 0.254 ...
$ ndvi_sw : num 0.247 0.242 0.128 0.2 0.361 ...
$ precipitation_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
$ reanalysis_air_temp_k : num 297 297 296 295 296 ...
$ reanalysis_avg_temp_k : num 298 298 297 296 298 ...
$ reanalysis_dew_point_temp_k : num 295 295 296 293 294 ...
$ reanalysis_max_air_temp_k : num 307 307 304 304 307 ...
$ reanalysis_min_air_temp_k : num 293 291 293 289 292 ...
$ reanalysis_precip_amt_kg_per_m2 : num 43.2 46 64.8 24 31.8 ...
$ reanalysis_relative_humidity_percent : num 92.4 93.6 95.8 87.2 88.2 ...
$ reanalysis_sat_precip_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
$ reanalysis_specific_humidity_g_per_kg: num 16.7 16.9 17.1 14.4 15.4 ...
$ reanalysis_tdtr_k : num 8.93 10.31 7.39 9.11 9.5 ...
$ station_avg_temp_c : num 26.4 26.9 26.8 25.8 26.6 ...
$ station_diur_temp_rng_c : num 10.8 11.6 11.5 10.5 11.5 ...
$ station_max_temp_c : num 32.5 34 33 31.5 33.3 32 34 33 34 34 ...
$ station_min_temp_c : num 20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
$ station_precip_mm : num 3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
$ total_cases : int 0 0 0 0 0 0 0 0 0 0 ...
train$city <- as.factor(train$city)
features <- c("city","year","weekofyear","ndvi_ne","ndvi_nw","ndvi_se",
"ndvi_sw", "precipitation_amt_mm", "reanalysis_air_temp_k", "reanalysis_avg_temp_k","reanalysis_dew_point_temp_k", "reanalysis_max_air_temp_k", "reanalysis_min_air_temp_k", "reanalysis_precip_amt_kg_per_m2", "reanalysis_relative_humidity_percent", "reanalysis_sat_precip_amt_mm",
"reanalysis_specific_humidity_g_per_kg", "reanalysis_tdtr_k", "station_avg_temp_c", "station_diur_temp_rng_c", "station_max_temp_c" , "station_min_temp_c", "station_precip_mm","total_cases")
train <- train[, features]
dummy.vars <- dummyVars(~ ., data = train[, -c(1:3,24)])
train.dummy <- predict(dummy.vars, train[, -c(1:3,24)])
Imputation of missing values: try to recover all missing values in our dataset. (1) Data preparation (preProcess) for regression trees bagging modeling (bagImpute)
pre.process <- preProcess(train.dummy, method = "bagImpute")
imputed.data <- predict(pre.process, train.dummy)
train$ndvi_ne <- imputed.data[,1]
train$ndvi_nw <- imputed.data[,2]
train$ndvi_se <- imputed.data[,3]
train$ndvi_sw <- imputed.data[,4]
train$precipitation_amt_mm <- imputed.data[,5]
train$reanalysis_air_temp_k <- imputed.data[, 6]
train$reanalysis_avg_temp_k <- imputed.data[,7]
train$reanalysis_dew_point_temp_k <- imputed.data[,8]
train$reanalysis_max_air_temp_k <- imputed.data[,9]
train$reanalysis_min_air_temp_k <- imputed.data[,10]
train$reanalysis_precip_amt_kg_per_m2 <- imputed.data[,11]
train$reanalysis_relative_humidity_percent <- imputed.data[,12]
train$reanalysis_sat_precip_amt_mm <- imputed.data[,13]
train$reanalysis_specific_humidity_g_per_kg <- imputed.data[,14]
train$reanalysis_tdtr_k <- imputed.data[,15]
train$station_avg_temp_c <- imputed.data[,16]
train$station_diur_temp_rng_c <- imputed.data[,17]
train$station_max_temp_c <- imputed.data[,18]
train$station_min_temp_c <- imputed.data[,19]
train$station_precip_mm <- imputed.data[,20]
anyNA(train)
[1] FALSE
set.seed(54321)
indexes <- createDataPartition(train$total_cases, times = 1, p = .8, list = FALSE)
deng.train <- train[indexes,]
deng.test <- train[-indexes,]
dim(deng.test)
[1] 288 24
dim(deng.train)
[1] 1168 24
train.control <- trainControl(method = "repeatedcv",
number = 5,
repeats = 5,
search = "grid")
tune.grid <- expand.grid(eta = c(0.05, 0.06, 0.1),
nrounds = c(30, 70),
max_depth = 4:8,
min_child_weight = c(2.0, 2.25, 2.5, 10),
colsample_bytree = c(0.3, 0.4, 0.5),
gamma = c(0.01, 0.001),
subsample = 1)
cl <- makeCluster(3, type = "SOCK")
registerDoSNOW(cl)
train2 = train[, c(1, 3, 4, 5, 6, 7, 11, 13, 17, 18, 24)]
caret.cv <- train(total_cases ~ .,
data = train2,
method = "xgbTree",
tuneGrid = tune.grid,
trControl = train.control)
stopCluster(cl)
preds <- predict(caret.cv, deng.test)
totalpredcasestrtest <- round(preds)
par(mfrow=c(2,1))
plot(totalpredcasestrtest)
plot(deng.test$total_cases)
actual = deng.test$total_cases
predicted = totalpredcasestrtest
mae <- function(error)
{
mean(abs(error))
}
error <- (actual-predicted)
mae(error)
[1] 5.375
testset <- getURL("https://s3.amazonaws.com/drivendata/data/44/public/dengue_features_test.csv")
testset <- read.csv(text=testset)
ts <- testset[, -c(4)]
names(ts)
[1] "city"
[2] "year"
[3] "weekofyear"
[4] "ndvi_ne"
[5] "ndvi_nw"
[6] "ndvi_se"
[7] "ndvi_sw"
[8] "precipitation_amt_mm"
[9] "reanalysis_air_temp_k"
[10] "reanalysis_avg_temp_k"
[11] "reanalysis_dew_point_temp_k"
[12] "reanalysis_max_air_temp_k"
[13] "reanalysis_min_air_temp_k"
[14] "reanalysis_precip_amt_kg_per_m2"
[15] "reanalysis_relative_humidity_percent"
[16] "reanalysis_sat_precip_amt_mm"
[17] "reanalysis_specific_humidity_g_per_kg"
[18] "reanalysis_tdtr_k"
[19] "station_avg_temp_c"
[20] "station_diur_temp_rng_c"
[21] "station_max_temp_c"
[22] "station_min_temp_c"
[23] "station_precip_mm"
ts$total_cases <- NA
ts <- ts[, features]
ts$city <- as.factor(ts$city)
ts$weekofyear <- as.numeric(ts$weekofyear)
tsdummy.vars <- dummyVars(~ ., data = ts[, -c(1:3,24)])
ts.dummy <- predict(tsdummy.vars, ts[, -c(1:3,24)])
tspre.process <- preProcess(ts.dummy, method = "bagImpute")
tsimputed.data <- predict(tspre.process, ts.dummy)
ts$ndvi_ne <- tsimputed.data[,1]
ts$ndvi_nw <- tsimputed.data[,2]
ts$ndvi_se <- tsimputed.data[,3]
ts$ndvi_sw <- tsimputed.data[,4]
ts$precipitation_amt_mm <- tsimputed.data[,5]
ts$reanalysis_air_temp_k <- tsimputed.data[, 6]
ts$reanalysis_avg_temp_k <- tsimputed.data[,7]
ts$reanalysis_dew_point_temp_k <- tsimputed.data[,8]
ts$reanalysis_max_air_temp_k <- tsimputed.data[,9]
ts$reanalysis_min_air_temp_k <- tsimputed.data[,10]
ts$reanalysis_precip_amt_kg_per_m2 <- tsimputed.data[,11]
ts$reanalysis_relative_humidity_percent <- tsimputed.data[,12]
ts$reanalysis_sat_precip_amt_mm <- tsimputed.data[,13]
ts$reanalysis_specific_humidity_g_per_kg <- tsimputed.data[,14]
ts$reanalysis_tdtr_k <- tsimputed.data[,15]
ts$station_avg_temp_c <- tsimputed.data[,16]
ts$station_diur_temp_rng_c <- tsimputed.data[,17]
ts$station_max_temp_c <- tsimputed.data[,18]
ts$station_min_temp_c <- tsimputed.data[,19]
ts$station_precip_mm <- tsimputed.data[,20]
ts$total_cases <- round(predict(caret.cv, ts))
par(mfrow=c(1,1))
plot(ts$total_cases)
Submitformat <- getURL("https://s3.amazonaws.com/drivendata/data/44/public/submission_format.csv")
submitformat2 <- read.csv(text=Submitformat)
submitformat2$total_cases<- ts$total_cases
write.csv(submitformat2, "G://STUDY//submit031920xgb_send.csv", row.names = FALSE)