你能预测登革热的当地流行病吗?
登革热是一种蚊子传播的疾病,发生在世界的热带和亚热带地区。在轻微的情况下,症状类似于流感:发烧,皮疹,肌肉和关节疼痛。在严重的情况下,登革热可导致严重出血,低血压,甚至死亡。
因为它是由蚊子携带的,登革热的传播动力学与气候变量如温度和降水有关。尽管与气候的关系很复杂,但越来越多的科学家认为,气候变化可能会产生分布式变化,这将对全球公共卫生产生重大影响。
近年来,登革热一直在蔓延。从历史上看,这种疾病在东南亚和太平洋岛屿中最为普遍。如今,拉丁美洲每年发生近5亿例病例:
用各种美国联邦政府收集环境数据的机构,由美国疾病控制和预防的美国国家海洋和大气管理局在美国商务部 -你能预测的登革热病例数,每星期报道在圣胡安,波多黎各Rico和伊基托斯,秘鲁?
这是一个中级练习比赛。您的任务是根据描述温度,降水,植被等变化的环境变量,预测每周(每个地点)的登革热病例数。
了解气候和登革热动态之间的关系可以改善研究计划和资源分配,以帮助对抗威胁生命的大流行病。
Train <- read.csv('https://s3.amazonaws.com/drivendata/data/44/public/dengue_features_train.csv')
TrainLabel <- read.csv('https://s3.amazonaws.com/drivendata/data/44/public/dengue_labels_train.csv')
Test <- read.csv('https://s3.amazonaws.com/drivendata/data/44/public/dengue_features_test.csv')
TestLabel <- read.csv('https://s3.amazonaws.com/drivendata/data/44/public/submission_format.csv')
evalue <- function(pre,label){
ME <- mean(label-pre,na.rm = TRUE)
RMSE <- sqrt(mean((pre-label)^2,na.rm = TRUE))
MAE <- mean(abs(pre-label),na.rm = TRUE)
MAPE <- mean(abs((label-pre)/(label+0.01)),na.rm = TRUE)*100
MPE <- mean(((label-pre)/(label+0.01)),na.rm = TRUE)*100
MASE <- mean(abs(label-pre)^2,na.rm = TRUE)
return(c(ME,RMSE,MAE,MAPE,MPE,MASE))
}
require(tidyverse)
## Loading required package: tidyverse
## ─ Attaching packages ──────────────────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 3.1.0.9000 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.0 ✔ forcats 0.3.0
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## ─ Conflicts ────────────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Train <- Train %>% left_join(TrainLabel,by=c('city','year','weekofyear'))
Test <- Test %>% left_join(TestLabel,by=c('city','year','weekofyear'))
l1 <- lm(total_cases~.,data = Train[,-c(1,2,3,4)])
summary(l1)
##
## Call:
## lm(formula = total_cases ~ ., data = Train[, -c(1, 2, 3, 4)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.794 -14.113 -4.401 5.464 288.051
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 5.357e+03 2.731e+03 1.962
## ndvi_ne -5.654e+00 1.200e+01 -0.471
## ndvi_nw 2.978e+01 1.343e+01 2.218
## ndvi_se -1.430e+01 1.968e+01 -0.727
## ndvi_sw -5.016e+00 1.849e+01 -0.271
## precipitation_amt_mm -3.554e-04 2.387e-02 -0.015
## reanalysis_air_temp_k -9.484e-01 1.082e+01 -0.088
## reanalysis_avg_temp_k -1.398e+01 5.112e+00 -2.736
## reanalysis_dew_point_temp_k -4.910e+00 1.184e+01 -0.415
## reanalysis_max_air_temp_k 1.563e+00 1.118e+00 1.398
## reanalysis_min_air_temp_k 1.058e-01 1.576e+00 0.067
## reanalysis_precip_amt_kg_per_m2 9.228e-03 2.495e-02 0.370
## reanalysis_relative_humidity_percent -3.870e+00 2.133e+00 -1.814
## reanalysis_sat_precip_amt_mm NA NA NA
## reanalysis_specific_humidity_g_per_kg 2.530e+01 9.423e+00 2.685
## reanalysis_tdtr_k -9.256e-01 1.512e+00 -0.612
## station_avg_temp_c -2.147e-02 1.927e+00 -0.011
## station_diur_temp_rng_c 4.588e-02 1.171e+00 0.039
## station_max_temp_c -1.086e+00 1.086e+00 -1.000
## station_min_temp_c 4.946e-01 1.313e+00 0.377
## station_precip_mm 6.016e-04 1.978e-02 0.030
## Pr(>|t|)
## (Intercept) 0.05000 *
## ndvi_ne 0.63754
## ndvi_nw 0.02675 *
## ndvi_se 0.46767
## ndvi_sw 0.78625
## precipitation_amt_mm 0.98812
## reanalysis_air_temp_k 0.93014
## reanalysis_avg_temp_k 0.00632 **
## reanalysis_dew_point_temp_k 0.67842
## reanalysis_max_air_temp_k 0.16239
## reanalysis_min_air_temp_k 0.94652
## reanalysis_precip_amt_kg_per_m2 0.71157
## reanalysis_relative_humidity_percent 0.06991 .
## reanalysis_sat_precip_amt_mm NA
## reanalysis_specific_humidity_g_per_kg 0.00735 **
## reanalysis_tdtr_k 0.54064
## station_avg_temp_c 0.99112
## station_diur_temp_rng_c 0.96877
## station_max_temp_c 0.31730
## station_min_temp_c 0.70647
## station_precip_mm 0.97574
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.84 on 1179 degrees of freedom
## (257 observations deleted due to missingness)
## Multiple R-squared: 0.1988, Adjusted R-squared: 0.1859
## F-statistic: 15.4 on 19 and 1179 DF, p-value: < 2.2e-16
trainpre1 <- predict(l1,Train[,-c(1,2,3,4)])
## Warning in predict.lm(l1, Train[, -c(1, 2, 3, 4)]): prediction from a rank-
## deficient fit may be misleading
testpre1 <- predict(l1,Test[,-c(1,2,3,4)])
## Warning in predict.lm(l1, Test[, -c(1, 2, 3, 4)]): prediction from a rank-
## deficient fit may be misleading
Train.errer <- evalue(pre = trainpre1,label = Train$total_cases)
test.errer <- evalue(pre = testpre1,label = Test$total_cases)
errer1 <- data.frame(t(data.frame(Train.errer,test.errer)))
names(errer1) <- c('ME','RMSE','MAE','MAPE','MPE','MASE')
errer1
## ME RMSE MAE MAPE MPE
## Train.errer -5.804122e-12 27.60811 16.34844 5452.064 -2955.885
## test.errer -2.166551e+01 25.91532 22.26630 222663.028 -216655.123
## MASE
## Train.errer 762.2077
## test.errer 671.6037
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
l2 <- randomForest(total_cases~.,data = na.omit(Train[,-4]))
summary(l2)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 1199 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 1199 -none- numeric
## importance 23 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 1199 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
l2
##
## Call:
## randomForest(formula = total_cases ~ ., data = na.omit(Train[, -4]))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 7
##
## Mean of squared residuals: 495.3198
## % Var explained: 47.93
trainpre2 <- predict(l2,Train[,-4])
testpre2 <- predict(l2,Test[,-4])
Train.errer2 <- evalue(pre = trainpre2,label = Train$total_cases)
test.errer2 <- evalue(pre = testpre2,label = Test$total_cases)
errer2 <- data.frame(t(data.frame(Train.errer2,test.errer2)))
names(errer2) <- c('ME','RMSE','MAE','MAPE','MPE','MASE')
errer2
## ME RMSE MAE MAPE MPE
## Train.errer2 -0.4262795 9.784514 5.150878 1387.61 -1379.128
## test.errer2 -20.6030307 27.878706 20.603031 206030.31 -206030.307
## MASE
## Train.errer2 95.7367
## test.errer2 777.2222
require(mlr)
## Loading required package: mlr
## Loading required package: ParamHelpers
## Warning: replacing previous import 'BBmisc::isFALSE' by
## 'backports::isFALSE' when loading 'mlr'
tasktrain <- makeRegrTask(data = na.omit(Train[,-c(1,4)]),target = "total_cases")
tasktest <- makeRegrTask(data = Test[,-c(1,4)],target = "total_cases")
lnr <- makeLearner(cl = "regr.xgboost",predict.type = 'response')
mdl <- mlr::train(lnr,tasktrain)
trainpre3 <- predict(mdl,tasktrain)
testpre3 <- predict(mdl,tasktest)
Train.errer3 <- evalue(pre = trainpre3$data$response,label = na.omit(Train)$total_cases)
test.errer3 <- evalue(pre = testpre3$data$response,label = Test$total_cases)
errer3 <- data.frame(t(data.frame(Train.errer3,test.errer3)))
names(errer3) <- c('ME','RMSE','MAE','MAPE','MPE','MASE')
errer3
## ME RMSE MAE MAPE MPE MASE
## Train.errer3 14.845044 29.156638 15.319273 1415.079 -1308.737 850.1096
## test.errer3 -5.547147 8.068488 5.547147 55471.473 -55471.473 65.1005