DengAI

你能预测登革热的当地流行病吗?

登革热是一种蚊子传播的疾病,发生在世界的热带和亚热带地区。在轻微的情况下,症状类似于流感:发烧,皮疹,肌肉和关节疼痛。在严重的情况下,登革热可导致严重出血,低血压,甚至死亡。

因为它是由蚊子携带的,登革热的传播动力学与气候变量如温度和降水有关。尽管与气候的关系很复杂,但越来越多的科学家认为,气候变化可能会产生分布式变化,这将对全球公共卫生产生重大影响。

近年来,登革热一直在蔓延。从历史上看,这种疾病在东南亚和太平洋岛屿中最为普遍。如今,拉丁美洲每年发生近5亿例病例:

用各种美国联邦政府收集环境数据的机构,由美国疾病控制和预防的美国国家海洋和大气管理局在美国商务部 -你能预测的登革热病例数,每星期报道在圣胡安,波多黎各Rico和伊基托斯,秘鲁?

这是一个中级练习比赛。您的任务是根据描述温度,降水,植被等变化的环境变量,预测每周(每个地点)的登革热病例数。

了解气候和登革热动态之间的关系可以改善研究计划和资源分配,以帮助对抗威胁生命的大流行病。

读取数据

Train <- read.csv('https://s3.amazonaws.com/drivendata/data/44/public/dengue_features_train.csv')

TrainLabel <- read.csv('https://s3.amazonaws.com/drivendata/data/44/public/dengue_labels_train.csv')



Test <- read.csv('https://s3.amazonaws.com/drivendata/data/44/public/dengue_features_test.csv')
TestLabel <- read.csv('https://s3.amazonaws.com/drivendata/data/44/public/submission_format.csv')

evalue <- function(pre,label){
  ME <- mean(label-pre,na.rm = TRUE)
  RMSE <- sqrt(mean((pre-label)^2,na.rm = TRUE))
  MAE <- mean(abs(pre-label),na.rm = TRUE)
  MAPE <- mean(abs((label-pre)/(label+0.01)),na.rm = TRUE)*100
  MPE <-  mean(((label-pre)/(label+0.01)),na.rm = TRUE)*100
  MASE <- mean(abs(label-pre)^2,na.rm = TRUE)
  
  return(c(ME,RMSE,MAE,MAPE,MPE,MASE))
  
}

多元回归方程

require(tidyverse)
## Loading required package: tidyverse
## ─ Attaching packages ──────────────────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 3.1.0.9000     ✔ purrr   0.2.5     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.8     
## ✔ tidyr   0.8.2          ✔ stringr 1.3.1     
## ✔ readr   1.3.0          ✔ forcats 0.3.0
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## ─ Conflicts ────────────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
Train <- Train %>% left_join(TrainLabel,by=c('city','year','weekofyear'))
Test <- Test %>% left_join(TestLabel,by=c('city','year','weekofyear'))


l1 <- lm(total_cases~.,data = Train[,-c(1,2,3,4)])
summary(l1)
## 
## Call:
## lm(formula = total_cases ~ ., data = Train[, -c(1, 2, 3, 4)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.794 -14.113  -4.401   5.464 288.051 
## 
## Coefficients: (1 not defined because of singularities)
##                                         Estimate Std. Error t value
## (Intercept)                            5.357e+03  2.731e+03   1.962
## ndvi_ne                               -5.654e+00  1.200e+01  -0.471
## ndvi_nw                                2.978e+01  1.343e+01   2.218
## ndvi_se                               -1.430e+01  1.968e+01  -0.727
## ndvi_sw                               -5.016e+00  1.849e+01  -0.271
## precipitation_amt_mm                  -3.554e-04  2.387e-02  -0.015
## reanalysis_air_temp_k                 -9.484e-01  1.082e+01  -0.088
## reanalysis_avg_temp_k                 -1.398e+01  5.112e+00  -2.736
## reanalysis_dew_point_temp_k           -4.910e+00  1.184e+01  -0.415
## reanalysis_max_air_temp_k              1.563e+00  1.118e+00   1.398
## reanalysis_min_air_temp_k              1.058e-01  1.576e+00   0.067
## reanalysis_precip_amt_kg_per_m2        9.228e-03  2.495e-02   0.370
## reanalysis_relative_humidity_percent  -3.870e+00  2.133e+00  -1.814
## reanalysis_sat_precip_amt_mm                  NA         NA      NA
## reanalysis_specific_humidity_g_per_kg  2.530e+01  9.423e+00   2.685
## reanalysis_tdtr_k                     -9.256e-01  1.512e+00  -0.612
## station_avg_temp_c                    -2.147e-02  1.927e+00  -0.011
## station_diur_temp_rng_c                4.588e-02  1.171e+00   0.039
## station_max_temp_c                    -1.086e+00  1.086e+00  -1.000
## station_min_temp_c                     4.946e-01  1.313e+00   0.377
## station_precip_mm                      6.016e-04  1.978e-02   0.030
##                                       Pr(>|t|)   
## (Intercept)                            0.05000 * 
## ndvi_ne                                0.63754   
## ndvi_nw                                0.02675 * 
## ndvi_se                                0.46767   
## ndvi_sw                                0.78625   
## precipitation_amt_mm                   0.98812   
## reanalysis_air_temp_k                  0.93014   
## reanalysis_avg_temp_k                  0.00632 **
## reanalysis_dew_point_temp_k            0.67842   
## reanalysis_max_air_temp_k              0.16239   
## reanalysis_min_air_temp_k              0.94652   
## reanalysis_precip_amt_kg_per_m2        0.71157   
## reanalysis_relative_humidity_percent   0.06991 . 
## reanalysis_sat_precip_amt_mm                NA   
## reanalysis_specific_humidity_g_per_kg  0.00735 **
## reanalysis_tdtr_k                      0.54064   
## station_avg_temp_c                     0.99112   
## station_diur_temp_rng_c                0.96877   
## station_max_temp_c                     0.31730   
## station_min_temp_c                     0.70647   
## station_precip_mm                      0.97574   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.84 on 1179 degrees of freedom
##   (257 observations deleted due to missingness)
## Multiple R-squared:  0.1988, Adjusted R-squared:  0.1859 
## F-statistic:  15.4 on 19 and 1179 DF,  p-value: < 2.2e-16
trainpre1 <- predict(l1,Train[,-c(1,2,3,4)])
## Warning in predict.lm(l1, Train[, -c(1, 2, 3, 4)]): prediction from a rank-
## deficient fit may be misleading
testpre1 <- predict(l1,Test[,-c(1,2,3,4)])
## Warning in predict.lm(l1, Test[, -c(1, 2, 3, 4)]): prediction from a rank-
## deficient fit may be misleading
Train.errer <-  evalue(pre = trainpre1,label = Train$total_cases)

test.errer <-  evalue(pre = testpre1,label = Test$total_cases)


errer1 <- data.frame(t(data.frame(Train.errer,test.errer)))
names(errer1) <- c('ME','RMSE','MAE','MAPE','MPE','MASE')
errer1
##                        ME     RMSE      MAE       MAPE         MPE
## Train.errer -5.804122e-12 27.60811 16.34844   5452.064   -2955.885
## test.errer  -2.166551e+01 25.91532 22.26630 222663.028 -216655.123
##                 MASE
## Train.errer 762.2077
## test.errer  671.6037

随机森林回归 ——————————————————————

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
l2 <- randomForest(total_cases~.,data = na.omit(Train[,-4]))
summary(l2)
##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted       1199   -none- numeric  
## mse              500   -none- numeric  
## rsq              500   -none- numeric  
## oob.times       1199   -none- numeric  
## importance        23   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               1199   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
l2
## 
## Call:
##  randomForest(formula = total_cases ~ ., data = na.omit(Train[,      -4])) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##           Mean of squared residuals: 495.3198
##                     % Var explained: 47.93
trainpre2 <- predict(l2,Train[,-4])
testpre2 <- predict(l2,Test[,-4])


Train.errer2 <-  evalue(pre = trainpre2,label = Train$total_cases)

test.errer2 <-  evalue(pre = testpre2,label = Test$total_cases)

errer2 <- data.frame(t(data.frame(Train.errer2,test.errer2)))
names(errer2) <- c('ME','RMSE','MAE','MAPE','MPE','MASE')
errer2
##                       ME      RMSE       MAE      MAPE         MPE
## Train.errer2  -0.4262795  9.784514  5.150878   1387.61   -1379.128
## test.errer2  -20.6030307 27.878706 20.603031 206030.31 -206030.307
##                  MASE
## Train.errer2  95.7367
## test.errer2  777.2222

xgboost

require(mlr)
## Loading required package: mlr
## Loading required package: ParamHelpers
## Warning: replacing previous import 'BBmisc::isFALSE' by
## 'backports::isFALSE' when loading 'mlr'
tasktrain <- makeRegrTask(data = na.omit(Train[,-c(1,4)]),target = "total_cases")
tasktest <- makeRegrTask(data = Test[,-c(1,4)],target = "total_cases")

lnr <- makeLearner(cl = "regr.xgboost",predict.type = 'response')

mdl <- mlr::train(lnr,tasktrain)

trainpre3 <- predict(mdl,tasktrain)
testpre3 <- predict(mdl,tasktest)

 
Train.errer3 <-  evalue(pre = trainpre3$data$response,label = na.omit(Train)$total_cases)

test.errer3 <-  evalue(pre = testpre3$data$response,label = Test$total_cases)

errer3 <- data.frame(t(data.frame(Train.errer3,test.errer3)))
names(errer3) <- c('ME','RMSE','MAE','MAPE','MPE','MASE')
errer3
##                     ME      RMSE       MAE      MAPE        MPE     MASE
## Train.errer3 14.845044 29.156638 15.319273  1415.079  -1308.737 850.1096
## test.errer3  -5.547147  8.068488  5.547147 55471.473 -55471.473  65.1005