Demo20180718

library(outbreaks)

## Warning: package 'outbreaks' was built under R version 3.4.4

data("fluH7N9_china_2013")

class(fluH7N9_china_2013)

## [1] "data.frame"

View(fluH7N9_china_2013)
str(fluH7N9_china_2013)

## 'data.frame':    136 obs. of  8 variables:
##  $ case_id                : Factor w/ 136 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ date_of_onset          : Date, format: "2013-02-19" "2013-02-27" ...
##  $ date_of_hospitalisation: Date, format: NA "2013-03-03" ...
##  $ date_of_outcome        : Date, format: "2013-03-04" "2013-03-10" ...
##  $ outcome                : Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
##  $ gender                 : Factor w/ 2 levels "f","m": 2 2 1 1 1 1 2 2 2 2 ...
##  $ age                    : Factor w/ 61 levels "?","15","2","21",..: 58 7 11 18 20 9 54 14 39 20 ...
##  $ province               : Factor w/ 13 levels "Anhui","Beijing",..: 11 11 1 8 8 8 8 13 13 11 ...

head(fluH7N9_china_2013)

##   case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1       1    2013-02-19                    <NA>      2013-03-04   Death
## 2       2    2013-02-27              2013-03-03      2013-03-10   Death
## 3       3    2013-03-09              2013-03-19      2013-04-09   Death
## 4       4    2013-03-19              2013-03-27            <NA>    <NA>
## 5       5    2013-03-19              2013-03-30      2013-05-15 Recover
## 6       6    2013-03-21              2013-03-28      2013-04-26   Death
##   gender age province
## 1      m  87 Shanghai
## 2      m  27 Shanghai
## 3      f  35    Anhui
## 4      f  45  Jiangsu
## 5      f  48  Jiangsu
## 6      f  32  Jiangsu

fluH7N9_china_2013$age[fluH7N9_china_2013$age == '?'] <- NA
fluH7N9_china_2013$age <- as.numeric(fluH7N9_china_2013$age)

fluH7N9_china_2013$age

##   [1] 58  7 11 18 20  9 54 14 39 20 36 24 39 15 34 51 46 38 31 27 39 56  5
##  [24] 36 35 49 23 51 48 53 43 46 37 46 54 40  8 28 38 46 26 25 57 42 28 49
##  [47] 44 37 14 10 37 36 35 47 51 45 26 50 22  6 33 40 33 28  4 44 28 29 35
##  [70] 30 44 19 41 NA NA 27  3 59 13 46 57 16 14  6 52 26 41 15 26 17 20 38
##  [93] 28 11 13 17 47 48 40 30 51 53 40 26  9 12 61 55 35 25 28 41 33 22 14
## [116] 37 48 21 12 33 36 14 26 52  8 52 15 30 41 41 60 51 32  2 34 23

fluH7N9_china_2013$case_id <- paste('case',fluH7N9_china_2013$case_id, sep = '_')


library(tidyr)

## Warning: package 'tidyr' was built under R version 3.4.4

fluH7N9_china_2013_gather  <- fluH7N9_china_2013 %>%
    gather(Group, Date, date_of_onset:date_of_outcome)

fluH7N9_china_2013[fluH7N9_china_2013$case_id == 'case_1',]

##   case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1  case_1    2013-02-19                    <NA>      2013-03-04   Death
##   gender age province
## 1      m  58 Shanghai

fluH7N9_china_2013_gather[fluH7N9_china_2013$case_id == 'case_1',]

##     case_id outcome gender age province                   Group       Date
## 1    case_1   Death      m  58 Shanghai           date_of_onset 2013-02-19
## 137  case_1   Death      m  58 Shanghai date_of_hospitalisation       <NA>
## 273  case_1   Death      m  58 Shanghai         date_of_outcome 2013-03-04

nrow(fluH7N9_china_2013)

## [1] 136

nrow(fluH7N9_china_2013_gather)

## [1] 408

fluH7N9_china_2013_gather$Group <- factor(fluH7N9_china_2013_gather$Group, levels =
c("date_of_onset", "date_of_hospitalisation", "date_of_outcome"))

str(fluH7N9_china_2013_gather)

## 'data.frame':    408 obs. of  7 variables:
##  $ case_id : chr  "case_1" "case_2" "case_3" "case_4" ...
##  $ outcome : Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
##  $ gender  : Factor w/ 2 levels "f","m": 2 2 1 1 1 1 2 2 2 2 ...
##  $ age     : num  58 7 11 18 20 9 54 14 39 20 ...
##  $ province: Factor w/ 13 levels "Anhui","Beijing",..: 11 11 1 8 8 8 8 13 13 11 ...
##  $ Group   : Factor w/ 3 levels "date_of_onset",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date    : Date, format: "2013-02-19" "2013-02-27" ...

library(plyr)

## Warning: package 'plyr' was built under R version 3.4.4

fluH7N9_china_2013_gather$Group <-mapvalues(fluH7N9_china_2013_gather$Group, from =
c("date_of_onset", "date_of_hospitalisation", "date_of_outcome"),
to = c("Date of onset", "Date of hospitalisation", "Date of outcome"))


levels(fluH7N9_china_2013_gather$province)

##  [1] "Anhui"     "Beijing"   "Fujian"    "Guangdong" "Hebei"    
##  [6] "Henan"     "Hunan"     "Jiangsu"   "Jiangxi"   "Shandong" 
## [11] "Shanghai"  "Taiwan"    "Zhejiang"

fluH7N9_china_2013_gather$province <- mapvalues(fluH7N9_china_2013_gather$province,from =
c("Anhui", "Beijing", "Fujian", "Guangdong", "Hebei", "Henan", "Hunan", "Jiangxi", "Shandong", "Taiwan"),
to = rep("Other", 10))

levels(fluH7N9_china_2013_gather$province)

## [1] "Other"    "Jiangsu"  "Shanghai" "Zhejiang"

levels(fluH7N9_china_2013_gather$gender) <- c(levels(fluH7N9_china_2013_gather$gender),
"unknown")

fluH7N9_china_2013_gather$gender[is.na(fluH7N9_china_2013_gather$gender)] <- "unknown"

fluH7N9_china_2013_gather$province <- factor(fluH7N9_china_2013_gather$province, levels =
c("Jiangsu", "Shanghai", "Zhejiang", "Other"))

str(fluH7N9_china_2013_gather)

## 'data.frame':    408 obs. of  7 variables:
##  $ case_id : chr  "case_1" "case_2" "case_3" "case_4" ...
##  $ outcome : Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
##  $ gender  : Factor w/ 3 levels "f","m","unknown": 2 2 1 1 1 1 2 2 2 2 ...
##  $ age     : num  58 7 11 18 20 9 54 14 39 20 ...
##  $ province: Factor w/ 4 levels "Jiangsu","Shanghai",..: 2 2 4 1 1 1 1 3 3 2 ...
##  $ Group   : Factor w/ 3 levels "Date of onset",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date    : Date, format: "2013-02-19" "2013-02-27" ...

資料視覺化

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.4

ggplot(data = fluH7N9_china_2013_gather, aes(x = Date, y = age, fill = outcome)) +
stat_density2d(aes(alpha = ..level..), geom = "polygon") +
geom_jitter(aes(color = outcome, shape = gender), size = 1.5) +
geom_rug(aes(color = outcome)) +
labs(
fill = "Outcome",
color = "Outcome",
alpha = "Level",
shape = "Gender",
x = "Date in 2013",
y = "Age",
title = "2013 Influenza A H7N9 cases in China"
)

## Warning: Removed 149 rows containing non-finite values (stat_density2d).

## Warning: Removed 149 rows containing missing values (geom_point).

ggplot(data = fluH7N9_china_2013_gather, aes(x = Date, y = age, fill = outcome)) +
stat_density2d(aes(alpha = ..level..), geom = "polygon") +
geom_jitter(aes(color = outcome, shape = gender), size = 1.5) +
geom_rug(aes(color = outcome)) +
labs(
fill = "Outcome",
color = "Outcome",
alpha = "Level",
shape = "Gender",
x = "Date in 2013",
y = "Age",
title = "2013 Influenza A H7N9 cases in China"
) +
facet_grid(Group ~ province) +
scale_shape_manual(values = c(15, 16, 17)) +
scale_color_brewer(palette="Set1", na.value = "grey50") +
scale_fill_brewer(palette="Set1")

## Warning: Removed 149 rows containing non-finite values (stat_density2d).

## Warning: Removed 149 rows containing missing values (geom_point).

table(fluH7N9_china_2013$outcome, fluH7N9_china_2013$gender)

##          
##            f  m
##   Death    9 22
##   Recover 12 34

ggplot(data = fluH7N9_china_2013_gather, aes(x = Date, y = age, color = outcome)) +
geom_point(aes(color = outcome, shape = gender), size = 1.5, alpha = 0.6) +
geom_path(aes(group = case_id)) +
facet_wrap( ~ province, ncol = 2) +
scale_shape_manual(values = c(15, 16, 17)) +
scale_color_brewer(palette="Set1", na.value = "grey50") +
scale_fill_brewer(palette="Set1") +
labs(
color = "Outcome",
shape = "Gender",
x = "Date in 2013",
y = "Age",
title = "Time from onset of flu to outcome"
)

## Warning: Removed 149 rows containing missing values (geom_point).

## Warning: Removed 98 rows containing missing values (geom_path).

fluH7N9_china_2013_gather_2 <- fluH7N9_china_2013_gather[, -4] %>% gather(group_2, value, gender:province)

## Warning: attributes are not identical across measure variables;
## they will be dropped

fluH7N9_china_2013_gather_2$value <-
mapvalues(fluH7N9_china_2013_gather_2$value, from = c("m", "f", "unknown","Other"), to = c("Male", "Female", "Unknown gender", "Other province"))

fluH7N9_china_2013_gather_2$value <-
factor(fluH7N9_china_2013_gather_2$value, levels = c("Female", "Male","Unknown gender", "Jiangsu", "Shanghai", "Zhejiang", "Other province"))

str(fluH7N9_china_2013_gather_2)

## 'data.frame':    816 obs. of  6 variables:
##  $ case_id: chr  "case_1" "case_2" "case_3" "case_4" ...
##  $ outcome: Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
##  $ Group  : Factor w/ 3 levels "Date of onset",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date   : Date, format: "2013-02-19" "2013-02-27" ...
##  $ group_2: chr  "gender" "gender" "gender" "gender" ...
##  $ value  : Factor w/ 7 levels "Female","Male",..: 2 2 1 1 1 1 2 2 2 2 ...

p1 <- ggplot(data = fluH7N9_china_2013_gather_2, aes(x = value, fill = outcome, color = outcome)) +
geom_bar(position = "dodge", alpha = 0.7, size = 1) +
scale_fill_brewer(palette="Set1", na.value = "grey50") +
scale_color_brewer(palette="Set1", na.value = "grey50") +
labs(
color = "",
fill = "",
x = "",
y = "Count",
title = "2013 Influenza A H7N9 cases in China")

p1

p2 <- ggplot(data = fluH7N9_china_2013_gather, aes(x = age, fill = outcome, color = outcome)) +
geom_density(alpha = 0.3, size = 1) +
geom_rug() +
scale_color_brewer(palette="Set1", na.value = "grey50") +
scale_fill_brewer(palette="Set1", na.value = "grey50") +
labs(
color = "",
fill = "",
x = "Age",
y = "Density",
title = "Age distribution of flu cases"
)

p2

## Warning: Removed 6 rows containing non-finite values (stat_density).

long to wide or wide to long

library(reshape2)

## Warning: package 'reshape2' was built under R version 3.4.4

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

data("airquality")
names(airquality) <- tolower(names(airquality))
head(airquality)

##   ozone solar.r wind temp month day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

aqm <- melt(airquality, id=c("month", "day"), na.rm=TRUE)
head(aqm)

##   month day variable value
## 1     5   1    ozone    41
## 2     5   2    ozone    36
## 3     5   3    ozone    12
## 4     5   4    ozone    18
## 6     5   6    ozone    28
## 7     5   7    ozone    23

newdata <- dcast(aqm, month ~ variable, mean, margins = c("month", "variable"))

抽取特徵

data(fluH7N9_china_2013)
dataset <- fluH7N9_china_2013

dataset $age[which(dataset$age == "?")] <- NA
dataset$age <- as.numeric(as.character(dataset$age))

dataset$case_id <- paste("case", dataset$case_id, sep = "_")

head(dataset)

##   case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1  case_1    2013-02-19                    <NA>      2013-03-04   Death
## 2  case_2    2013-02-27              2013-03-03      2013-03-10   Death
## 3  case_3    2013-03-09              2013-03-19      2013-04-09   Death
## 4  case_4    2013-03-19              2013-03-27            <NA>    <NA>
## 5  case_5    2013-03-19              2013-03-30      2013-05-15 Recover
## 6  case_6    2013-03-21              2013-03-28      2013-04-26   Death
##   gender age province
## 1      m  87 Shanghai
## 2      m  27 Shanghai
## 3      f  35    Anhui
## 4      f  45  Jiangsu
## 5      f  48  Jiangsu
## 6      f  32  Jiangsu

dataset$hospital <- as.factor(ifelse(is.na(dataset$date_of_hospitalisation), 0, 1))

dataset$gender_f <- as.factor(ifelse(dataset$gender == "f", 1, 0))


dataset$province_Jiangsu <- as.factor(ifelse(dataset$province == "Jiangsu", 1, 0))
dataset$province_Shanghai <- as.factor(ifelse(dataset$province == "Shanghai", 1, 0))
dataset$province_Zhejiang <- as.factor(ifelse(dataset$province == "Zhejiang", 1, 0))

dataset$province_other <- as.factor(ifelse(dataset$province == "Zhejiang" | dataset$province
== "Jiangsu" | dataset$province == "Shanghai", 0, 1))

head(dataset)

##   case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1  case_1    2013-02-19                    <NA>      2013-03-04   Death
## 2  case_2    2013-02-27              2013-03-03      2013-03-10   Death
## 3  case_3    2013-03-09              2013-03-19      2013-04-09   Death
## 4  case_4    2013-03-19              2013-03-27            <NA>    <NA>
## 5  case_5    2013-03-19              2013-03-30      2013-05-15 Recover
## 6  case_6    2013-03-21              2013-03-28      2013-04-26   Death
##   gender age province hospital gender_f province_Jiangsu province_Shanghai
## 1      m  87 Shanghai        0        0                0                 1
## 2      m  27 Shanghai        1        0                0                 1
## 3      f  35    Anhui        1        1                0                 0
## 4      f  45  Jiangsu        1        1                1                 0
## 5      f  48  Jiangsu        1        1                1                 0
## 6      f  32  Jiangsu        1        1                1                 0
##   province_Zhejiang province_other
## 1                 0              0
## 2                 0              0
## 3                 0              1
## 4                 0              0
## 5                 0              0
## 6                 0              0

#as.Date(as.character(dataset$date_of_outcome), format = "%Y-%m-%d") - as.Date(as.character(dataset$date_of_onset), format = "%Y-%m-%d")

dataset$days_onset_to_outcome <- as.numeric(as.character(gsub(" days", "",
as.Date(as.character(dataset$date_of_outcome), format = "%Y-%m-%d") -
as.Date(as.character(dataset$date_of_onset), format = "%Y-%m-%d"))))


dataset$days_onset_to_hospital <- as.numeric(as.character(gsub(" days", "",
as.Date(as.character(dataset$date_of_hospitalisation), format = "%Y-%m-%d") -
as.Date(as.character(dataset$date_of_onset), format = "%Y-%m-%d"))))

summary(dataset$date_of_onset)

##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "2013-02-19" "2013-03-31" "2013-04-08" "2013-04-07" "2013-04-14" 
##         Max.         NA's 
## "2013-07-27"         "10"

dataset$early_onset <- as.factor(ifelse(dataset$date_of_onset < summary(dataset$date_of_onset)[[3]], 1, 0))

dataset$early_outcome <- as.factor(ifelse(dataset$date_of_outcome < summary(dataset$date_of_outcome)[[3]], 1, 0))

#dataset


dataset <- dataset[,c('case_id','outcome'
, 'age', 'hospital','gender_f'
,'province_Jiangsu'
,'province_Shanghai'
,'province_Zhejiang'
,'province_other'
,'days_onset_to_outcome'
,'days_onset_to_hospital'
,'early_onset'
,'early_outcome')]

head(dataset)

##   case_id outcome age hospital gender_f province_Jiangsu province_Shanghai
## 1  case_1   Death  87        0        0                0                 1
## 2  case_2   Death  27        1        0                0                 1
## 3  case_3   Death  35        1        1                0                 0
## 4  case_4    <NA>  45        1        1                1                 0
## 5  case_5 Recover  48        1        1                1                 0
## 6  case_6   Death  32        1        1                1                 0
##   province_Zhejiang province_other days_onset_to_outcome
## 1                 0              0                    13
## 2                 0              0                    11
## 3                 0              1                    31
## 4                 0              0                    NA
## 5                 0              0                    57
## 6                 0              0                    36
##   days_onset_to_hospital early_onset early_outcome
## 1                     NA           1             1
## 2                      4           1             1
## 3                     10           1             1
## 4                      8           1          <NA>
## 5                     11           1             0
## 6                      7           1             1

sum(is.na(dataset))

## [1] 277

library(Amelia)

## Warning: package 'Amelia' was built under R version 3.4.4

## Loading required package: Rcpp

## Warning: package 'Rcpp' was built under R version 3.4.4

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

missmap(dataset)

library(mice)

## Warning: package 'mice' was built under R version 3.4.4

## Loading required package: lattice

## 
## Attaching package: 'mice'

## The following object is masked from 'package:tidyr':
## 
##     complete

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

dataset_impute <- mice(data = dataset[, -2], print = FALSE)

## Warning: Number of logged events: 151

#dataset_impute

missmap(complete(dataset_impute, "long"))

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#dataset[,c(1,2)]
#right_join(dataset[,c(1,2)], )
datasets_complete <- right_join(dataset[, c(1, 2)],
complete(dataset_impute, "long"),
by = "case_id") %>% select(-.id)

missmap(datasets_complete)

建立模型

train_index <- which(is.na(datasets_complete$outcome))

nrow(datasets_complete)

## [1] 680

datasets_complete$.imp           <- NULL
datasets_complete$province_other <- NULL
length(train_index)

## [1] 285

train_data <- datasets_complete[-train_index, ]
test_data  <- datasets_complete[train_index, -2]

head(train_data)

##   case_id outcome age hospital gender_f province_Jiangsu province_Shanghai
## 1  case_1   Death  87        0        0                0                 1
## 2  case_2   Death  27        1        0                0                 1
## 3  case_3   Death  35        1        1                0                 0
## 5  case_5 Recover  48        1        1                1                 0
## 6  case_6   Death  32        1        1                1                 0
## 7  case_7   Death  83        1        0                1                 0
##   province_Zhejiang days_onset_to_outcome days_onset_to_hospital
## 1                 0                    13                      4
## 2                 0                    11                      4
## 3                 0                    31                     10
## 5                 0                    57                     11
## 6                 0                    36                      7
## 7                 0                    20                      9
##   early_onset early_outcome
## 1           1             1
## 2           1             1
## 3           1             1
## 5           1             0
## 6           1             1
## 7           1             1

set.seed(42)
idx <- sample.int(2 , nrow(train_data), p = c(0.7, 0.3), replace=TRUE)
val_train_data <- train_data[idx == 1, ]
val_test_data  <- train_data[idx == 2, ]

dim(val_train_data)

## [1] 273  12

dim(val_test_data)

## [1] 122  12

#val_train_data[, -1]


library(rpart)
fit <- rpart(outcome ~., data = val_train_data[,-1])
plot(fit, margin = 0.1)
text(fit)

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 3.4.4

rpart.plot(fit)

predicted <- predict(fit, val_test_data, type = 'class')

table(val_test_data$outcome, predicted)

##          predicted
##           Death Recover
##   Death      45       6
##   Recover    10      61

sum(val_test_data$outcome == predicted) / length(predicted)

## [1] 0.8688525

評估特徵重要性

library(caret)

## Warning: package 'caret' was built under R version 3.4.4

control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

set.seed(42)
model1 <- train(outcome ~ ., data = val_train_data[,-1], method = "rpart",preProcess = NULL, trControl = control)

importance1 <- varImp(model1, scale=TRUE)
plot(importance1)

Logistic Regression

fit <- glm(outcome ~ ., data = val_train_data[,-1], family = 'binomial')

summary(fit)

## 
## Call:
## glm(formula = outcome ~ ., family = "binomial", data = val_train_data[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4192  -0.8580   0.4280   0.8088   1.7569  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             4.126283   0.710054   5.811 6.20e-09 ***
## age                    -0.038345   0.008062  -4.756 1.97e-06 ***
## hospital1              -0.178303   0.329025  -0.542    0.588    
## gender_f1              -0.415759   0.343030  -1.212    0.226    
## province_Jiangsu1       0.069791   0.539090   0.129    0.897    
## province_Shanghai1     -0.034750   0.448794  -0.077    0.938    
## province_Zhejiang1     -0.053870   0.502262  -0.107    0.915    
## days_onset_to_outcome  -0.016929   0.014190  -1.193    0.233    
## days_onset_to_hospital  0.030334   0.040251   0.754    0.451    
## early_onset1           -0.035302   0.425105  -0.083    0.934    
## early_outcome1         -2.252748   0.446894  -5.041 4.63e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 367.30  on 272  degrees of freedom
## Residual deviance: 280.85  on 262  degrees of freedom
## AIC: 302.85
## 
## Number of Fisher Scoring iterations: 5

fit.step <- step(fit)

## Start:  AIC=302.85
## outcome ~ age + hospital + gender_f + province_Jiangsu + province_Shanghai + 
##     province_Zhejiang + days_onset_to_outcome + days_onset_to_hospital + 
##     early_onset + early_outcome
## 
##                          Df Deviance    AIC
## - province_Shanghai       1   280.86 300.86
## - early_onset             1   280.86 300.86
## - province_Zhejiang       1   280.86 300.86
## - province_Jiangsu        1   280.87 300.87
## - hospital                1   281.14 301.14
## - days_onset_to_hospital  1   281.41 301.41
## - days_onset_to_outcome   1   282.23 302.23
## - gender_f                1   282.32 302.32
## <none>                        280.85 302.85
## - age                     1   308.54 328.54
## - early_outcome           1   309.54 329.54
## 
## Step:  AIC=300.86
## outcome ~ age + hospital + gender_f + province_Jiangsu + province_Zhejiang + 
##     days_onset_to_outcome + days_onset_to_hospital + early_onset + 
##     early_outcome
## 
##                          Df Deviance    AIC
## - province_Zhejiang       1   280.86 298.86
## - early_onset             1   280.87 298.87
## - province_Jiangsu        1   280.89 298.89
## - hospital                1   281.15 299.15
## - days_onset_to_hospital  1   281.42 299.42
## - days_onset_to_outcome   1   282.23 300.23
## - gender_f                1   282.32 300.32
## <none>                        280.86 300.86
## - early_outcome           1   309.54 327.54
## - age                     1   309.55 327.55
## 
## Step:  AIC=298.86
## outcome ~ age + hospital + gender_f + province_Jiangsu + days_onset_to_outcome + 
##     days_onset_to_hospital + early_onset + early_outcome
## 
##                          Df Deviance    AIC
## - early_onset             1   280.88 296.88
## - province_Jiangsu        1   280.91 296.91
## - hospital                1   281.15 297.15
## - days_onset_to_hospital  1   281.49 297.49
## - days_onset_to_outcome   1   282.23 298.23
## - gender_f                1   282.32 298.32
## <none>                        280.86 298.86
## - age                     1   309.55 325.55
## - early_outcome           1   309.57 325.57
## 
## Step:  AIC=296.88
## outcome ~ age + hospital + gender_f + province_Jiangsu + days_onset_to_outcome + 
##     days_onset_to_hospital + early_outcome
## 
##                          Df Deviance    AIC
## - province_Jiangsu        1   280.93 294.93
## - hospital                1   281.18 295.18
## - days_onset_to_hospital  1   281.56 295.56
## - gender_f                1   282.35 296.35
## - days_onset_to_outcome   1   282.76 296.76
## <none>                        280.88 296.88
## - age                     1   309.92 323.92
## - early_outcome           1   327.25 341.25
## 
## Step:  AIC=294.93
## outcome ~ age + hospital + gender_f + days_onset_to_outcome + 
##     days_onset_to_hospital + early_outcome
## 
##                          Df Deviance    AIC
## - hospital                1   281.29 293.29
## - days_onset_to_hospital  1   281.62 293.62
## - gender_f                1   282.37 294.37
## - days_onset_to_outcome   1   282.78 294.78
## <none>                        280.93 294.93
## - age                     1   311.92 323.92
## - early_outcome           1   327.45 339.45
## 
## Step:  AIC=293.29
## outcome ~ age + gender_f + days_onset_to_outcome + days_onset_to_hospital + 
##     early_outcome
## 
##                          Df Deviance    AIC
## - days_onset_to_hospital  1   282.04 292.04
## - gender_f                1   282.79 292.79
## <none>                        281.29 293.29
## - days_onset_to_outcome   1   283.83 293.83
## - age                     1   312.10 322.10
## - early_outcome           1   330.23 340.23
## 
## Step:  AIC=292.04
## outcome ~ age + gender_f + days_onset_to_outcome + early_outcome
## 
##                         Df Deviance    AIC
## - gender_f               1   283.21 291.21
## <none>                       282.04 292.04
## - days_onset_to_outcome  1   284.07 292.07
## - age                    1   312.25 320.25
## - early_outcome          1   331.66 339.66
## 
## Step:  AIC=291.21
## outcome ~ age + days_onset_to_outcome + early_outcome
## 
##                         Df Deviance    AIC
## <none>                       283.21 291.21
## - days_onset_to_outcome  1   285.87 291.87
## - age                    1   312.48 318.48
## - early_outcome          1   333.25 339.25

#val_test_data$outcome
predict(fit.step, val_test_data) >0

##     1     2     5     8    12    14    17    20    21    26    29    30 
## FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE 
##    36    43    44    45    51    57    64    72    73    75    81    83 
## FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE 
##    98   106   111   119   131   143   155   156   157   161   170   171 
## FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE 
##   172   173   182   185   195   208   210   211   213   214   218   219 
## FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE 
##   225   230   247   252   253   257   261   264   269   275   289   296 
##  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE 
##   298   299   301   302   307   308   309   315   321   331   332   333 
## FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE 
##   337   347   361   370   388   391   405   406   409   425   432   438 
##  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE 
##   443   451   453   454   458   463   481   489   490   502   527   529 
##  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE 
##   537   543   547   549   556   558   561   562   563   573   579   588 
##  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE 
##   590   593   601   604   618   622   631   635   638   651   667   669 
## FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE 
##   673   679 
##  TRUE  TRUE

predicted <- ifelse(predict(fit.step, val_test_data) >0, 1, 0 )
predicted <- factor(predicted, levels = c(0,1), labels = c('Recover', 'Death'))
table(val_test_data$outcome, predicted)

##          predicted
##           Recover Death
##   Death        31    20
##   Recover      15    56

RandomForest

library(rpart)
fit1 <- rpart(outcome ~., data = val_train_data[,-1])
predicted1 <- predict(fit1, val_test_data, type = 'class')
table(val_test_data$outcome, predicted1)

##          predicted1
##           Death Recover
##   Death      45       6
##   Recover    10      61

library(randomForest)

## Warning: package 'randomForest' was built under R version 3.4.4

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

fit2 <- randomForest(outcome ~., data = val_train_data[,-1], ntree = 100)
predicted2 <- predict(fit2, val_test_data, type = 'class')
table(val_test_data$outcome, predicted2)

##          predicted2
##           Death Recover
##   Death      49       2
##   Recover     4      67

library(e1071)

## Warning: package 'e1071' was built under R version 3.4.4

fit3 <- svm(outcome ~., data = val_train_data[,-1])
predicted3 <- predict(fit3, val_test_data, type = 'class')
table(val_test_data$outcome, predicted3)

##          predicted3
##           Death Recover
##   Death      36      15
##   Recover     7      64

#predict(fit1, val_test_data)

control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(42)
model2 <- train(outcome ~ ., data = val_train_data[,-1], method = "rf", preProcess = NULL, trControl = control)
importance2 <- varImp(model2, scale=TRUE)
plot(importance2)

使用ROC Curve 比較模型

library(rpart)
fit1 <- rpart(outcome ~., data = val_train_data[,-1])

library(ROCR)

## Warning: package 'ROCR' was built under R version 3.4.4

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.4.4

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

predictions1 <- predict(fit1, val_test_data, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(val_test_data$outcome))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")



library(randomForest)
fit2 <- randomForest(outcome ~., data = val_train_data[,-1], ntree = 100 )

predictions2 <- predict(fit2, val_test_data, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(val_test_data$outcome))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")


library(e1071)
fit3 <- svm(outcome ~., data = val_train_data[,-1],probability = TRUE)
?svm

## starting httpd help server ...

##  done

predictions3 <- predict(fit3, val_test_data, probability=TRUE)
pred.to.roc3 <- attr(predictions3, "probabilities")[, 2]
pred.rocr3 <- prediction(pred.to.roc3, as.factor(val_test_data$outcome))
perf.rocr3 <- performance(pred.rocr3, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr3 <- performance(pred.rocr3, "tpr","fpr")

plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart','randomforest', 'svm'), 1:3)
plot(perf.tpr.rocr2, col=2, add=TRUE)
plot(perf.tpr.rocr3, col=3, add=TRUE)

Demo20180718

David Chiu

2018年7月18日

資料視覺化

long to wide or wide to long

抽取特徵

建立模型

評估特徵重要性

Logistic Regression

RandomForest

使用ROC Curve 比較模型