library(outbreaks)
## Warning: package 'outbreaks' was built under R version 3.4.4
data("fluH7N9_china_2013")
class(fluH7N9_china_2013)
## [1] "data.frame"
View(fluH7N9_china_2013)
str(fluH7N9_china_2013)
## 'data.frame': 136 obs. of 8 variables:
## $ case_id : Factor w/ 136 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ date_of_onset : Date, format: "2013-02-19" "2013-02-27" ...
## $ date_of_hospitalisation: Date, format: NA "2013-03-03" ...
## $ date_of_outcome : Date, format: "2013-03-04" "2013-03-10" ...
## $ outcome : Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
## $ gender : Factor w/ 2 levels "f","m": 2 2 1 1 1 1 2 2 2 2 ...
## $ age : Factor w/ 61 levels "?","15","2","21",..: 58 7 11 18 20 9 54 14 39 20 ...
## $ province : Factor w/ 13 levels "Anhui","Beijing",..: 11 11 1 8 8 8 8 13 13 11 ...
head(fluH7N9_china_2013)
## case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1 1 2013-02-19 <NA> 2013-03-04 Death
## 2 2 2013-02-27 2013-03-03 2013-03-10 Death
## 3 3 2013-03-09 2013-03-19 2013-04-09 Death
## 4 4 2013-03-19 2013-03-27 <NA> <NA>
## 5 5 2013-03-19 2013-03-30 2013-05-15 Recover
## 6 6 2013-03-21 2013-03-28 2013-04-26 Death
## gender age province
## 1 m 87 Shanghai
## 2 m 27 Shanghai
## 3 f 35 Anhui
## 4 f 45 Jiangsu
## 5 f 48 Jiangsu
## 6 f 32 Jiangsu
fluH7N9_china_2013$age[fluH7N9_china_2013$age == '?'] <- NA
fluH7N9_china_2013$age <- as.numeric(fluH7N9_china_2013$age)
fluH7N9_china_2013$age
## [1] 58 7 11 18 20 9 54 14 39 20 36 24 39 15 34 51 46 38 31 27 39 56 5
## [24] 36 35 49 23 51 48 53 43 46 37 46 54 40 8 28 38 46 26 25 57 42 28 49
## [47] 44 37 14 10 37 36 35 47 51 45 26 50 22 6 33 40 33 28 4 44 28 29 35
## [70] 30 44 19 41 NA NA 27 3 59 13 46 57 16 14 6 52 26 41 15 26 17 20 38
## [93] 28 11 13 17 47 48 40 30 51 53 40 26 9 12 61 55 35 25 28 41 33 22 14
## [116] 37 48 21 12 33 36 14 26 52 8 52 15 30 41 41 60 51 32 2 34 23
fluH7N9_china_2013$case_id <- paste('case',fluH7N9_china_2013$case_id, sep = '_')
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
fluH7N9_china_2013_gather <- fluH7N9_china_2013 %>%
gather(Group, Date, date_of_onset:date_of_outcome)
fluH7N9_china_2013[fluH7N9_china_2013$case_id == 'case_1',]
## case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1 case_1 2013-02-19 <NA> 2013-03-04 Death
## gender age province
## 1 m 58 Shanghai
fluH7N9_china_2013_gather[fluH7N9_china_2013$case_id == 'case_1',]
## case_id outcome gender age province Group Date
## 1 case_1 Death m 58 Shanghai date_of_onset 2013-02-19
## 137 case_1 Death m 58 Shanghai date_of_hospitalisation <NA>
## 273 case_1 Death m 58 Shanghai date_of_outcome 2013-03-04
nrow(fluH7N9_china_2013)
## [1] 136
nrow(fluH7N9_china_2013_gather)
## [1] 408
fluH7N9_china_2013_gather$Group <- factor(fluH7N9_china_2013_gather$Group, levels =
c("date_of_onset", "date_of_hospitalisation", "date_of_outcome"))
str(fluH7N9_china_2013_gather)
## 'data.frame': 408 obs. of 7 variables:
## $ case_id : chr "case_1" "case_2" "case_3" "case_4" ...
## $ outcome : Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
## $ gender : Factor w/ 2 levels "f","m": 2 2 1 1 1 1 2 2 2 2 ...
## $ age : num 58 7 11 18 20 9 54 14 39 20 ...
## $ province: Factor w/ 13 levels "Anhui","Beijing",..: 11 11 1 8 8 8 8 13 13 11 ...
## $ Group : Factor w/ 3 levels "date_of_onset",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2013-02-19" "2013-02-27" ...
library(plyr)
## Warning: package 'plyr' was built under R version 3.4.4
fluH7N9_china_2013_gather$Group <-mapvalues(fluH7N9_china_2013_gather$Group, from =
c("date_of_onset", "date_of_hospitalisation", "date_of_outcome"),
to = c("Date of onset", "Date of hospitalisation", "Date of outcome"))
levels(fluH7N9_china_2013_gather$province)
## [1] "Anhui" "Beijing" "Fujian" "Guangdong" "Hebei"
## [6] "Henan" "Hunan" "Jiangsu" "Jiangxi" "Shandong"
## [11] "Shanghai" "Taiwan" "Zhejiang"
fluH7N9_china_2013_gather$province <- mapvalues(fluH7N9_china_2013_gather$province,from =
c("Anhui", "Beijing", "Fujian", "Guangdong", "Hebei", "Henan", "Hunan", "Jiangxi", "Shandong", "Taiwan"),
to = rep("Other", 10))
levels(fluH7N9_china_2013_gather$province)
## [1] "Other" "Jiangsu" "Shanghai" "Zhejiang"
levels(fluH7N9_china_2013_gather$gender) <- c(levels(fluH7N9_china_2013_gather$gender),
"unknown")
fluH7N9_china_2013_gather$gender[is.na(fluH7N9_china_2013_gather$gender)] <- "unknown"
fluH7N9_china_2013_gather$province <- factor(fluH7N9_china_2013_gather$province, levels =
c("Jiangsu", "Shanghai", "Zhejiang", "Other"))
str(fluH7N9_china_2013_gather)
## 'data.frame': 408 obs. of 7 variables:
## $ case_id : chr "case_1" "case_2" "case_3" "case_4" ...
## $ outcome : Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
## $ gender : Factor w/ 3 levels "f","m","unknown": 2 2 1 1 1 1 2 2 2 2 ...
## $ age : num 58 7 11 18 20 9 54 14 39 20 ...
## $ province: Factor w/ 4 levels "Jiangsu","Shanghai",..: 2 2 4 1 1 1 1 3 3 2 ...
## $ Group : Factor w/ 3 levels "Date of onset",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2013-02-19" "2013-02-27" ...
資料視覺化
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
ggplot(data = fluH7N9_china_2013_gather, aes(x = Date, y = age, fill = outcome)) +
stat_density2d(aes(alpha = ..level..), geom = "polygon") +
geom_jitter(aes(color = outcome, shape = gender), size = 1.5) +
geom_rug(aes(color = outcome)) +
labs(
fill = "Outcome",
color = "Outcome",
alpha = "Level",
shape = "Gender",
x = "Date in 2013",
y = "Age",
title = "2013 Influenza A H7N9 cases in China"
)
## Warning: Removed 149 rows containing non-finite values (stat_density2d).
## Warning: Removed 149 rows containing missing values (geom_point).

ggplot(data = fluH7N9_china_2013_gather, aes(x = Date, y = age, fill = outcome)) +
stat_density2d(aes(alpha = ..level..), geom = "polygon") +
geom_jitter(aes(color = outcome, shape = gender), size = 1.5) +
geom_rug(aes(color = outcome)) +
labs(
fill = "Outcome",
color = "Outcome",
alpha = "Level",
shape = "Gender",
x = "Date in 2013",
y = "Age",
title = "2013 Influenza A H7N9 cases in China"
) +
facet_grid(Group ~ province) +
scale_shape_manual(values = c(15, 16, 17)) +
scale_color_brewer(palette="Set1", na.value = "grey50") +
scale_fill_brewer(palette="Set1")
## Warning: Removed 149 rows containing non-finite values (stat_density2d).
## Warning: Removed 149 rows containing missing values (geom_point).

table(fluH7N9_china_2013$outcome, fluH7N9_china_2013$gender)
##
## f m
## Death 9 22
## Recover 12 34
ggplot(data = fluH7N9_china_2013_gather, aes(x = Date, y = age, color = outcome)) +
geom_point(aes(color = outcome, shape = gender), size = 1.5, alpha = 0.6) +
geom_path(aes(group = case_id)) +
facet_wrap( ~ province, ncol = 2) +
scale_shape_manual(values = c(15, 16, 17)) +
scale_color_brewer(palette="Set1", na.value = "grey50") +
scale_fill_brewer(palette="Set1") +
labs(
color = "Outcome",
shape = "Gender",
x = "Date in 2013",
y = "Age",
title = "Time from onset of flu to outcome"
)
## Warning: Removed 149 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_path).

fluH7N9_china_2013_gather_2 <- fluH7N9_china_2013_gather[, -4] %>% gather(group_2, value, gender:province)
## Warning: attributes are not identical across measure variables;
## they will be dropped
fluH7N9_china_2013_gather_2$value <-
mapvalues(fluH7N9_china_2013_gather_2$value, from = c("m", "f", "unknown","Other"), to = c("Male", "Female", "Unknown gender", "Other province"))
fluH7N9_china_2013_gather_2$value <-
factor(fluH7N9_china_2013_gather_2$value, levels = c("Female", "Male","Unknown gender", "Jiangsu", "Shanghai", "Zhejiang", "Other province"))
str(fluH7N9_china_2013_gather_2)
## 'data.frame': 816 obs. of 6 variables:
## $ case_id: chr "case_1" "case_2" "case_3" "case_4" ...
## $ outcome: Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
## $ Group : Factor w/ 3 levels "Date of onset",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2013-02-19" "2013-02-27" ...
## $ group_2: chr "gender" "gender" "gender" "gender" ...
## $ value : Factor w/ 7 levels "Female","Male",..: 2 2 1 1 1 1 2 2 2 2 ...
p1 <- ggplot(data = fluH7N9_china_2013_gather_2, aes(x = value, fill = outcome, color = outcome)) +
geom_bar(position = "dodge", alpha = 0.7, size = 1) +
scale_fill_brewer(palette="Set1", na.value = "grey50") +
scale_color_brewer(palette="Set1", na.value = "grey50") +
labs(
color = "",
fill = "",
x = "",
y = "Count",
title = "2013 Influenza A H7N9 cases in China")
p1

p2 <- ggplot(data = fluH7N9_china_2013_gather, aes(x = age, fill = outcome, color = outcome)) +
geom_density(alpha = 0.3, size = 1) +
geom_rug() +
scale_color_brewer(palette="Set1", na.value = "grey50") +
scale_fill_brewer(palette="Set1", na.value = "grey50") +
labs(
color = "",
fill = "",
x = "Age",
y = "Density",
title = "Age distribution of flu cases"
)
p2
## Warning: Removed 6 rows containing non-finite values (stat_density).

long to wide or wide to long
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
data("airquality")
names(airquality) <- tolower(names(airquality))
head(airquality)
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
aqm <- melt(airquality, id=c("month", "day"), na.rm=TRUE)
head(aqm)
## month day variable value
## 1 5 1 ozone 41
## 2 5 2 ozone 36
## 3 5 3 ozone 12
## 4 5 4 ozone 18
## 6 5 6 ozone 28
## 7 5 7 ozone 23
newdata <- dcast(aqm, month ~ variable, mean, margins = c("month", "variable"))
抽取特徵
data(fluH7N9_china_2013)
dataset <- fluH7N9_china_2013
dataset $age[which(dataset$age == "?")] <- NA
dataset$age <- as.numeric(as.character(dataset$age))
dataset$case_id <- paste("case", dataset$case_id, sep = "_")
head(dataset)
## case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1 case_1 2013-02-19 <NA> 2013-03-04 Death
## 2 case_2 2013-02-27 2013-03-03 2013-03-10 Death
## 3 case_3 2013-03-09 2013-03-19 2013-04-09 Death
## 4 case_4 2013-03-19 2013-03-27 <NA> <NA>
## 5 case_5 2013-03-19 2013-03-30 2013-05-15 Recover
## 6 case_6 2013-03-21 2013-03-28 2013-04-26 Death
## gender age province
## 1 m 87 Shanghai
## 2 m 27 Shanghai
## 3 f 35 Anhui
## 4 f 45 Jiangsu
## 5 f 48 Jiangsu
## 6 f 32 Jiangsu
dataset$hospital <- as.factor(ifelse(is.na(dataset$date_of_hospitalisation), 0, 1))
dataset$gender_f <- as.factor(ifelse(dataset$gender == "f", 1, 0))
dataset$province_Jiangsu <- as.factor(ifelse(dataset$province == "Jiangsu", 1, 0))
dataset$province_Shanghai <- as.factor(ifelse(dataset$province == "Shanghai", 1, 0))
dataset$province_Zhejiang <- as.factor(ifelse(dataset$province == "Zhejiang", 1, 0))
dataset$province_other <- as.factor(ifelse(dataset$province == "Zhejiang" | dataset$province
== "Jiangsu" | dataset$province == "Shanghai", 0, 1))
head(dataset)
## case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1 case_1 2013-02-19 <NA> 2013-03-04 Death
## 2 case_2 2013-02-27 2013-03-03 2013-03-10 Death
## 3 case_3 2013-03-09 2013-03-19 2013-04-09 Death
## 4 case_4 2013-03-19 2013-03-27 <NA> <NA>
## 5 case_5 2013-03-19 2013-03-30 2013-05-15 Recover
## 6 case_6 2013-03-21 2013-03-28 2013-04-26 Death
## gender age province hospital gender_f province_Jiangsu province_Shanghai
## 1 m 87 Shanghai 0 0 0 1
## 2 m 27 Shanghai 1 0 0 1
## 3 f 35 Anhui 1 1 0 0
## 4 f 45 Jiangsu 1 1 1 0
## 5 f 48 Jiangsu 1 1 1 0
## 6 f 32 Jiangsu 1 1 1 0
## province_Zhejiang province_other
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 0
## 5 0 0
## 6 0 0
#as.Date(as.character(dataset$date_of_outcome), format = "%Y-%m-%d") - as.Date(as.character(dataset$date_of_onset), format = "%Y-%m-%d")
dataset$days_onset_to_outcome <- as.numeric(as.character(gsub(" days", "",
as.Date(as.character(dataset$date_of_outcome), format = "%Y-%m-%d") -
as.Date(as.character(dataset$date_of_onset), format = "%Y-%m-%d"))))
dataset$days_onset_to_hospital <- as.numeric(as.character(gsub(" days", "",
as.Date(as.character(dataset$date_of_hospitalisation), format = "%Y-%m-%d") -
as.Date(as.character(dataset$date_of_onset), format = "%Y-%m-%d"))))
summary(dataset$date_of_onset)
## Min. 1st Qu. Median Mean 3rd Qu.
## "2013-02-19" "2013-03-31" "2013-04-08" "2013-04-07" "2013-04-14"
## Max. NA's
## "2013-07-27" "10"
dataset$early_onset <- as.factor(ifelse(dataset$date_of_onset < summary(dataset$date_of_onset)[[3]], 1, 0))
dataset$early_outcome <- as.factor(ifelse(dataset$date_of_outcome < summary(dataset$date_of_outcome)[[3]], 1, 0))
#dataset
dataset <- dataset[,c('case_id','outcome'
, 'age', 'hospital','gender_f'
,'province_Jiangsu'
,'province_Shanghai'
,'province_Zhejiang'
,'province_other'
,'days_onset_to_outcome'
,'days_onset_to_hospital'
,'early_onset'
,'early_outcome')]
head(dataset)
## case_id outcome age hospital gender_f province_Jiangsu province_Shanghai
## 1 case_1 Death 87 0 0 0 1
## 2 case_2 Death 27 1 0 0 1
## 3 case_3 Death 35 1 1 0 0
## 4 case_4 <NA> 45 1 1 1 0
## 5 case_5 Recover 48 1 1 1 0
## 6 case_6 Death 32 1 1 1 0
## province_Zhejiang province_other days_onset_to_outcome
## 1 0 0 13
## 2 0 0 11
## 3 0 1 31
## 4 0 0 NA
## 5 0 0 57
## 6 0 0 36
## days_onset_to_hospital early_onset early_outcome
## 1 NA 1 1
## 2 4 1 1
## 3 10 1 1
## 4 8 1 <NA>
## 5 11 1 0
## 6 7 1 1
sum(is.na(dataset))
## [1] 277
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.4.4
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.4.4
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(dataset)

library(mice)
## Warning: package 'mice' was built under R version 3.4.4
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
dataset_impute <- mice(data = dataset[, -2], print = FALSE)
## Warning: Number of logged events: 151
#dataset_impute
missmap(complete(dataset_impute, "long"))

library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#dataset[,c(1,2)]
#right_join(dataset[,c(1,2)], )
datasets_complete <- right_join(dataset[, c(1, 2)],
complete(dataset_impute, "long"),
by = "case_id") %>% select(-.id)
missmap(datasets_complete)

建立模型
train_index <- which(is.na(datasets_complete$outcome))
nrow(datasets_complete)
## [1] 680
datasets_complete$.imp <- NULL
datasets_complete$province_other <- NULL
length(train_index)
## [1] 285
train_data <- datasets_complete[-train_index, ]
test_data <- datasets_complete[train_index, -2]
head(train_data)
## case_id outcome age hospital gender_f province_Jiangsu province_Shanghai
## 1 case_1 Death 87 0 0 0 1
## 2 case_2 Death 27 1 0 0 1
## 3 case_3 Death 35 1 1 0 0
## 5 case_5 Recover 48 1 1 1 0
## 6 case_6 Death 32 1 1 1 0
## 7 case_7 Death 83 1 0 1 0
## province_Zhejiang days_onset_to_outcome days_onset_to_hospital
## 1 0 13 4
## 2 0 11 4
## 3 0 31 10
## 5 0 57 11
## 6 0 36 7
## 7 0 20 9
## early_onset early_outcome
## 1 1 1
## 2 1 1
## 3 1 1
## 5 1 0
## 6 1 1
## 7 1 1
set.seed(42)
idx <- sample.int(2 , nrow(train_data), p = c(0.7, 0.3), replace=TRUE)
val_train_data <- train_data[idx == 1, ]
val_test_data <- train_data[idx == 2, ]
dim(val_train_data)
## [1] 273 12
dim(val_test_data)
## [1] 122 12
#val_train_data[, -1]
library(rpart)
fit <- rpart(outcome ~., data = val_train_data[,-1])
plot(fit, margin = 0.1)
text(fit)

library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
rpart.plot(fit)

predicted <- predict(fit, val_test_data, type = 'class')
table(val_test_data$outcome, predicted)
## predicted
## Death Recover
## Death 45 6
## Recover 10 61
sum(val_test_data$outcome == predicted) / length(predicted)
## [1] 0.8688525
評估特徵重要性
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(42)
model1 <- train(outcome ~ ., data = val_train_data[,-1], method = "rpart",preProcess = NULL, trControl = control)
importance1 <- varImp(model1, scale=TRUE)
plot(importance1)

Logistic Regression
fit <- glm(outcome ~ ., data = val_train_data[,-1], family = 'binomial')
summary(fit)
##
## Call:
## glm(formula = outcome ~ ., family = "binomial", data = val_train_data[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4192 -0.8580 0.4280 0.8088 1.7569
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.126283 0.710054 5.811 6.20e-09 ***
## age -0.038345 0.008062 -4.756 1.97e-06 ***
## hospital1 -0.178303 0.329025 -0.542 0.588
## gender_f1 -0.415759 0.343030 -1.212 0.226
## province_Jiangsu1 0.069791 0.539090 0.129 0.897
## province_Shanghai1 -0.034750 0.448794 -0.077 0.938
## province_Zhejiang1 -0.053870 0.502262 -0.107 0.915
## days_onset_to_outcome -0.016929 0.014190 -1.193 0.233
## days_onset_to_hospital 0.030334 0.040251 0.754 0.451
## early_onset1 -0.035302 0.425105 -0.083 0.934
## early_outcome1 -2.252748 0.446894 -5.041 4.63e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 367.30 on 272 degrees of freedom
## Residual deviance: 280.85 on 262 degrees of freedom
## AIC: 302.85
##
## Number of Fisher Scoring iterations: 5
fit.step <- step(fit)
## Start: AIC=302.85
## outcome ~ age + hospital + gender_f + province_Jiangsu + province_Shanghai +
## province_Zhejiang + days_onset_to_outcome + days_onset_to_hospital +
## early_onset + early_outcome
##
## Df Deviance AIC
## - province_Shanghai 1 280.86 300.86
## - early_onset 1 280.86 300.86
## - province_Zhejiang 1 280.86 300.86
## - province_Jiangsu 1 280.87 300.87
## - hospital 1 281.14 301.14
## - days_onset_to_hospital 1 281.41 301.41
## - days_onset_to_outcome 1 282.23 302.23
## - gender_f 1 282.32 302.32
## <none> 280.85 302.85
## - age 1 308.54 328.54
## - early_outcome 1 309.54 329.54
##
## Step: AIC=300.86
## outcome ~ age + hospital + gender_f + province_Jiangsu + province_Zhejiang +
## days_onset_to_outcome + days_onset_to_hospital + early_onset +
## early_outcome
##
## Df Deviance AIC
## - province_Zhejiang 1 280.86 298.86
## - early_onset 1 280.87 298.87
## - province_Jiangsu 1 280.89 298.89
## - hospital 1 281.15 299.15
## - days_onset_to_hospital 1 281.42 299.42
## - days_onset_to_outcome 1 282.23 300.23
## - gender_f 1 282.32 300.32
## <none> 280.86 300.86
## - early_outcome 1 309.54 327.54
## - age 1 309.55 327.55
##
## Step: AIC=298.86
## outcome ~ age + hospital + gender_f + province_Jiangsu + days_onset_to_outcome +
## days_onset_to_hospital + early_onset + early_outcome
##
## Df Deviance AIC
## - early_onset 1 280.88 296.88
## - province_Jiangsu 1 280.91 296.91
## - hospital 1 281.15 297.15
## - days_onset_to_hospital 1 281.49 297.49
## - days_onset_to_outcome 1 282.23 298.23
## - gender_f 1 282.32 298.32
## <none> 280.86 298.86
## - age 1 309.55 325.55
## - early_outcome 1 309.57 325.57
##
## Step: AIC=296.88
## outcome ~ age + hospital + gender_f + province_Jiangsu + days_onset_to_outcome +
## days_onset_to_hospital + early_outcome
##
## Df Deviance AIC
## - province_Jiangsu 1 280.93 294.93
## - hospital 1 281.18 295.18
## - days_onset_to_hospital 1 281.56 295.56
## - gender_f 1 282.35 296.35
## - days_onset_to_outcome 1 282.76 296.76
## <none> 280.88 296.88
## - age 1 309.92 323.92
## - early_outcome 1 327.25 341.25
##
## Step: AIC=294.93
## outcome ~ age + hospital + gender_f + days_onset_to_outcome +
## days_onset_to_hospital + early_outcome
##
## Df Deviance AIC
## - hospital 1 281.29 293.29
## - days_onset_to_hospital 1 281.62 293.62
## - gender_f 1 282.37 294.37
## - days_onset_to_outcome 1 282.78 294.78
## <none> 280.93 294.93
## - age 1 311.92 323.92
## - early_outcome 1 327.45 339.45
##
## Step: AIC=293.29
## outcome ~ age + gender_f + days_onset_to_outcome + days_onset_to_hospital +
## early_outcome
##
## Df Deviance AIC
## - days_onset_to_hospital 1 282.04 292.04
## - gender_f 1 282.79 292.79
## <none> 281.29 293.29
## - days_onset_to_outcome 1 283.83 293.83
## - age 1 312.10 322.10
## - early_outcome 1 330.23 340.23
##
## Step: AIC=292.04
## outcome ~ age + gender_f + days_onset_to_outcome + early_outcome
##
## Df Deviance AIC
## - gender_f 1 283.21 291.21
## <none> 282.04 292.04
## - days_onset_to_outcome 1 284.07 292.07
## - age 1 312.25 320.25
## - early_outcome 1 331.66 339.66
##
## Step: AIC=291.21
## outcome ~ age + days_onset_to_outcome + early_outcome
##
## Df Deviance AIC
## <none> 283.21 291.21
## - days_onset_to_outcome 1 285.87 291.87
## - age 1 312.48 318.48
## - early_outcome 1 333.25 339.25
#val_test_data$outcome
predict(fit.step, val_test_data) >0
## 1 2 5 8 12 14 17 20 21 26 29 30
## FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE
## 36 43 44 45 51 57 64 72 73 75 81 83
## FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE
## 98 106 111 119 131 143 155 156 157 161 170 171
## FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE
## 172 173 182 185 195 208 210 211 213 214 218 219
## FALSE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## 225 230 247 252 253 257 261 264 269 275 289 296
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## 298 299 301 302 307 308 309 315 321 331 332 333
## FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE
## 337 347 361 370 388 391 405 406 409 425 432 438
## TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
## 443 451 453 454 458 463 481 489 490 502 527 529
## TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## 537 543 547 549 556 558 561 562 563 573 579 588
## TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## 590 593 601 604 618 622 631 635 638 651 667 669
## FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## 673 679
## TRUE TRUE
predicted <- ifelse(predict(fit.step, val_test_data) >0, 1, 0 )
predicted <- factor(predicted, levels = c(0,1), labels = c('Recover', 'Death'))
table(val_test_data$outcome, predicted)
## predicted
## Recover Death
## Death 31 20
## Recover 15 56
RandomForest
library(rpart)
fit1 <- rpart(outcome ~., data = val_train_data[,-1])
predicted1 <- predict(fit1, val_test_data, type = 'class')
table(val_test_data$outcome, predicted1)
## predicted1
## Death Recover
## Death 45 6
## Recover 10 61
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
fit2 <- randomForest(outcome ~., data = val_train_data[,-1], ntree = 100)
predicted2 <- predict(fit2, val_test_data, type = 'class')
table(val_test_data$outcome, predicted2)
## predicted2
## Death Recover
## Death 49 2
## Recover 4 67
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
fit3 <- svm(outcome ~., data = val_train_data[,-1])
predicted3 <- predict(fit3, val_test_data, type = 'class')
table(val_test_data$outcome, predicted3)
## predicted3
## Death Recover
## Death 36 15
## Recover 7 64
#predict(fit1, val_test_data)
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(42)
model2 <- train(outcome ~ ., data = val_train_data[,-1], method = "rf", preProcess = NULL, trControl = control)
importance2 <- varImp(model2, scale=TRUE)
plot(importance2)

使用ROC Curve 比較模型
library(rpart)
fit1 <- rpart(outcome ~., data = val_train_data[,-1])
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.4
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.4
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions1 <- predict(fit1, val_test_data, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(val_test_data$outcome))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
library(randomForest)
fit2 <- randomForest(outcome ~., data = val_train_data[,-1], ntree = 100 )
predictions2 <- predict(fit2, val_test_data, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(val_test_data$outcome))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
library(e1071)
fit3 <- svm(outcome ~., data = val_train_data[,-1],probability = TRUE)
?svm
## starting httpd help server ...
## done
predictions3 <- predict(fit3, val_test_data, probability=TRUE)
pred.to.roc3 <- attr(predictions3, "probabilities")[, 2]
pred.rocr3 <- prediction(pred.to.roc3, as.factor(val_test_data$outcome))
perf.rocr3 <- performance(pred.rocr3, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr3 <- performance(pred.rocr3, "tpr","fpr")
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart','randomforest', 'svm'), 1:3)
plot(perf.tpr.rocr2, col=2, add=TRUE)
plot(perf.tpr.rocr3, col=3, add=TRUE)
