library(outbreaks)
## Warning: package 'outbreaks' was built under R version 3.4.4
# Load the fluH7N9_china_2013 data set made available by library(outbreaks).
data("fluH7N9_china_2013")

# Quick look at the class and structure of the raw data.
class(fluH7N9_china_2013)
## [1] "data.frame"
# View() opens a data viewer window, so only call it when a user is
# actually sitting at an interactive session.
if (interactive()) {
  View(fluH7N9_china_2013)
}
str(fluH7N9_china_2013)
## 'data.frame':    136 obs. of  8 variables:
##  $ case_id                : Factor w/ 136 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ date_of_onset          : Date, format: "2013-02-19" "2013-02-27" ...
##  $ date_of_hospitalisation: Date, format: NA "2013-03-03" ...
##  $ date_of_outcome        : Date, format: "2013-03-04" "2013-03-10" ...
##  $ outcome                : Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
##  $ gender                 : Factor w/ 2 levels "f","m": 2 2 1 1 1 1 2 2 2 2 ...
##  $ age                    : Factor w/ 61 levels "?","15","2","21",..: 58 7 11 18 20 9 54 14 39 20 ...
##  $ province               : Factor w/ 13 levels "Anhui","Beijing",..: 11 11 1 8 8 8 8 13 13 11 ...
head(fluH7N9_china_2013)
##   case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1       1    2013-02-19                    <NA>      2013-03-04   Death
## 2       2    2013-02-27              2013-03-03      2013-03-10   Death
## 3       3    2013-03-09              2013-03-19      2013-04-09   Death
## 4       4    2013-03-19              2013-03-27            <NA>    <NA>
## 5       5    2013-03-19              2013-03-30      2013-05-15 Recover
## 6       6    2013-03-21              2013-03-28      2013-04-26   Death
##   gender age province
## 1      m  87 Shanghai
## 2      m  27 Shanghai
## 3      f  35    Anhui
## 4      f  45  Jiangsu
## 5      f  48  Jiangsu
## 6      f  32  Jiangsu
# age is stored as a factor in which "?" marks an unknown value.
# Blank out the unknowns, then convert through character: calling
# as.numeric() directly on a factor returns the internal level codes
# (e.g. 58 for the label "87"), not the ages themselves.
# NOTE: console output captured elsewhere in this file was produced before
# this fix, so it still shows level codes in the age column.
fluH7N9_china_2013$age[fluH7N9_china_2013$age == "?"] <- NA
fluH7N9_china_2013$age <- as.numeric(as.character(fluH7N9_china_2013$age))

fluH7N9_china_2013$age
##   [1] 58  7 11 18 20  9 54 14 39 20 36 24 39 15 34 51 46 38 31 27 39 56  5
##  [24] 36 35 49 23 51 48 53 43 46 37 46 54 40  8 28 38 46 26 25 57 42 28 49
##  [47] 44 37 14 10 37 36 35 47 51 45 26 50 22  6 33 40 33 28  4 44 28 29 35
##  [70] 30 44 19 41 NA NA 27  3 59 13 46 57 16 14  6 52 26 41 15 26 17 20 38
##  [93] 28 11 13 17 47 48 40 30 51 53 40 26  9 12 61 55 35 25 28 41 33 22 14
## [116] 37 48 21 12 33 36 14 26 52  8 52 15 30 41 41 60 51 32  2 34 23
# Turn the bare case numbers into labels of the form "case_<n>".
fluH7N9_china_2013[["case_id"]] <- paste(
  "case",
  fluH7N9_china_2013[["case_id"]],
  sep = "_"
)


library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
# Stack the date columns (date_of_onset through date_of_outcome) into long
# format: Group holds the source column name, Date holds its value.
fluH7N9_china_2013_gather <- gather(
  fluH7N9_china_2013,
  key = Group,
  value = Date,
  date_of_onset:date_of_outcome
)

# Show the same case in the wide table and in the long (gathered) table.
fluH7N9_china_2013[fluH7N9_china_2013$case_id == "case_1", ]
##   case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1  case_1    2013-02-19                    <NA>      2013-03-04   Death
##   gender age province
## 1      m  58 Shanghai
# Filter the long table by its own case_id column. The wide table's column
# has only 136 entries, so using it as the mask here only worked because R
# recycled the logical index across the 408 long rows.
fluH7N9_china_2013_gather[fluH7N9_china_2013_gather$case_id == "case_1", ]
##     case_id outcome gender age province                   Group       Date
## 1    case_1   Death      m  58 Shanghai           date_of_onset 2013-02-19
## 137  case_1   Death      m  58 Shanghai date_of_hospitalisation       <NA>
## 273  case_1   Death      m  58 Shanghai         date_of_outcome 2013-03-04
# Row counts before and after gathering the three date columns.
nrow(fluH7N9_china_2013)
## [1] 136
nrow(fluH7N9_china_2013_gather)
## [1] 408
# Convert Group to a factor with an explicit level order
# (onset, hospitalisation, outcome).
fluH7N9_china_2013_gather[["Group"]] <- factor(
  fluH7N9_china_2013_gather[["Group"]],
  levels = c(
    "date_of_onset",
    "date_of_hospitalisation",
    "date_of_outcome"
  )
)

str(fluH7N9_china_2013_gather)
## 'data.frame':    408 obs. of  7 variables:
##  $ case_id : chr  "case_1" "case_2" "case_3" "case_4" ...
##  $ outcome : Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
##  $ gender  : Factor w/ 2 levels "f","m": 2 2 1 1 1 1 2 2 2 2 ...
##  $ age     : num  58 7 11 18 20 9 54 14 39 20 ...
##  $ province: Factor w/ 13 levels "Anhui","Beijing",..: 11 11 1 8 8 8 8 13 13 11 ...
##  $ Group   : Factor w/ 3 levels "date_of_onset",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date    : Date, format: "2013-02-19" "2013-02-27" ...
library(plyr)
## Warning: package 'plyr' was built under R version 3.4.4
# Swap the raw column names stored in Group for human-readable labels.
fluH7N9_china_2013_gather[["Group"]] <- mapvalues(
  fluH7N9_china_2013_gather[["Group"]],
  from = c(
    "date_of_onset",
    "date_of_hospitalisation",
    "date_of_outcome"
  ),
  to = c(
    "Date of onset",
    "Date of hospitalisation",
    "Date of outcome"
  )
)


# Province levels before lumping.
levels(fluH7N9_china_2013_gather[["province"]])
##  [1] "Anhui"     "Beijing"   "Fujian"    "Guangdong" "Hebei"    
##  [6] "Henan"     "Hunan"     "Jiangsu"   "Jiangxi"   "Shandong" 
## [11] "Shanghai"  "Taiwan"    "Zhejiang"
# Collapse every province except Jiangsu, Shanghai and Zhejiang into a
# single "Other" level.
fluH7N9_china_2013_gather[["province"]] <- mapvalues(
  fluH7N9_china_2013_gather[["province"]],
  from = c(
    "Anhui", "Beijing", "Fujian", "Guangdong", "Hebei",
    "Henan", "Hunan", "Jiangxi", "Shandong", "Taiwan"
  ),
  to = rep("Other", 10)
)

# Province levels after lumping.
levels(fluH7N9_china_2013_gather[["province"]])
## [1] "Other"    "Jiangsu"  "Shanghai" "Zhejiang"
# Add an "unknown" level to gender so that missing values can be labelled
# explicitly instead of staying NA.
fluH7N9_china_2013_gather[["gender"]] <- factor(
  fluH7N9_china_2013_gather[["gender"]],
  levels = c(levels(fluH7N9_china_2013_gather[["gender"]]), "unknown")
)

missing_gender <- is.na(fluH7N9_china_2013_gather[["gender"]])
fluH7N9_china_2013_gather[["gender"]][missing_gender] <- "unknown"
rm(missing_gender)

# Reorder the province levels so that "Other" comes last.
fluH7N9_china_2013_gather[["province"]] <- factor(
  fluH7N9_china_2013_gather[["province"]],
  levels = c("Jiangsu", "Shanghai", "Zhejiang", "Other")
)

str(fluH7N9_china_2013_gather)
## 'data.frame':    408 obs. of  7 variables:
##  $ case_id : chr  "case_1" "case_2" "case_3" "case_4" ...
##  $ outcome : Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
##  $ gender  : Factor w/ 3 levels "f","m","unknown": 2 2 1 1 1 1 2 2 2 2 ...
##  $ age     : num  58 7 11 18 20 9 54 14 39 20 ...
##  $ province: Factor w/ 4 levels "Jiangsu","Shanghai",..: 2 2 4 1 1 1 1 3 3 2 ...
##  $ Group   : Factor w/ 3 levels "Date of onset",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date    : Date, format: "2013-02-19" "2013-02-27" ...

# 資料視覺化 (Data visualization) ----

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
# Age against date for all gathered dates: filled 2D density polygons
# (alpha mapped to the density level), jittered points coloured by outcome
# and shaped by gender, plus rugs on the axes.
ggplot(
  fluH7N9_china_2013_gather,
  aes(x = Date, y = age, fill = outcome)
) +
  stat_density2d(geom = "polygon", aes(alpha = ..level..)) +
  geom_jitter(size = 1.5, aes(color = outcome, shape = gender)) +
  geom_rug(aes(color = outcome)) +
  labs(
    title = "2013 Influenza A H7N9 cases in China",
    x = "Date in 2013",
    y = "Age",
    fill = "Outcome",
    color = "Outcome",
    alpha = "Level",
    shape = "Gender"
  )
## Warning: Removed 149 rows containing non-finite values (stat_density2d).
## Warning: Removed 149 rows containing missing values (geom_point).

# Same layers as the previous plot, faceted by date type (rows) and
# province (columns), with manual point shapes and the Set1 palette.
ggplot(
  fluH7N9_china_2013_gather,
  aes(x = Date, y = age, fill = outcome)
) +
  stat_density2d(geom = "polygon", aes(alpha = ..level..)) +
  geom_jitter(size = 1.5, aes(color = outcome, shape = gender)) +
  geom_rug(aes(color = outcome)) +
  labs(
    title = "2013 Influenza A H7N9 cases in China",
    x = "Date in 2013",
    y = "Age",
    fill = "Outcome",
    color = "Outcome",
    alpha = "Level",
    shape = "Gender"
  ) +
  facet_grid(Group ~ province) +
  scale_shape_manual(values = c(15, 16, 17)) +
  scale_color_brewer(palette = "Set1", na.value = "grey50") +
  scale_fill_brewer(palette = "Set1")
## Warning: Removed 149 rows containing non-finite values (stat_density2d).

## Warning: Removed 149 rows containing missing values (geom_point).

# Cross-tabulate outcome against gender on the one-row-per-case table.
table(fluH7N9_china_2013[["outcome"]], fluH7N9_china_2013[["gender"]])
##          
##            f  m
##   Death    9 22
##   Recover 12 34
# One path per case connecting its dates, drawn at the case's age and
# split into one panel per province.
ggplot(
  fluH7N9_china_2013_gather,
  aes(x = Date, y = age, color = outcome)
) +
  geom_point(
    aes(color = outcome, shape = gender),
    size = 1.5,
    alpha = 0.6
  ) +
  geom_path(aes(group = case_id)) +
  facet_wrap(~ province, ncol = 2) +
  scale_shape_manual(values = c(15, 16, 17)) +
  scale_color_brewer(palette = "Set1", na.value = "grey50") +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Time from onset of flu to outcome",
    x = "Date in 2013",
    y = "Age",
    color = "Outcome",
    shape = "Gender"
  )
## Warning: Removed 149 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_path).

# Drop the age column (column 4 of the long table), then stack gender and
# province into a single key/value pair: group_2 names the source column,
# value holds its entry.
fluH7N9_china_2013_gather_2 <- gather(
  fluH7N9_china_2013_gather[, names(fluH7N9_china_2013_gather) != "age"],
  key = group_2,
  value = value,
  gender:province
)
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Replace the short codes with display labels.
fluH7N9_china_2013_gather_2[["value"]] <- mapvalues(
  fluH7N9_china_2013_gather_2[["value"]],
  from = c("m", "f", "unknown", "Other"),
  to = c("Male", "Female", "Unknown gender", "Other province")
)

# Fix the order in which the categories appear: genders first, then
# provinces.
fluH7N9_china_2013_gather_2[["value"]] <- factor(
  fluH7N9_china_2013_gather_2[["value"]],
  levels = c(
    "Female", "Male", "Unknown gender",
    "Jiangsu", "Shanghai", "Zhejiang", "Other province"
  )
)

str(fluH7N9_china_2013_gather_2)
## 'data.frame':    816 obs. of  6 variables:
##  $ case_id: chr  "case_1" "case_2" "case_3" "case_4" ...
##  $ outcome: Factor w/ 2 levels "Death","Recover": 1 1 1 NA 2 1 1 1 NA 1 ...
##  $ Group  : Factor w/ 3 levels "Date of onset",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date   : Date, format: "2013-02-19" "2013-02-27" ...
##  $ group_2: chr  "gender" "gender" "gender" "gender" ...
##  $ value  : Factor w/ 7 levels "Female","Male",..: 2 2 1 1 1 1 2 2 2 2 ...
# Bar chart of case counts per gender / province category, dodged by
# outcome.
# fluH7N9_china_2013_gather_2 still has one row per case per date type
# (Group), so counting its rows directly would count every case three
# times. Keep one row per case and category before counting.
p1 <- ggplot(
  data = unique(
    fluH7N9_china_2013_gather_2[
      , c("case_id", "outcome", "group_2", "value")
    ]
  ),
  aes(x = value, fill = outcome, color = outcome)
) +
  geom_bar(position = "dodge", alpha = 0.7, size = 1) +
  scale_fill_brewer(palette = "Set1", na.value = "grey50") +
  scale_color_brewer(palette = "Set1", na.value = "grey50") +
  labs(
    color = "",
    fill = "",
    x = "",
    y = "Count",
    title = "2013 Influenza A H7N9 cases in China"
  )

p1

# Age density per outcome, with a rug of the individual ages.
# Use the one-row-per-case table: the gathered table repeats every case
# once per date type, so each age would enter the density estimate three
# times.
# NOTE: the warning captured below this chunk ("Removed 6 rows ...") came
# from the original call on the gathered table.
p2 <- ggplot(
  data = fluH7N9_china_2013,
  aes(x = age, fill = outcome, color = outcome)
) +
  geom_density(alpha = 0.3, size = 1) +
  geom_rug() +
  scale_color_brewer(palette = "Set1", na.value = "grey50") +
  scale_fill_brewer(palette = "Set1", na.value = "grey50") +
  labs(
    color = "",
    fill = "",
    x = "Age",
    y = "Density",
    title = "Age distribution of flu cases"
  )

p2
## Warning: Removed 6 rows containing non-finite values (stat_density).

# Reshaping data: long to wide, or wide to long ----

library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Load the airquality data set and lower-case its column names.
data("airquality")
names(airquality) <- tolower(names(airquality))
head(airquality)
##   ozone solar.r wind temp month day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
# Wide -> long: keep month and day as identifiers, stack every other
# column into variable/value pairs and drop missing values.
aqm <- melt(
  airquality,
  id.vars = c("month", "day"),
  na.rm = TRUE
)
head(aqm)
##   month day variable value
## 1     5   1    ozone    41
## 2     5   2    ozone    36
## 3     5   3    ozone    12
## 4     5   4    ozone    18
## 6     5   6    ozone    28
## 7     5   7    ozone    23
# Long -> wide: mean of each variable per month, with margins for both
# month and variable.
newdata <- dcast(
  aqm,
  month ~ variable,
  fun.aggregate = mean,
  margins = c("month", "variable")
)

# 抽取特徵 (Feature extraction) ----

# Reload the data set and work on a copy named dataset.
data("fluH7N9_china_2013")
dataset <- fluH7N9_china_2013

# "?" marks an unknown age: set those entries to NA, then convert the
# factor labels (not the level codes) to numbers.
is.na(dataset$age) <- which(dataset$age == "?")
dataset$age <- as.numeric(as.character(dataset$age))

# Label the cases as "case_<n>".
dataset$case_id <- paste("case", dataset$case_id, sep = "_")

head(dataset)
##   case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1  case_1    2013-02-19                    <NA>      2013-03-04   Death
## 2  case_2    2013-02-27              2013-03-03      2013-03-10   Death
## 3  case_3    2013-03-09              2013-03-19      2013-04-09   Death
## 4  case_4    2013-03-19              2013-03-27            <NA>    <NA>
## 5  case_5    2013-03-19              2013-03-30      2013-05-15 Recover
## 6  case_6    2013-03-21              2013-03-28      2013-04-26   Death
##   gender age province
## 1      m  87 Shanghai
## 2      m  27 Shanghai
## 3      f  35    Anhui
## 4      f  45  Jiangsu
## 5      f  48  Jiangsu
## 6      f  32  Jiangsu
# hospital: 1 when a hospitalisation date is recorded, 0 when it is NA.
dataset$hospital <- with(
  dataset,
  as.factor(ifelse(is.na(date_of_hospitalisation), 0, 1))
)

# gender_f: 1 for female cases, 0 otherwise.
dataset$gender_f <- with(
  dataset,
  as.factor(ifelse(gender == "f", 1, 0))
)


# One 0/1 indicator per named province.
dataset$province_Jiangsu <- with(
  dataset,
  as.factor(ifelse(province == "Jiangsu", 1, 0))
)
dataset$province_Shanghai <- with(
  dataset,
  as.factor(ifelse(province == "Shanghai", 1, 0))
)
dataset$province_Zhejiang <- with(
  dataset,
  as.factor(ifelse(province == "Zhejiang", 1, 0))
)

# province_other: 1 for every province that is none of the three above.
dataset$province_other <- with(
  dataset,
  as.factor(ifelse(
    province == "Zhejiang" | province == "Jiangsu" | province == "Shanghai",
    0,
    1
  ))
)

head(dataset)
##   case_id date_of_onset date_of_hospitalisation date_of_outcome outcome
## 1  case_1    2013-02-19                    <NA>      2013-03-04   Death
## 2  case_2    2013-02-27              2013-03-03      2013-03-10   Death
## 3  case_3    2013-03-09              2013-03-19      2013-04-09   Death
## 4  case_4    2013-03-19              2013-03-27            <NA>    <NA>
## 5  case_5    2013-03-19              2013-03-30      2013-05-15 Recover
## 6  case_6    2013-03-21              2013-03-28      2013-04-26   Death
##   gender age province hospital gender_f province_Jiangsu province_Shanghai
## 1      m  87 Shanghai        0        0                0                 1
## 2      m  27 Shanghai        1        0                0                 1
## 3      f  35    Anhui        1        1                0                 0
## 4      f  45  Jiangsu        1        1                1                 0
## 5      f  48  Jiangsu        1        1                1                 0
## 6      f  32  Jiangsu        1        1                1                 0
##   province_Zhejiang province_other
## 1                 0              0
## 2                 0              0
## 3                 0              1
## 4                 0              0
## 5                 0              0
## 6                 0              0
# Elapsed days from onset to outcome and from onset to hospitalisation.
# The date columns are already of class Date, so difftime() can be asked
# for days directly and as.numeric() extracts the number. This replaces a
# round-trip through as.character()/gsub(" days", ...)/as.numeric().
# Rows with a missing date on either side yield NA.
dataset$days_onset_to_outcome <- as.numeric(
  difftime(dataset$date_of_outcome, dataset$date_of_onset, units = "days")
)


dataset$days_onset_to_hospital <- as.numeric(
  difftime(
    dataset$date_of_hospitalisation,
    dataset$date_of_onset,
    units = "days"
  )
)

summary(dataset$date_of_onset)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "2013-02-19" "2013-03-31" "2013-04-08" "2013-04-07" "2013-04-14" 
##         Max.         NA's 
## "2013-07-27"         "10"
# early_onset / early_outcome: 1 when the date is before the median of its
# column, 0 otherwise (NA when the date is missing).
# The median is computed with median(), not read from the third slot of
# summary(), so the result does not depend on how summary() lays out its
# output.
dataset$early_onset <- as.factor(ifelse(
  dataset$date_of_onset < median(dataset$date_of_onset, na.rm = TRUE),
  1,
  0
))

dataset$early_outcome <- as.factor(ifelse(
  dataset$date_of_outcome < median(dataset$date_of_outcome, na.rm = TRUE),
  1,
  0
))

#dataset


# Keep only the identifier, the outcome and the engineered features, in
# this order.
dataset <- dataset[c(
  "case_id",
  "outcome",
  "age",
  "hospital",
  "gender_f",
  "province_Jiangsu",
  "province_Shanghai",
  "province_Zhejiang",
  "province_other",
  "days_onset_to_outcome",
  "days_onset_to_hospital",
  "early_onset",
  "early_outcome"
)]

head(dataset)
##   case_id outcome age hospital gender_f province_Jiangsu province_Shanghai
## 1  case_1   Death  87        0        0                0                 1
## 2  case_2   Death  27        1        0                0                 1
## 3  case_3   Death  35        1        1                0                 0
## 4  case_4    <NA>  45        1        1                1                 0
## 5  case_5 Recover  48        1        1                1                 0
## 6  case_6   Death  32        1        1                1                 0
##   province_Zhejiang province_other days_onset_to_outcom