Homework 4
library(readr)
## Warning: package 'readr' was built under R version 3.4.4
measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")
## Parsed with column specification:
## cols(
## 確定病名 = col_character(),
## 發病年份 = col_integer(),
## 發病月份 = col_integer(),
## 縣市 = col_character(),
## 鄉鎮 = col_character(),
## 性別 = col_character(),
## 是否為境外移入 = col_character(),
## 年齡層 = col_character(),
## 確定病例數 = col_integer()
## )
View(measles)
head(measles)
## # A tibble: 6 x 9
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2009 4 高雄市 小港區 M 否 20-24
## 2 麻疹 2009 5 基隆市 中山區 M 否 30-34
## 3 麻疹 2011 6 新北市 蘆洲區 F 是 25-29
## 4 麻疹 2014 1 高雄市 三民區 F 是 0
## 5 麻疹 2017 3 台北市 松山區 F 是 0
## 6 麻疹 2018 4 桃園市 蘆竹區 F 否 30-34
## # ... with 1 more variable: 確定病例數 <int>
# method 1: tapply
s1 <- measles[(measles$發病年份 >= 2010) & (measles$是否為境外移入 == '否'), ]
stat <- tapply(s1$確定病例數,s1$性別, sum)
# method 2 : dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
stat <- measles %>%
filter(發病年份 >= 2010, 是否為境外移入 == '否') %>%
group_by(性別) %>%
summarise(s = sum(確定病例數))
## Warning: package 'bindrcpp' was built under R version 3.4.4
## Answer 1
stat
## # A tibble: 2 x 2
## 性別 s
## <chr> <int>
## 1 F 41
## 2 M 52
## Answer 2
barplot(stat$s, names = c('F', 'M'))

## Answer 3
pie(stat$s, labels = stat$性別,init.angle = 90, clockwise = FALSE)

## Answer 4
measles %>%
group_by(發病年份, 發病月份, 年齡層) %>%
summarise(a = sum(確定病例數))
## # A tibble: 164 x 4
## # Groups: 發病年份, 發病月份 [?]
## 發病年份 發病月份 年齡層 a
## <int> <int> <chr> <int>
## 1 2003 5 1 1
## 2 2003 9 35-39 1
## 3 2003 10 25-29 1
## 4 2003 11 25-29 1
## 5 2005 2 1 1
## 6 2005 3 10-14 1
## 7 2005 3 2 1
## 8 2005 8 0 1
## 9 2005 11 15-19 1
## 10 2005 11 30-34 1
## # ... with 154 more rows
# Answer 5
measles$dt <- as.Date(with(measles, paste0(發病年份,'-', 發病月份, '-01')))
head(measles)
## # A tibble: 6 x 10
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2009 4 高雄市 小港區 M 否 20-24
## 2 麻疹 2009 5 基隆市 中山區 M 否 30-34
## 3 麻疹 2011 6 新北市 蘆洲區 F 是 25-29
## 4 麻疹 2014 1 高雄市 三民區 F 是 0
## 5 麻疹 2017 3 台北市 松山區 F 是 0
## 6 麻疹 2018 4 桃園市 蘆竹區 F 否 30-34
## # ... with 2 more variables: 確定病例數 <int>, dt <date>
## Arrange factor levles by number
a <- c('1','10','2','3','20')
fa <- factor(a)
fa
## [1] 1 10 2 3 20
## Levels: 1 10 2 20 3
## Arrange factor levles by number
pos <- order(as.integer(levels(fa)))
levels(fa)[pos]
## [1] "1" "2" "3" "10" "20"
measles$年齡層 <- factor(measles$年齡層)
tmp <- levels(measles$年齡層)
tmp
## [1] "0" "1" "10-14" "15-19" "2" "20-24" "25-29" "3"
## [9] "30-34" "35-39" "4" "40-44" "45-49" "5-9" "55-59"
strsplit(tmp, '-')
## [[1]]
## [1] "0"
##
## [[2]]
## [1] "1"
##
## [[3]]
## [1] "10" "14"
##
## [[4]]
## [1] "15" "19"
##
## [[5]]
## [1] "2"
##
## [[6]]
## [1] "20" "24"
##
## [[7]]
## [1] "25" "29"
##
## [[8]]
## [1] "3"
##
## [[9]]
## [1] "30" "34"
##
## [[10]]
## [1] "35" "39"
##
## [[11]]
## [1] "4"
##
## [[12]]
## [1] "40" "44"
##
## [[13]]
## [1] "45" "49"
##
## [[14]]
## [1] "5" "9"
##
## [[15]]
## [1] "55" "59"
rank <- sapply(strsplit(tmp, '-'), function(e) as.integer(e[1]) )
pos <- order(rank)
levels(measles$年齡層)[pos]
## [1] "0" "1" "2" "3" "4" "5-9" "10-14" "15-19"
## [9] "20-24" "25-29" "30-34" "35-39" "40-44" "45-49" "55-59"
for (y in levels(measles$年齡層)[pos]){
s <-measles %>%
filter(年齡層 == y) %>%
group_by(dt) %>%
summarise(disease_sum = sum(確定病例數))
#print(s)
#plot(s$dt, s$disease_sum,xlab = '年' , ylab = '確定病例數', type= 'line', main = paste('年齡層:',y), xlim = c(as.Date('2003-01-01'),as.Date('2019-01-01')))
}
## Answer 6
par(mfrow=c(3,5))
for (y in levels(measles$年齡層)[pos]){
s <-measles %>%
filter(年齡層 == y) %>%
group_by(dt) %>%
summarise(disease_sum = sum(確定病例數))
#print(s)
plot(s$dt, s$disease_sum,xlab = '年' , ylab = '確定病例數', type= 'line', main = paste('年齡層:',y), xlim = c(as.Date('2003-01-01'),as.Date('2019-01-01')))
}
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

s <-measles %>%
filter(年齡層 == '3') %>%
group_by(dt) %>%
summarise(disease_sum = sum(確定病例數))
init_df <- data.frame(dt = seq(from = as.Date('2003-01-01'), to = as.Date('2019-01-01'), by="1 month"), disease_sum = 0)
m <- merge(x = init_df, y = s, by.x = 'dt', by.y = 'dt', all=TRUE)
m$disease_sum.y[is.na(m$disease_sum.y)] <- 0
m$disease_sum.y
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [71] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [106] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [141] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [176] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
par(mfrow=c(1,1))
plot(m$dt, m$disease_sum.y,xlab = '年' , ylab = '確定病例數', type= 'o', main = paste('年齡層:',y), xlim = c(as.Date('2003-01-01'),as.Date('2019-01-01')))

## Answer 6 - add 0 to non exisiting dataset
init_df <- data.frame(dt = seq(from = as.Date('2003-01-01'), to = as.Date('2019-01-01'), by="1 month"), disease_sum = 0)
par(mfrow=c(3,5))
for (y in levels(measles$年齡層)[pos]){
s <-measles %>%
filter(年齡層 == y) %>%
group_by(dt) %>%
summarise(disease_sum = sum(確定病例數))
m <- merge(init_df, s, by.x = 'dt', by.y = 'dt', all=TRUE)
m$disease_sum.y[is.na(m$disease_sum.y)] <- 0
#print(s)
plot(m$dt, m$disease_sum.y,xlab = '年' , ylab = '確定病例數', type= 'o', main = paste('年齡層:',y), xlim = c(as.Date('2003-01-01'),as.Date('2019-01-01')), ylim = c(0,10) )
}

Date and Time
x <- as.Date('2018-05-08')
class(x)
## [1] "Date"
# date difference from 1970-01-01 til now
unclass(x)
## [1] 17659
y <-as.Date("1970-01-01")
unclass(y)
## [1] 0
X <- Sys.time()
X
## [1] "2018-05-08 16:56:30 CST"
class(X)
## [1] "POSIXct" "POSIXt"
unclass(X)
## [1] 1525769790
p <-as.POSIXlt(x)
unclass(p)
## $sec
## [1] 0
##
## $min
## [1] 0
##
## $hour
## [1] 0
##
## $mday
## [1] 8
##
## $mon
## [1] 4
##
## $year
## [1] 118
##
## $wday
## [1] 2
##
## $yday
## [1] 127
##
## $isdst
## [1] 0
##
## attr(,"tzone")
## [1] "UTC"
p$sec
## [1] 0
p1 <-as.POSIXct(x)
unclass(p1)
## [1] 1525737600
ds <-c("5 8, 2018 12:00")
?strptime()
## starting httpd help server ... done
ds <-c("5 8, 2018 12:00")
x <- strptime(ds, "%m %d, %Y %H:%M")
x1 <- as.POSIXlt(as.Date('2018-05-08'))
x1 - x
## Time difference of -4 hours
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.4
##
## Attaching package: 'lubridate'
##
## The following object is masked from 'package:base':
##
## date
ymd("20180605")
## [1] "2018-06-05"
ymd("2018/06/05")
## [1] "2018-06-05"
dmy("05/06/2011")
## [1] "2011-06-05"
ymd_hms('2018-05-06 08:00:00')
## [1] "2018-05-06 08:00:00 UTC"
bday<-dmy("15/10/1988")
month(bday)
## [1] 10
year(bday)
## [1] 1988
year(bday) <- 2018
bday
## [1] "2018-10-15"
wday(bday)
## [1] 2
library(readr)
measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")
## Parsed with column specification:
## cols(
## 確定病名 = col_character(),
## 發病年份 = col_integer(),
## 發病月份 = col_integer(),
## 縣市 = col_character(),
## 鄉鎮 = col_character(),
## 性別 = col_character(),
## 是否為境外移入 = col_character(),
## 年齡層 = col_character(),
## 確定病例數 = col_integer()
## )
head(measles)
## # A tibble: 6 x 9
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2009 4 高雄市 小港區 M 否 20-24
## 2 麻疹 2009 5 基隆市 中山區 M 否 30-34
## 3 麻疹 2011 6 新北市 蘆洲區 F 是 25-29
## 4 麻疹 2014 1 高雄市 三民區 F 是 0
## 5 麻疹 2017 3 台北市 松山區 F 是 0
## 6 麻疹 2018 4 桃園市 蘆竹區 F 否 30-34
## # ... with 1 more variable: 確定病例數 <int>
#method 1
paste(measles$發病年份, measles$發病月份, '01', sep='-')
## [1] "2009-4-01" "2009-5-01" "2011-6-01" "2014-1-01" "2017-3-01"
## [6] "2018-4-01" "2018-4-01" "2009-4-01" "2011-2-01" "2014-10-01"
## [11] "2018-3-01" "2018-4-01" "2003-9-01" "2011-3-01" "2012-4-01"
## [16] "2014-10-01" "2011-3-01" "2017-4-01" "2009-3-01" "2010-6-01"
## [21] "2011-4-01" "2009-2-01" "2009-4-01" "2014-8-01" "2009-4-01"
## [26] "2011-4-01" "2015-5-01" "2016-12-01" "2014-10-01" "2016-3-01"
## [31] "2009-2-01" "2009-4-01" "2017-4-01" "2018-4-01" "2005-8-01"
## [36] "2007-6-01" "2009-3-01" "2011-4-01" "2016-5-01" "2018-4-01"
## [41] "2005-11-01" "2007-7-01" "2009-2-01" "2011-2-01" "2011-6-01"
## [46] "2014-10-01" "2015-5-01" "2005-3-01" "2009-4-01" "2014-10-01"
## [51] "2018-3-01" "2018-4-01" "2005-2-01" "2008-4-01" "2009-4-01"
## [56] "2009-5-01" "2014-3-01" "2015-6-01" "2017-2-01" "2003-10-01"
## [61] "2010-2-01" "2014-3-01" "2015-9-01" "2018-4-01" "2005-3-01"
## [66] "2009-2-01" "2009-2-01" "2009-8-01" "2010-3-01" "2008-11-01"
## [71] "2008-11-01" "2010-7-01" "2011-4-01" "2016-8-01" "2018-3-01"
## [76] "2006-5-01" "2009-2-01" "2009-3-01" "2011-3-01" "2014-4-01"
## [81] "2015-5-01" "2015-6-01" "2008-1-01" "2009-1-01" "2009-2-01"
## [86] "2009-3-01" "2010-6-01" "2017-12-01" "2011-5-01" "2009-4-01"
## [91] "2011-5-01" "2014-10-01" "2015-5-01" "2007-12-01" "2008-3-01"
## [96] "2010-6-01" "2011-1-01" "2014-4-01" "2015-5-01" "2018-3-01"
## [101] "2018-4-01" "2008-11-01" "2009-2-01" "2009-3-01" "2015-4-01"
## [106] "2018-4-01" "2008-12-01" "2012-3-01" "2013-4-01" "2014-4-01"
## [111] "2015-5-01" "2008-8-01" "2011-4-01" "2014-5-01" "2014-5-01"
## [116] "2015-3-01" "2015-5-01" "2010-6-01" "2013-1-01" "2015-5-01"
## [121] "2015-5-01" "2011-3-01" "2011-4-01" "2015-7-01" "2008-3-01"
## [126] "2009-3-01" "2016-8-01" "2018-4-01" "2009-4-01" "2013-8-01"
## [131] "2016-4-01" "2008-11-01" "2012-3-01" "2014-1-01" "2014-4-01"
## [136] "2015-5-01" "2007-12-01" "2008-8-01" "2009-2-01" "2009-4-01"
## [141] "2009-4-01" "2018-4-01" "2005-11-01" "2008-12-01" "2008-12-01"
## [146] "2009-2-01" "2010-4-01" "2013-1-01" "2013-4-01" "2016-5-01"
## [151] "2009-2-01" "2009-3-01" "2011-5-01" "2018-4-01" "2006-4-01"
## [156] "2009-2-01" "2010-3-01" "2011-6-01" "2011-6-01" "2012-6-01"
## [161] "2013-8-01" "2014-5-01" "2017-3-01" "2018-3-01" "2003-5-01"
## [166] "2007-7-01" "2009-2-01" "2013-7-01" "2014-6-01" "2015-5-01"
## [171] "2016-7-01" "2005-11-01" "2007-12-01" "2007-5-01" "2009-2-01"
## [176] "2010-5-01" "2015-5-01" "2015-5-01" "2009-2-01" "2009-3-01"
## [181] "2011-4-01" "2012-5-01" "2015-5-01" "2018-4-01" "2003-11-01"
## [186] "2007-6-01" "2010-5-01" "2014-1-01" "2015-6-01" "2016-4-01"
## [191] "2016-8-01" "2006-9-01" "2009-2-01" "2009-2-01" "2011-5-01"
## [196] "2018-3-01" "2009-2-01" "2011-4-01" "2014-2-01" "2014-6-01"
## [201] "2014-8-01" "2015-5-01" "2008-4-01" "2009-2-01" "2011-3-01"
## [206] "2011-6-01" "2012-5-01" "2012-8-01" "2015-5-01" "2015-5-01"
## [211] "2016-6-01" "2016-8-01" "2006-2-01" "2007-7-01" "2009-2-01"
## [216] "2009-5-01" "2012-4-01" "2014-4-01" "2015-5-01" "2015-7-01"
## [221] "2008-6-01" "2009-3-01" "2010-4-01" "2016-7-01" "2016-8-01"
## [226] "2007-8-01" "2011-4-01" "2012-2-01" "2015-10-01" "2009-4-01"
## [231] "2011-4-01" "2013-7-01" "2014-8-01" "2018-3-01" "2018-4-01"
# method 2
?with
measles$發病時間 <- as.Date(with(paste(發病年份, 發病月份, '01', sep='-'), data = measles))
head(measles)
## # A tibble: 6 x 10
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2009 4 高雄市 小港區 M 否 20-24
## 2 麻疹 2009 5 基隆市 中山區 M 否 30-34
## 3 麻疹 2011 6 新北市 蘆洲區 F 是 25-29
## 4 麻疹 2014 1 高雄市 三民區 F 是 0
## 5 麻疹 2017 3 台北市 松山區 F 是 0
## 6 麻疹 2018 4 桃園市 蘆竹區 F 否 30-34
## # ... with 2 more variables: 確定病例數 <int>, 發病時間 <date>
GGPlot2 Example
library(ggplot2)
ggplot(measles, aes(x = 發病時間, y= 確定病例數)) + geom_point(color = 'red') + geom_point(color = 'blue')

ggplot(measles, aes(x = 發病時間, y= 確定病例數)) + geom_point(color = 'blue', size = 5, shape = 19)

str(measles$性別)
## chr [1:235] "M" "M" "F" "F" "F" "F" "F" "M" "F" "M" "F" "M" "F" "M" ...
p1 <- ggplot(measles, aes(x = 發病時間, y= 確定病例數)) + geom_point(color = 'red')
p1 +geom_point(aes(color=factor(性別)))

p1 +
geom_point(aes(color=factor(性別)))+
scale_color_manual(values =c("orange","purple"))

source('https://raw.githubusercontent.com/ywchiu/cdc_course/master/script/multiplot.R')
p2 <- p1 + geom_point(aes(color = factor(性別)))
p3 <- p1 + geom_point(aes(shape = factor(性別)))
multiplot(p2, p3, col= 2)
## Loading required package: grid

## [1] 2
p1 + xlab('時間') + ylab('病例數') + ggtitle('麻疹發病趨勢')

p1 + xlab('時間') + ylab('病例數') + ggtitle('麻疹發病趨勢') + theme(plot.title=element_text(size =20, hjust=0.5))

p1 + geom_line()

p1 + geom_line(color='blue')

p1 +geom_point(aes(color=factor(性別)))+geom_line(aes(color=factor(性別)))

p1 +geom_line(aes(color=factor(性別)))+geom_point(aes(color=factor(性別)))
