Homework 4

library(readr)
## Warning: package 'readr' was built under R version 3.4.4
measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")
## Parsed with column specification:
## cols(
##   確定病名 = col_character(),
##   發病年份 = col_integer(),
##   發病月份 = col_integer(),
##   縣市 = col_character(),
##   鄉鎮 = col_character(),
##   性別 = col_character(),
##   是否為境外移入 = col_character(),
##   年齡層 = col_character(),
##   確定病例數 = col_integer()
## )
View(measles)
head(measles)
## # A tibble: 6 x 9
##   確定病名 發病年份 發病月份 縣市   鄉鎮   性別  是否為境外移入 年齡層
##   <chr>       <int>    <int> <chr>  <chr>  <chr> <chr>          <chr> 
## 1 麻疹         2009        4 高雄市 小港區 M     否             20-24 
## 2 麻疹         2009        5 基隆市 中山區 M     否             30-34 
## 3 麻疹         2011        6 新北市 蘆洲區 F     是             25-29 
## 4 麻疹         2014        1 高雄市 三民區 F     是             0     
## 5 麻疹         2017        3 台北市 松山區 F     是             0     
## 6 麻疹         2018        4 桃園市 蘆竹區 F     否             30-34 
## # ... with 1 more variable: 確定病例數 <int>
# method 1: tapply
s1 <- measles[(measles$發病年份 >= 2010) & (measles$是否為境外移入 == '否'), ]
stat <- tapply(s1$確定病例數,s1$性別, sum)

# method 2 : dplyr
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
stat <- measles %>%
  filter(發病年份 >= 2010, 是否為境外移入 == '否') %>%
  group_by(性別)  %>%
  summarise(s = sum(確定病例數))
## Warning: package 'bindrcpp' was built under R version 3.4.4
## Answer 1
stat
## # A tibble: 2 x 2
##   性別      s
##   <chr> <int>
## 1 F        41
## 2 M        52
## Answer 2
barplot(stat$s, names = c('F', 'M'))

## Answer 3
pie(stat$s, labels = stat$性別,init.angle = 90, clockwise = FALSE)

## Answer 4
measles %>%
  group_by(發病年份, 發病月份, 年齡層) %>%
  summarise(a = sum(確定病例數))
## # A tibble: 164 x 4
## # Groups:   發病年份, 發病月份 [?]
##    發病年份 發病月份 年齡層     a
##       <int>    <int> <chr>  <int>
##  1     2003        5 1          1
##  2     2003        9 35-39      1
##  3     2003       10 25-29      1
##  4     2003       11 25-29      1
##  5     2005        2 1          1
##  6     2005        3 10-14      1
##  7     2005        3 2          1
##  8     2005        8 0          1
##  9     2005       11 15-19      1
## 10     2005       11 30-34      1
## # ... with 154 more rows
# Answer 5
measles$dt <- as.Date(with(measles, paste0(發病年份,'-', 發病月份, '-01')))
head(measles)
## # A tibble: 6 x 10
##   確定病名 發病年份 發病月份 縣市   鄉鎮   性別  是否為境外移入 年齡層
##   <chr>       <int>    <int> <chr>  <chr>  <chr> <chr>          <chr> 
## 1 麻疹         2009        4 高雄市 小港區 M     否             20-24 
## 2 麻疹         2009        5 基隆市 中山區 M     否             30-34 
## 3 麻疹         2011        6 新北市 蘆洲區 F     是             25-29 
## 4 麻疹         2014        1 高雄市 三民區 F     是             0     
## 5 麻疹         2017        3 台北市 松山區 F     是             0     
## 6 麻疹         2018        4 桃園市 蘆竹區 F     否             30-34 
## # ... with 2 more variables: 確定病例數 <int>, dt <date>
## Arrange factor levles by number
a  <- c('1','10','2','3','20')
fa <- factor(a)
fa
## [1] 1  10 2  3  20
## Levels: 1 10 2 20 3
## Arrange factor levles by number
pos <- order(as.integer(levels(fa)))
levels(fa)[pos]
## [1] "1"  "2"  "3"  "10" "20"
measles$年齡層 <- factor(measles$年齡層)
tmp <- levels(measles$年齡層)
tmp
##  [1] "0"     "1"     "10-14" "15-19" "2"     "20-24" "25-29" "3"    
##  [9] "30-34" "35-39" "4"     "40-44" "45-49" "5-9"   "55-59"
strsplit(tmp, '-')
## [[1]]
## [1] "0"
## 
## [[2]]
## [1] "1"
## 
## [[3]]
## [1] "10" "14"
## 
## [[4]]
## [1] "15" "19"
## 
## [[5]]
## [1] "2"
## 
## [[6]]
## [1] "20" "24"
## 
## [[7]]
## [1] "25" "29"
## 
## [[8]]
## [1] "3"
## 
## [[9]]
## [1] "30" "34"
## 
## [[10]]
## [1] "35" "39"
## 
## [[11]]
## [1] "4"
## 
## [[12]]
## [1] "40" "44"
## 
## [[13]]
## [1] "45" "49"
## 
## [[14]]
## [1] "5" "9"
## 
## [[15]]
## [1] "55" "59"
rank <- sapply(strsplit(tmp, '-'), function(e) as.integer(e[1]) )
pos  <- order(rank)
levels(measles$年齡層)[pos]
##  [1] "0"     "1"     "2"     "3"     "4"     "5-9"   "10-14" "15-19"
##  [9] "20-24" "25-29" "30-34" "35-39" "40-44" "45-49" "55-59"
for (y in levels(measles$年齡層)[pos]){
  s <-measles %>% 
    filter(年齡層 == y) %>%
    group_by(dt) %>%
    summarise(disease_sum = sum(確定病例數))
  #print(s)
  #plot(s$dt, s$disease_sum,xlab = '年' , ylab = '確定病例數', type= 'line', main = paste('年齡層:',y), xlim = c(as.Date('2003-01-01'),as.Date('2019-01-01')))
}


## Answer 6
par(mfrow=c(3,5))
for (y in levels(measles$年齡層)[pos]){
  s <-measles %>% 
    filter(年齡層 == y) %>%
    group_by(dt) %>%
    summarise(disease_sum = sum(確定病例數))
  #print(s)
  plot(s$dt, s$disease_sum,xlab = '年' , ylab = '確定病例數', type= 'line', main = paste('年齡層:',y), xlim = c(as.Date('2003-01-01'),as.Date('2019-01-01')))
}
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

s <-measles %>% 
    filter(年齡層 == '3') %>%
    group_by(dt) %>%
    summarise(disease_sum = sum(確定病例數))

init_df  <- data.frame(dt = seq(from = as.Date('2003-01-01'), to = as.Date('2019-01-01'), by="1 month"), disease_sum = 0)

m <- merge(x = init_df, y = s, by.x = 'dt', by.y = 'dt', all=TRUE)

m$disease_sum.y[is.na(m$disease_sum.y)] <- 0
m$disease_sum.y
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [71] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [106] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [141] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [176] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
par(mfrow=c(1,1))
plot(m$dt, m$disease_sum.y,xlab = '年' , ylab = '確定病例數', type= 'o', main = paste('年齡層:',y), xlim = c(as.Date('2003-01-01'),as.Date('2019-01-01')))

## Answer 6 - add 0 to non exisiting dataset
init_df  <- data.frame(dt = seq(from = as.Date('2003-01-01'), to = as.Date('2019-01-01'), by="1 month"), disease_sum = 0)


par(mfrow=c(3,5))
for (y in levels(measles$年齡層)[pos]){
  s <-measles %>% 
    filter(年齡層 == y) %>%
    group_by(dt) %>%
    summarise(disease_sum = sum(確定病例數))
  m <- merge(init_df, s, by.x = 'dt', by.y = 'dt', all=TRUE)
  m$disease_sum.y[is.na(m$disease_sum.y)] <- 0
  #print(s)
  plot(m$dt, m$disease_sum.y,xlab = '年' , ylab = '確定病例數', type= 'o', main = paste('年齡層:',y), xlim = c(as.Date('2003-01-01'),as.Date('2019-01-01')), ylim = c(0,10) )
}

GGPlot2

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
help(package='ggplot2')

Date and Time

x <- as.Date('2018-05-08')
class(x)
## [1] "Date"
# date difference from 1970-01-01 til now
unclass(x)
## [1] 17659
y <-as.Date("1970-01-01")
unclass(y)
## [1] 0
X <- Sys.time()
X
## [1] "2018-05-08 16:56:30 CST"
class(X)
## [1] "POSIXct" "POSIXt"
unclass(X)
## [1] 1525769790
p <-as.POSIXlt(x)
unclass(p)
## $sec
## [1] 0
## 
## $min
## [1] 0
## 
## $hour
## [1] 0
## 
## $mday
## [1] 8
## 
## $mon
## [1] 4
## 
## $year
## [1] 118
## 
## $wday
## [1] 2
## 
## $yday
## [1] 127
## 
## $isdst
## [1] 0
## 
## attr(,"tzone")
## [1] "UTC"
p$sec
## [1] 0
p1 <-as.POSIXct(x)
unclass(p1)
## [1] 1525737600
ds <-c("5 8, 2018 12:00")
?strptime()
## starting httpd help server ... done
ds <-c("5 8, 2018 12:00")
x <- strptime(ds, "%m %d, %Y %H:%M")

x1 <- as.POSIXlt(as.Date('2018-05-08'))
x1 - x
## Time difference of -4 hours
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.4
## 
## Attaching package: 'lubridate'
## 
## The following object is masked from 'package:base':
## 
##     date
ymd("20180605")
## [1] "2018-06-05"
ymd("2018/06/05")
## [1] "2018-06-05"
dmy("05/06/2011")
## [1] "2011-06-05"
ymd_hms('2018-05-06 08:00:00')
## [1] "2018-05-06 08:00:00 UTC"
bday<-dmy("15/10/1988")
month(bday)
## [1] 10
year(bday)
## [1] 1988
year(bday) <- 2018
bday
## [1] "2018-10-15"
wday(bday)
## [1] 2
library(readr)
measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")
## Parsed with column specification:
## cols(
##   確定病名 = col_character(),
##   發病年份 = col_integer(),
##   發病月份 = col_integer(),
##   縣市 = col_character(),
##   鄉鎮 = col_character(),
##   性別 = col_character(),
##   是否為境外移入 = col_character(),
##   年齡層 = col_character(),
##   確定病例數 = col_integer()
## )
head(measles)
## # A tibble: 6 x 9
##   確定病名 發病年份 發病月份 縣市   鄉鎮   性別  是否為境外移入 年齡層
##   <chr>       <int>    <int> <chr>  <chr>  <chr> <chr>          <chr> 
## 1 麻疹         2009        4 高雄市 小港區 M     否             20-24 
## 2 麻疹         2009        5 基隆市 中山區 M     否             30-34 
## 3 麻疹         2011        6 新北市 蘆洲區 F     是             25-29 
## 4 麻疹         2014        1 高雄市 三民區 F     是             0     
## 5 麻疹         2017        3 台北市 松山區 F     是             0     
## 6 麻疹         2018        4 桃園市 蘆竹區 F     否             30-34 
## # ... with 1 more variable: 確定病例數 <int>
#method 1
paste(measles$發病年份, measles$發病月份, '01', sep='-')
##   [1] "2009-4-01"  "2009-5-01"  "2011-6-01"  "2014-1-01"  "2017-3-01" 
##   [6] "2018-4-01"  "2018-4-01"  "2009-4-01"  "2011-2-01"  "2014-10-01"
##  [11] "2018-3-01"  "2018-4-01"  "2003-9-01"  "2011-3-01"  "2012-4-01" 
##  [16] "2014-10-01" "2011-3-01"  "2017-4-01"  "2009-3-01"  "2010-6-01" 
##  [21] "2011-4-01"  "2009-2-01"  "2009-4-01"  "2014-8-01"  "2009-4-01" 
##  [26] "2011-4-01"  "2015-5-01"  "2016-12-01" "2014-10-01" "2016-3-01" 
##  [31] "2009-2-01"  "2009-4-01"  "2017-4-01"  "2018-4-01"  "2005-8-01" 
##  [36] "2007-6-01"  "2009-3-01"  "2011-4-01"  "2016-5-01"  "2018-4-01" 
##  [41] "2005-11-01" "2007-7-01"  "2009-2-01"  "2011-2-01"  "2011-6-01" 
##  [46] "2014-10-01" "2015-5-01"  "2005-3-01"  "2009-4-01"  "2014-10-01"
##  [51] "2018-3-01"  "2018-4-01"  "2005-2-01"  "2008-4-01"  "2009-4-01" 
##  [56] "2009-5-01"  "2014-3-01"  "2015-6-01"  "2017-2-01"  "2003-10-01"
##  [61] "2010-2-01"  "2014-3-01"  "2015-9-01"  "2018-4-01"  "2005-3-01" 
##  [66] "2009-2-01"  "2009-2-01"  "2009-8-01"  "2010-3-01"  "2008-11-01"
##  [71] "2008-11-01" "2010-7-01"  "2011-4-01"  "2016-8-01"  "2018-3-01" 
##  [76] "2006-5-01"  "2009-2-01"  "2009-3-01"  "2011-3-01"  "2014-4-01" 
##  [81] "2015-5-01"  "2015-6-01"  "2008-1-01"  "2009-1-01"  "2009-2-01" 
##  [86] "2009-3-01"  "2010-6-01"  "2017-12-01" "2011-5-01"  "2009-4-01" 
##  [91] "2011-5-01"  "2014-10-01" "2015-5-01"  "2007-12-01" "2008-3-01" 
##  [96] "2010-6-01"  "2011-1-01"  "2014-4-01"  "2015-5-01"  "2018-3-01" 
## [101] "2018-4-01"  "2008-11-01" "2009-2-01"  "2009-3-01"  "2015-4-01" 
## [106] "2018-4-01"  "2008-12-01" "2012-3-01"  "2013-4-01"  "2014-4-01" 
## [111] "2015-5-01"  "2008-8-01"  "2011-4-01"  "2014-5-01"  "2014-5-01" 
## [116] "2015-3-01"  "2015-5-01"  "2010-6-01"  "2013-1-01"  "2015-5-01" 
## [121] "2015-5-01"  "2011-3-01"  "2011-4-01"  "2015-7-01"  "2008-3-01" 
## [126] "2009-3-01"  "2016-8-01"  "2018-4-01"  "2009-4-01"  "2013-8-01" 
## [131] "2016-4-01"  "2008-11-01" "2012-3-01"  "2014-1-01"  "2014-4-01" 
## [136] "2015-5-01"  "2007-12-01" "2008-8-01"  "2009-2-01"  "2009-4-01" 
## [141] "2009-4-01"  "2018-4-01"  "2005-11-01" "2008-12-01" "2008-12-01"
## [146] "2009-2-01"  "2010-4-01"  "2013-1-01"  "2013-4-01"  "2016-5-01" 
## [151] "2009-2-01"  "2009-3-01"  "2011-5-01"  "2018-4-01"  "2006-4-01" 
## [156] "2009-2-01"  "2010-3-01"  "2011-6-01"  "2011-6-01"  "2012-6-01" 
## [161] "2013-8-01"  "2014-5-01"  "2017-3-01"  "2018-3-01"  "2003-5-01" 
## [166] "2007-7-01"  "2009-2-01"  "2013-7-01"  "2014-6-01"  "2015-5-01" 
## [171] "2016-7-01"  "2005-11-01" "2007-12-01" "2007-5-01"  "2009-2-01" 
## [176] "2010-5-01"  "2015-5-01"  "2015-5-01"  "2009-2-01"  "2009-3-01" 
## [181] "2011-4-01"  "2012-5-01"  "2015-5-01"  "2018-4-01"  "2003-11-01"
## [186] "2007-6-01"  "2010-5-01"  "2014-1-01"  "2015-6-01"  "2016-4-01" 
## [191] "2016-8-01"  "2006-9-01"  "2009-2-01"  "2009-2-01"  "2011-5-01" 
## [196] "2018-3-01"  "2009-2-01"  "2011-4-01"  "2014-2-01"  "2014-6-01" 
## [201] "2014-8-01"  "2015-5-01"  "2008-4-01"  "2009-2-01"  "2011-3-01" 
## [206] "2011-6-01"  "2012-5-01"  "2012-8-01"  "2015-5-01"  "2015-5-01" 
## [211] "2016-6-01"  "2016-8-01"  "2006-2-01"  "2007-7-01"  "2009-2-01" 
## [216] "2009-5-01"  "2012-4-01"  "2014-4-01"  "2015-5-01"  "2015-7-01" 
## [221] "2008-6-01"  "2009-3-01"  "2010-4-01"  "2016-7-01"  "2016-8-01" 
## [226] "2007-8-01"  "2011-4-01"  "2012-2-01"  "2015-10-01" "2009-4-01" 
## [231] "2011-4-01"  "2013-7-01"  "2014-8-01"  "2018-3-01"  "2018-4-01"
# method 2
?with
measles$發病時間 <- as.Date(with(paste(發病年份, 發病月份, '01', sep='-'), data = measles))

head(measles)
## # A tibble: 6 x 10
##   確定病名 發病年份 發病月份 縣市   鄉鎮   性別  是否為境外移入 年齡層
##   <chr>       <int>    <int> <chr>  <chr>  <chr> <chr>          <chr> 
## 1 麻疹         2009        4 高雄市 小港區 M     否             20-24 
## 2 麻疹         2009        5 基隆市 中山區 M     否             30-34 
## 3 麻疹         2011        6 新北市 蘆洲區 F     是             25-29 
## 4 麻疹         2014        1 高雄市 三民區 F     是             0     
## 5 麻疹         2017        3 台北市 松山區 F     是             0     
## 6 麻疹         2018        4 桃園市 蘆竹區 F     否             30-34 
## # ... with 2 more variables: 確定病例數 <int>, 發病時間 <date>

GGPlot2 Example

library(ggplot2)
ggplot(measles, aes(x = 發病時間, y= 確定病例數)) + geom_point(color = 'red') + geom_point(color = 'blue')

ggplot(measles, aes(x = 發病時間, y= 確定病例數)) + geom_point(color = 'blue', size = 5, shape = 19)

str(measles$性別)
##  chr [1:235] "M" "M" "F" "F" "F" "F" "F" "M" "F" "M" "F" "M" "F" "M" ...
p1 <- ggplot(measles, aes(x = 發病時間, y= 確定病例數)) + geom_point(color = 'red')
p1 +geom_point(aes(color=factor(性別)))

p1 +
  geom_point(aes(color=factor(性別)))+
  scale_color_manual(values =c("orange","purple"))

source('https://raw.githubusercontent.com/ywchiu/cdc_course/master/script/multiplot.R')

p2 <- p1 + geom_point(aes(color = factor(性別)))
p3 <- p1 + geom_point(aes(shape = factor(性別)))
multiplot(p2, p3, col= 2)
## Loading required package: grid

## [1] 2
p1 + xlab('時間') + ylab('病例數') + ggtitle('麻疹發病趨勢')

p1 + xlab('時間') + ylab('病例數') + ggtitle('麻疹發病趨勢') + theme(plot.title=element_text(size =20, hjust=0.5))

p1 + geom_line()

p1 + geom_line(color='blue')

p1 +geom_point(aes(color=factor(性別)))+geom_line(aes(color=factor(性別)))

p1 +geom_line(aes(color=factor(性別)))+geom_point(aes(color=factor(性別)))

Geometry

load("C:/Users/nc20/Downloads/cdc.Rdata")

head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f