Demo20200819

library('tidyverse')

## Warning: package 'tidyverse' was built under R version 4.0.2

## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")

## Parsed with column specification:
## cols(
##   確定病名 = col_character(),
##   發病年份 = col_double(),
##   發病月份 = col_double(),
##   縣市 = col_character(),
##   鄉鎮 = col_character(),
##   性別 = col_character(),
##   是否為境外移入 = col_character(),
##   年齡層 = col_character(),
##   確定病例數 = col_double()
## )

head(measles)

## # A tibble: 6 x 9
##   確定病名 發病年份 發病月份 縣市   鄉鎮  性別  是否為境外移入 年齡層 確定病例數
##   <chr>       <dbl>    <dbl> <chr>  <chr> <chr> <chr>          <chr>       <dbl>
## 1 麻疹         2009        4 高雄市 小港區… M     否             20-24           2
## 2 麻疹         2009        5 基隆市 中山區… M     否             30-34           1
## 3 麻疹         2011        6 新北市 蘆洲區… F     是             25-29           1
## 4 麻疹         2014        1 高雄市 三民區… F     是             0               1
## 5 麻疹         2017        3 台北市 松山區… F     是             0               1
## 6 麻疹         2018        4 桃園市 蘆竹區… F     否             30-34           1

x <- as.Date('2020-08-20')
class(x)

## [1] "Date"

unclass(x)

## [1] 18494

y <- as.Date('1970-01-01')
unclass(y)

## [1] 0

x <- Sys.time()
x

## [1] "2020-08-19 17:05:36 CST"

p <- as.POSIXlt(x)
unclass(p)

## $sec
## [1] 36.2334
## 
## $min
## [1] 5
## 
## $hour
## [1] 17
## 
## $mday
## [1] 19
## 
## $mon
## [1] 7
## 
## $year
## [1] 120
## 
## $wday
## [1] 3
## 
## $yday
## [1] 231
## 
## $isdst
## [1] 0
## 
## $zone
## [1] "CST"
## 
## $gmtoff
## [1] 28800
## 
## attr(,"tzone")
## [1] ""    "CST" "CDT"

p$sec

## [1] 36.2334

p1 <- as.POSIXct(x)
p1

## [1] "2020-08-19 17:05:36 CST"

ds <- c('May 8, 2020, 12:00')
x <- strptime(ds,'%B %d, %Y, %H:%M')
x

## [1] "2020-05-08 12:00:00 CST"

x1 <- as.POSIXlt(as.Date('2020-05-03'))
x - x1

## Time difference of 5.166667 days

head(measles)

## # A tibble: 6 x 9
##   確定病名 發病年份 發病月份 縣市   鄉鎮  性別  是否為境外移入 年齡層 確定病例數
##   <chr>       <dbl>    <dbl> <chr>  <chr> <chr> <chr>          <chr>       <dbl>
## 1 麻疹         2009        4 高雄市 小港區… M     否             20-24           2
## 2 麻疹         2009        5 基隆市 中山區… M     否             30-34           1
## 3 麻疹         2011        6 新北市 蘆洲區… F     是             25-29           1
## 4 麻疹         2014        1 高雄市 三民區… F     是             0               1
## 5 麻疹         2017        3 台北市 松山區… F     是             0               1
## 6 麻疹         2018        4 桃園市 蘆竹區… F     否             30-34           1

library(lubridate)

## Warning: package 'lubridate' was built under R version 4.0.2

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

dt <- '2020-08-19'
ymd(dt)

## [1] "2020-08-19"

ymd('2020/06/05')

## [1] "2020-06-05"

dmy('05/06/2020')

## [1] "2020-06-05"

Sys.getlocale("LC_TIME")

## [1] "en_US.UTF-8"

dt <- '2020-08-19 14:30:00'
ymd_hms(dt, tz = 'Asia/Taipei', locale = 'zh_TW')

## [1] "2020-08-19 14:30:00 CST"

bday <- dmy("15/10/1988")
month(bday)

## [1] 10

wday(bday, label = TRUE)

## [1] Sat
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat

year(bday)

## [1] 1988

year(bday) <- 2020
wday(bday, label=TRUE)

## [1] Thu
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat

paste('發病年份', '發病月份', '01',sep = '-')

## [1] "發病年份-發病月份-01"

measles$發病時間 <- as.Date(with(measles, paste(發病年份, 發病月份, '01',sep="-")), "%Y-%m-%d")

head(measles)

## # A tibble: 6 x 10
##   確定病名 發病年份 發病月份 縣市  鄉鎮  性別  是否為境外移入 年齡層 確定病例數
##   <chr>       <dbl>    <dbl> <chr> <chr> <chr> <chr>          <chr>       <dbl>
## 1 麻疹         2009        4 高雄市… 小港區… M     否             20-24           2
## 2 麻疹         2009        5 基隆市… 中山區… M     否             30-34           1
## 3 麻疹         2011        6 新北市… 蘆洲區… F     是             25-29           1
## 4 麻疹         2014        1 高雄市… 三民區… F     是             0               1
## 5 麻疹         2017        3 台北市… 松山區… F     是             0               1
## 6 麻疹         2018        4 桃園市… 蘆竹區… F     否             30-34           1
## # … with 1 more variable: 發病時間 <date>

p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數))
p1 + geom_point()

p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數))
p1 + geom_point(color = "red")

p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數))
p1 + geom_point(color = "blue",size = 5, shape = 14)

p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數))
p1 + geom_point(color='red')

p1 + geom_point(aes(color = 性別))

p1 + 
  geom_point(aes(color=factor(性別)) ) + 
  scale_color_manual(values = c("orange", "purple"))

p1 + 
  geom_point(aes(color=factor(性別)) ) + 
  scale_color_manual(values = c("orange", "purple"))

p1 + 
  geom_point(color='red' ) + 
  geom_point(color='blue' )

source('https://raw.githubusercontent.com/ywchiu/cdc_course/master/script/multiplot.R')
p2 <- p1 + geom_point(aes(color=factor(性別)) )
p3 <- p1 + geom_point(aes(shape=factor(性別)) )
multiplot(p2, p3, cols=2)

## Loading required package: grid

#source('multiplot.R')
p2 <- p1 + geom_point(aes(color=factor(性別)) )
p3 <- p1 + geom_point(aes(shape=factor(性別)) )
multiplot(p2, p3, cols=2)

p1 +  
  geom_point() +
  xlab("時間") +  
  ylab("病例數") +
  ggtitle("麻疹發病趨勢")

p1 + geom_point(aes(color=factor(性別)) ) + geom_line(aes(color=factor(性別)) )

p1 + geom_line(aes(color=factor(性別)) ) + geom_point(aes(color=factor(性別)) )

## 確診案例
confirmed_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed_file <- 'time_series_covid19_confirmed_global.csv'
download.file(confirmed_url, confirmed_file)

confirmed_df <- read_csv('time_series_covid19_confirmed_global.csv')

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )

## See spec(...) for full column specifications.

taiwan_stat <- confirmed_df %>%
  filter(`Country/Region` == 'Taiwan*')

date_col <- names(taiwan_stat)[5:length(names(taiwan_stat))]
taiwan_long_stat <- taiwan_stat %>%
  gather(Date, Case_Number , date_col)

## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(date_col)` instead of `date_col` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

taiwan_long_stat$Date <- as.Date(taiwan_long_stat$Date, format='%m/%d/%y')


head(taiwan_long_stat)

## # A tibble: 6 x 6
##   `Province/State` `Country/Region`   Lat  Long Date       Case_Number
##   <chr>            <chr>            <dbl> <dbl> <date>           <dbl>
## 1 <NA>             Taiwan*           23.7   121 2020-01-22           1
## 2 <NA>             Taiwan*           23.7   121 2020-01-23           1
## 3 <NA>             Taiwan*           23.7   121 2020-01-24           3
## 4 <NA>             Taiwan*           23.7   121 2020-01-25           3
## 5 <NA>             Taiwan*           23.7   121 2020-01-26           4
## 6 <NA>             Taiwan*           23.7   121 2020-01-27           5

p1 <- ggplot(taiwan_long_stat, aes(x = Date, y= Case_Number))
p1 + 
  geom_line(color = 'red') + 
  xlab('時間') +
  ylab('病例數') + 
  ggtitle('新冠肺炎台灣確診數趨勢圖')

p1 <- ggplot(taiwan_long_stat)  # data
p1 + 
  aes(x = Date, y= Case_Number) + # aes
  geom_line(color = 'red') +      # geometry
  xlab('時間') +
  ylab('病例數') + 
  ggtitle('新冠肺炎台灣確診數趨勢圖')

## MAC 繪圖中文處理

使用 fc-list，檢查有什麼字體

#install.packages("showtext")
library(showtext)

## Warning: package 'showtext' was built under R version 4.0.2

## Loading required package: sysfonts

## Warning: package 'sysfonts' was built under R version 4.0.2

## Loading required package: showtextdb

## Warning: package 'showtextdb' was built under R version 4.0.2

showtext_auto(enable = TRUE)
font_add("Microsoft Jhenghei", "Microsoft Jhenghei.ttf")
plot(runif(10), xlab = "橫軸", ylab = "縱軸", main = "中文標題",
  family = "Microsoft Jhenghei")
text(5, 0.7, "Jhenghei", cex = 3, family = "Microsoft Jhenghei")

curl::curl_download('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/cdc.Rdata', 'cdc.Rdata')

load('cdc.Rdata')

head(cdc)

##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f

plot(wtdesire ~ weight, data=cdc )

ggplot(cdc) +
  aes(x = weight, y = wtdesire) +
  geom_point()

hist(cdc$weight)

ggplot(cdc) +
  aes(x = weight) + 
  geom_histogram(bins = 500)

#sort(table(cdc$weight), decreasing = TRUE)


ggplot(cdc) +
  aes(x = weight) + 
  geom_density(alpha = (0.2), stat = 'density') +
  aes(color = factor(gender))

ggplot(cdc) +
  aes(x = weight,color = factor(gender), fill = factor(gender)) + 
  geom_density(alpha = (0.2), stat = 'density')

ggplot(cdc) +
  aes(x = weight, color =  factor(gender), fill =  factor(gender)) + 
  geom_histogram(bins = 50, alpha = 0.2)

boxplot(weight~gender, data = cdc)

ggplot(cdc) +
  aes(x = gender, y= weight, fill = factor(gender)) + 
  geom_boxplot()

table(cdc$genhlth)

## 
## excellent very good      good      fair      poor 
##      4657      6972      5675      2019       677

barplot(table(cdc$genhlth))

ggplot(cdc) + 
  aes(x = genhlth, fill= genhlth) +
  geom_bar()

ggplot(cdc) + 
  aes(x = genhlth) +
  geom_bar(fill='red')

pie(table(cdc$gender))

cdc_sex <- data.frame(table(cdc$gender))
cdc_sex

##   Var1  Freq
## 1    m  9569
## 2    f 10431

ggplot(cdc_sex) +
  aes(x="", y=Freq ,fill=Var1 ) + 
  geom_bar(width=1, stat = "identity") +  
  geom_text(aes(label=Freq, y =c(15000, 5000) ) , size=5) + 
  coord_polar(theta="y", start = 0)

confirmed_stat <- confirmed_df %>%
  select(country = `Country/Region`  , case_number = `8/18/20`) %>%
  arrange(desc(case_number)) %>%
  slice(0:10)

confirmed_stat

## # A tibble: 10 x 2
##    country      case_number
##    <chr>              <dbl>
##  1 US               5482416
##  2 Brazil           3407354
##  3 India            2767253
##  4 Russia            930276
##  5 South Africa      592144
##  6 Peru              541493
##  7 Mexico            531239
##  8 Colombia          489122
##  9 Chile             388855
## 10 Spain             364196

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

ggplot(confirmed_stat) +
  geom_bar(aes(x=reorder(country, -case_number),y=case_number),stat="identity", fill= 'blue') +
  xlab('國家') +
  ylab('確診數') + 
  ggtitle('確診數排名前10國家') +
  scale_y_continuous(labels = scales::comma)

?geom_bar

ggplot(cdc) + 
  aes(x = weight, y= wtdesire, color = factor(gender)) +
  geom_point() +
  geom_smooth(method = 'lm')

## `geom_smooth()` using formula 'y ~ x'

ggplot(cdc) + 
  aes(x = weight, y= height, color = factor(gender)) +
  geom_point() +
  geom_smooth(method = 'lm')

## `geom_smooth()` using formula 'y ~ x'

ggplot(cdc) + 
  aes(x = weight, y= wtdesire, color = factor(gender)) +
  geom_point() +
  geom_smooth(method = 'lm')

## `geom_smooth()` using formula 'y ~ x'

ggplot(cdc) + 
  aes(x = gender, y = weight) +
  geom_jitter() +
  geom_boxplot(aes(fill = gender))

w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) + geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')

w + 
  facet_grid(gender ~.)

## `geom_smooth()` using formula 'y ~ x'

w + 
  facet_grid(genhlth ~.)

## `geom_smooth()` using formula 'y ~ x'

w + 
  facet_grid(.~genhlth)

## `geom_smooth()` using formula 'y ~ x'

w + 
  facet_grid(gender~genhlth)

## `geom_smooth()` using formula 'y ~ x'

w + 
  facet_grid(genhlth~gender)

## `geom_smooth()` using formula 'y ~ x'

## 確診案例
confirmed_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed_file <- 'time_series_covid19_confirmed_global.csv'
download.file(confirmed_url, confirmed_file)


##死亡案例
deaths_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
deaths_file <- 'time_series_covid19_deaths_global.csv'
download.file(deaths_url, deaths_file)

## 康復案例
recovered_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
recovered_file <- 'time_series_covid19_recovered_global.csv'
download.file(recovered_url, recovered_file)


library(tidyverse)
recovered_df <- read_csv('time_series_covid19_recovered_global.csv')

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )

## See spec(...) for full column specifications.

recovered_df$Case <- 'recovered'
dim(recovered_df)

## [1] 253 215

deaths_df <- read_csv('time_series_covid19_deaths_global.csv')

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.

deaths_df$Case <- 'deaths'
dim(deaths_df)

## [1] 266 215

confirmed_df <- read_csv('time_series_covid19_confirmed_global.csv')

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.

confirmed_df$Case <- 'confirmed'
dim(confirmed_df)

## [1] 266 215

merged_df <- list(confirmed = confirmed_df, deaths = deaths_df, recovered = recovered_df)

covid19_ts_df <- do.call(rbind, merged_df)
dim(covid19_ts_df)

## [1] 785 215

col_names <- colnames(covid19_ts_df)
#col_names
date_cols <- col_names[5:(length(col_names) - 1) ]
#date_cols

covid19_tidy_df <- covid19_ts_df %>%
   gather(Date, Case_Number, date_cols)

## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(date_cols)` instead of `date_cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

covid19_tidy_df$Date <- as.Date(covid19_tidy_df$Date, format = '%m/%d/%y')

head(covid19_tidy_df)

## # A tibble: 6 x 7
##   `Province/State` `Country/Region`     Lat   Long Case   Date       Case_Number
##   <chr>            <chr>              <dbl>  <dbl> <chr>  <date>           <dbl>
## 1 <NA>             Afghanistan         33.9  67.7  confi… 2020-01-22           0
## 2 <NA>             Albania             41.2  20.2  confi… 2020-01-22           0
## 3 <NA>             Algeria             28.0   1.66 confi… 2020-01-22           0
## 4 <NA>             Andorra             42.5   1.52 confi… 2020-01-22           0
## 5 <NA>             Angola             -11.2  17.9  confi… 2020-01-22           0
## 6 <NA>             Antigua and Barbu…  17.1 -61.8  confi… 2020-01-22           0

taiwan_df <- covid19_tidy_df %>%
  filter(`Country/Region` == 'Taiwan*')

g <- ggplot(taiwan_df) +
  aes(x = Date, y = Case_Number, color= factor(`Case`)) +
  geom_line()

g + facet_grid(`Case` ~ .)

g + facet_grid(.~ `Case`)

#covid19_tidy_df$`Country/Region` %%

covid19_tidy_df

## # A tibble: 164,850 x 7
##    `Province/State`   `Country/Region`   Lat   Long Case  Date       Case_Number
##    <chr>              <chr>            <dbl>  <dbl> <chr> <date>           <dbl>
##  1 <NA>               Afghanistan       33.9  67.7  conf… 2020-01-22           0
##  2 <NA>               Albania           41.2  20.2  conf… 2020-01-22           0
##  3 <NA>               Algeria           28.0   1.66 conf… 2020-01-22           0
##  4 <NA>               Andorra           42.5   1.52 conf… 2020-01-22           0
##  5 <NA>               Angola           -11.2  17.9  conf… 2020-01-22           0
##  6 <NA>               Antigua and Bar…  17.1 -61.8  conf… 2020-01-22           0
##  7 <NA>               Argentina        -38.4 -63.6  conf… 2020-01-22           0
##  8 <NA>               Armenia           40.1  45.0  conf… 2020-01-22           0
##  9 Australian Capita… Australia        -35.5 149.   conf… 2020-01-22           0
## 10 New South Wales    Australia        -33.9 151.   conf… 2020-01-22           0
## # … with 164,840 more rows

a <-  inner_join(covid19_tidy_df, confirmed_stat,  by= c("Country/Region"= "country"))

g <- ggplot(a) +
  aes(x = Date, y = Case_Number, color= factor(`Country/Region`)) +
  geom_line()
g + facet_grid(`Country/Region`~`Case`)

Demo20200819

David Chiu

8/19/2020