library('tidyverse')
## Warning: package 'tidyverse' was built under R version 4.0.2
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")
## Parsed with column specification:
## cols(
## 確定病名 = col_character(),
## 發病年份 = col_double(),
## 發病月份 = col_double(),
## 縣市 = col_character(),
## 鄉鎮 = col_character(),
## 性別 = col_character(),
## 是否為境外移入 = col_character(),
## 年齡層 = col_character(),
## 確定病例數 = col_double()
## )
head(measles)
## # A tibble: 6 x 9
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層 確定病例數
## <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 麻疹 2009 4 高雄市 小港區… M 否 20-24 2
## 2 麻疹 2009 5 基隆市 中山區… M 否 30-34 1
## 3 麻疹 2011 6 新北市 蘆洲區… F 是 25-29 1
## 4 麻疹 2014 1 高雄市 三民區… F 是 0 1
## 5 麻疹 2017 3 台北市 松山區… F 是 0 1
## 6 麻疹 2018 4 桃園市 蘆竹區… F 否 30-34 1
x <- as.Date('2020-08-20')
class(x)
## [1] "Date"
unclass(x)
## [1] 18494
y <- as.Date('1970-01-01')
unclass(y)
## [1] 0
x <- Sys.time()
x
## [1] "2020-08-19 17:05:36 CST"
p <- as.POSIXlt(x)
unclass(p)
## $sec
## [1] 36.2334
##
## $min
## [1] 5
##
## $hour
## [1] 17
##
## $mday
## [1] 19
##
## $mon
## [1] 7
##
## $year
## [1] 120
##
## $wday
## [1] 3
##
## $yday
## [1] 231
##
## $isdst
## [1] 0
##
## $zone
## [1] "CST"
##
## $gmtoff
## [1] 28800
##
## attr(,"tzone")
## [1] "" "CST" "CDT"
p$sec
## [1] 36.2334
p1 <- as.POSIXct(x)
p1
## [1] "2020-08-19 17:05:36 CST"
ds <- c('May 8, 2020, 12:00')
x <- strptime(ds,'%B %d, %Y, %H:%M')
x
## [1] "2020-05-08 12:00:00 CST"
x1 <- as.POSIXlt(as.Date('2020-05-03'))
x - x1
## Time difference of 5.166667 days
head(measles)
## # A tibble: 6 x 9
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層 確定病例數
## <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 麻疹 2009 4 高雄市 小港區… M 否 20-24 2
## 2 麻疹 2009 5 基隆市 中山區… M 否 30-34 1
## 3 麻疹 2011 6 新北市 蘆洲區… F 是 25-29 1
## 4 麻疹 2014 1 高雄市 三民區… F 是 0 1
## 5 麻疹 2017 3 台北市 松山區… F 是 0 1
## 6 麻疹 2018 4 桃園市 蘆竹區… F 否 30-34 1
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
dt <- '2020-08-19'
ymd(dt)
## [1] "2020-08-19"
ymd('2020/06/05')
## [1] "2020-06-05"
dmy('05/06/2020')
## [1] "2020-06-05"
Sys.getlocale("LC_TIME")
## [1] "en_US.UTF-8"
dt <- '2020-08-19 14:30:00'
ymd_hms(dt, tz = 'Asia/Taipei', locale = 'zh_TW')
## [1] "2020-08-19 14:30:00 CST"
bday <- dmy("15/10/1988")
month(bday)
## [1] 10
wday(bday, label = TRUE)
## [1] Sat
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
year(bday)
## [1] 1988
year(bday) <- 2020
wday(bday, label=TRUE)
## [1] Thu
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
paste('發病年份', '發病月份', '01',sep = '-')
## [1] "發病年份-發病月份-01"
measles$發病時間 <- as.Date(with(measles, paste(發病年份, 發病月份, '01',sep="-")), "%Y-%m-%d")
head(measles)
## # A tibble: 6 x 10
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層 確定病例數
## <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 麻疹 2009 4 高雄市… 小港區… M 否 20-24 2
## 2 麻疹 2009 5 基隆市… 中山區… M 否 30-34 1
## 3 麻疹 2011 6 新北市… 蘆洲區… F 是 25-29 1
## 4 麻疹 2014 1 高雄市… 三民區… F 是 0 1
## 5 麻疹 2017 3 台北市… 松山區… F 是 0 1
## 6 麻疹 2018 4 桃園市… 蘆竹區… F 否 30-34 1
## # … with 1 more variable: 發病時間 <date>
p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數))
p1 + geom_point()
p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數))
p1 + geom_point(color = "red")
p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數))
p1 + geom_point(color = "blue",size = 5, shape = 14)
p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數))
p1 + geom_point(color='red')
p1 + geom_point(aes(color = 性別))
p1 +
geom_point(aes(color=factor(性別)) ) +
scale_color_manual(values = c("orange", "purple"))
p1 +
geom_point(aes(color=factor(性別)) ) +
scale_color_manual(values = c("orange", "purple"))
p1 +
geom_point(color='red' ) +
geom_point(color='blue' )
source('https://raw.githubusercontent.com/ywchiu/cdc_course/master/script/multiplot.R')
p2 <- p1 + geom_point(aes(color=factor(性別)) )
p3 <- p1 + geom_point(aes(shape=factor(性別)) )
multiplot(p2, p3, cols=2)
## Loading required package: grid
#source('multiplot.R')
p2 <- p1 + geom_point(aes(color=factor(性別)) )
p3 <- p1 + geom_point(aes(shape=factor(性別)) )
multiplot(p2, p3, cols=2)
p1 +
geom_point() +
xlab("時間") +
ylab("病例數") +
ggtitle("麻疹發病趨勢")
p1 + geom_point(aes(color=factor(性別)) ) + geom_line(aes(color=factor(性別)) )
p1 + geom_line(aes(color=factor(性別)) ) + geom_point(aes(color=factor(性別)) )
## 確診案例
confirmed_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed_file <- 'time_series_covid19_confirmed_global.csv'
download.file(confirmed_url, confirmed_file)
confirmed_df <- read_csv('time_series_covid19_confirmed_global.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
taiwan_stat <- confirmed_df %>%
filter(`Country/Region` == 'Taiwan*')
date_col <- names(taiwan_stat)[5:length(names(taiwan_stat))]
taiwan_long_stat <- taiwan_stat %>%
gather(Date, Case_Number , date_col)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(date_col)` instead of `date_col` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
taiwan_long_stat$Date <- as.Date(taiwan_long_stat$Date, format='%m/%d/%y')
head(taiwan_long_stat)
## # A tibble: 6 x 6
## `Province/State` `Country/Region` Lat Long Date Case_Number
## <chr> <chr> <dbl> <dbl> <date> <dbl>
## 1 <NA> Taiwan* 23.7 121 2020-01-22 1
## 2 <NA> Taiwan* 23.7 121 2020-01-23 1
## 3 <NA> Taiwan* 23.7 121 2020-01-24 3
## 4 <NA> Taiwan* 23.7 121 2020-01-25 3
## 5 <NA> Taiwan* 23.7 121 2020-01-26 4
## 6 <NA> Taiwan* 23.7 121 2020-01-27 5
p1 <- ggplot(taiwan_long_stat, aes(x = Date, y= Case_Number))
p1 +
geom_line(color = 'red') +
xlab('時間') +
ylab('病例數') +
ggtitle('新冠肺炎台灣確診數趨勢圖')
p1 <- ggplot(taiwan_long_stat) # data
p1 +
aes(x = Date, y= Case_Number) + # aes
geom_line(color = 'red') + # geometry
xlab('時間') +
ylab('病例數') +
ggtitle('新冠肺炎台灣確診數趨勢圖')
## MAC 繪圖中文處理
使用 fc-list,檢查有什麼字體
#install.packages("showtext")
library(showtext)
## Warning: package 'showtext' was built under R version 4.0.2
## Loading required package: sysfonts
## Warning: package 'sysfonts' was built under R version 4.0.2
## Loading required package: showtextdb
## Warning: package 'showtextdb' was built under R version 4.0.2
showtext_auto(enable = TRUE)
font_add("Microsoft Jhenghei", "Microsoft Jhenghei.ttf")
plot(runif(10), xlab = "橫軸", ylab = "縱軸", main = "中文標題",
family = "Microsoft Jhenghei")
text(5, 0.7, "Jhenghei", cex = 3, family = "Microsoft Jhenghei")
curl::curl_download('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/cdc.Rdata', 'cdc.Rdata')
load('cdc.Rdata')
head(cdc)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
plot(wtdesire ~ weight, data=cdc )
ggplot(cdc) +
aes(x = weight, y = wtdesire) +
geom_point()
hist(cdc$weight)
ggplot(cdc) +
aes(x = weight) +
geom_histogram(bins = 500)
#sort(table(cdc$weight), decreasing = TRUE)
ggplot(cdc) +
aes(x = weight) +
geom_density(alpha = (0.2), stat = 'density') +
aes(color = factor(gender))
ggplot(cdc) +
aes(x = weight,color = factor(gender), fill = factor(gender)) +
geom_density(alpha = (0.2), stat = 'density')
ggplot(cdc) +
aes(x = weight, color = factor(gender), fill = factor(gender)) +
geom_histogram(bins = 50, alpha = 0.2)
boxplot(weight~gender, data = cdc)
ggplot(cdc) +
aes(x = gender, y= weight, fill = factor(gender)) +
geom_boxplot()
table(cdc$genhlth)
##
## excellent very good good fair poor
## 4657 6972 5675 2019 677
barplot(table(cdc$genhlth))
ggplot(cdc) +
aes(x = genhlth, fill= genhlth) +
geom_bar()
ggplot(cdc) +
aes(x = genhlth) +
geom_bar(fill='red')
pie(table(cdc$gender))
cdc_sex <- data.frame(table(cdc$gender))
cdc_sex
## Var1 Freq
## 1 m 9569
## 2 f 10431
ggplot(cdc_sex) +
aes(x="", y=Freq ,fill=Var1 ) +
geom_bar(width=1, stat = "identity") +
geom_text(aes(label=Freq, y =c(15000, 5000) ) , size=5) +
coord_polar(theta="y", start = 0)
confirmed_stat <- confirmed_df %>%
select(country = `Country/Region` , case_number = `8/18/20`) %>%
arrange(desc(case_number)) %>%
slice(0:10)
confirmed_stat
## # A tibble: 10 x 2
## country case_number
## <chr> <dbl>
## 1 US 5482416
## 2 Brazil 3407354
## 3 India 2767253
## 4 Russia 930276
## 5 South Africa 592144
## 6 Peru 541493
## 7 Mexico 531239
## 8 Colombia 489122
## 9 Chile 388855
## 10 Spain 364196
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
ggplot(confirmed_stat) +
geom_bar(aes(x=reorder(country, -case_number),y=case_number),stat="identity", fill= 'blue') +
xlab('國家') +
ylab('確診數') +
ggtitle('確診數排名前10國家') +
scale_y_continuous(labels = scales::comma)
?geom_bar
ggplot(cdc) +
aes(x = weight, y= wtdesire, color = factor(gender)) +
geom_point() +
geom_smooth(method = 'lm')
## `geom_smooth()` using formula 'y ~ x'
ggplot(cdc) +
aes(x = weight, y= height, color = factor(gender)) +
geom_point() +
geom_smooth(method = 'lm')
## `geom_smooth()` using formula 'y ~ x'
ggplot(cdc) +
aes(x = weight, y= wtdesire, color = factor(gender)) +
geom_point() +
geom_smooth(method = 'lm')
## `geom_smooth()` using formula 'y ~ x'
ggplot(cdc) +
aes(x = gender, y = weight) +
geom_jitter() +
geom_boxplot(aes(fill = gender))
w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) + geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')
w +
facet_grid(gender ~.)
## `geom_smooth()` using formula 'y ~ x'
w +
facet_grid(genhlth ~.)
## `geom_smooth()` using formula 'y ~ x'
w +
facet_grid(.~genhlth)
## `geom_smooth()` using formula 'y ~ x'
w +
facet_grid(gender~genhlth)
## `geom_smooth()` using formula 'y ~ x'
w +
facet_grid(genhlth~gender)
## `geom_smooth()` using formula 'y ~ x'
## 確診案例
confirmed_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed_file <- 'time_series_covid19_confirmed_global.csv'
download.file(confirmed_url, confirmed_file)
##死亡案例
deaths_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
deaths_file <- 'time_series_covid19_deaths_global.csv'
download.file(deaths_url, deaths_file)
## 康復案例
recovered_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
recovered_file <- 'time_series_covid19_recovered_global.csv'
download.file(recovered_url, recovered_file)
library(tidyverse)
recovered_df <- read_csv('time_series_covid19_recovered_global.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
recovered_df$Case <- 'recovered'
dim(recovered_df)
## [1] 253 215
deaths_df <- read_csv('time_series_covid19_deaths_global.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
deaths_df$Case <- 'deaths'
dim(deaths_df)
## [1] 266 215
confirmed_df <- read_csv('time_series_covid19_confirmed_global.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
confirmed_df$Case <- 'confirmed'
dim(confirmed_df)
## [1] 266 215
merged_df <- list(confirmed = confirmed_df, deaths = deaths_df, recovered = recovered_df)
covid19_ts_df <- do.call(rbind, merged_df)
dim(covid19_ts_df)
## [1] 785 215
col_names <- colnames(covid19_ts_df)
#col_names
date_cols <- col_names[5:(length(col_names) - 1) ]
#date_cols
covid19_tidy_df <- covid19_ts_df %>%
gather(Date, Case_Number, date_cols)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(date_cols)` instead of `date_cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
covid19_tidy_df$Date <- as.Date(covid19_tidy_df$Date, format = '%m/%d/%y')
head(covid19_tidy_df)
## # A tibble: 6 x 7
## `Province/State` `Country/Region` Lat Long Case Date Case_Number
## <chr> <chr> <dbl> <dbl> <chr> <date> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 confi… 2020-01-22 0
## 2 <NA> Albania 41.2 20.2 confi… 2020-01-22 0
## 3 <NA> Algeria 28.0 1.66 confi… 2020-01-22 0
## 4 <NA> Andorra 42.5 1.52 confi… 2020-01-22 0
## 5 <NA> Angola -11.2 17.9 confi… 2020-01-22 0
## 6 <NA> Antigua and Barbu… 17.1 -61.8 confi… 2020-01-22 0
taiwan_df <- covid19_tidy_df %>%
filter(`Country/Region` == 'Taiwan*')
g <- ggplot(taiwan_df) +
aes(x = Date, y = Case_Number, color= factor(`Case`)) +
geom_line()
g + facet_grid(`Case` ~ .)
g + facet_grid(.~ `Case`)
#covid19_tidy_df$`Country/Region` %%
covid19_tidy_df
## # A tibble: 164,850 x 7
## `Province/State` `Country/Region` Lat Long Case Date Case_Number
## <chr> <chr> <dbl> <dbl> <chr> <date> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 conf… 2020-01-22 0
## 2 <NA> Albania 41.2 20.2 conf… 2020-01-22 0
## 3 <NA> Algeria 28.0 1.66 conf… 2020-01-22 0
## 4 <NA> Andorra 42.5 1.52 conf… 2020-01-22 0
## 5 <NA> Angola -11.2 17.9 conf… 2020-01-22 0
## 6 <NA> Antigua and Bar… 17.1 -61.8 conf… 2020-01-22 0
## 7 <NA> Argentina -38.4 -63.6 conf… 2020-01-22 0
## 8 <NA> Armenia 40.1 45.0 conf… 2020-01-22 0
## 9 Australian Capita… Australia -35.5 149. conf… 2020-01-22 0
## 10 New South Wales Australia -33.9 151. conf… 2020-01-22 0
## # … with 164,840 more rows
a <- inner_join(covid19_tidy_df, confirmed_stat, by= c("Country/Region"= "country"))
g <- ggplot(a) +
aes(x = Date, y = Case_Number, color= factor(`Country/Region`)) +
geom_line()
g + facet_grid(`Country/Region`~`Case`)