Demo20200826

curl::curl_download('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/cdc.Rdata', 'cdc.Rdata')

load('cdc.Rdata')

head(cdc)

##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.2

## ── Attaching packages ──────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

g <- ggplot(cdc) 

g + 
  aes(x = weight, y= height, color = factor(gender)) + 
  geom_point()

g + 
  aes(x = weight, y= wtdesire, color = factor(gender)) + 
  geom_point() +
  geom_smooth(method = 'lm') +
  xlim(200,300) +
  ylim(100,200)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 16963 rows containing non-finite values (stat_smooth).

## Warning: Removed 16963 rows containing missing values (geom_point).

g + 
  aes(x = weight) +
  geom_histogram(binwidth = 10, color ='black') +
  xlim(0,1000)

## Warning: Removed 2 rows containing missing values (geom_bar).

g + 
  aes(x = weight) +
  geom_histogram(binwidth = 10, color ='black') +
  coord_cartesian(ylim =c(0,1000))

g + 
  aes(x = weight) +
  geom_histogram(binwidth = 10, color ='black') +
  xlim(400,500)

## Warning: Removed 19992 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

g + 
  aes(x = weight, y = wtdesire, color = factor(gender)) +
  geom_point() +
  geom_smooth(method = 'lm') + 
  facet_grid(gender ~ genhlth) +
  coord_cartesian(ylim = c(100,200))

## `geom_smooth()` using formula 'y ~ x'

## 確診案例
confirmed_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed_file <- 'time_series_covid19_confirmed_global.csv'
download.file(confirmed_url, confirmed_file)

##死亡案例
deaths_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
deaths_file <- 'time_series_covid19_deaths_global.csv'
download.file(deaths_url, deaths_file)

## 康復案例
recovered_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
recovered_file <- 'time_series_covid19_recovered_global.csv'
download.file(recovered_url, recovered_file)

library(tidyverse)
recovered_df <- read_csv('time_series_covid19_recovered_global.csv')

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )

## See spec(...) for full column specifications.

recovered_df$Case <- 'recovered'
dim(recovered_df)

## [1] 253 222

deaths_df <- read_csv('time_series_covid19_deaths_global.csv')

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.

deaths_df$Case <- 'deaths'
dim(deaths_df)

## [1] 266 222

confirmed_df <- read_csv('time_series_covid19_confirmed_global.csv')

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.

confirmed_df$Case <- 'confirmed'
dim(confirmed_df)

## [1] 266 222

merged_df <- list(confirmed = confirmed_df, deaths = deaths_df, recovered = recovered_df)

covid19_ts_df <- do.call(rbind, merged_df)
dim(covid19_ts_df)

## [1] 785 222

col_names <- colnames(covid19_ts_df)
#col_names
date_cols <- col_names[5:(length(col_names) - 1) ]
#date_cols

covid19_tidy_df <- covid19_ts_df %>%
   gather(Date, Case_Number, date_cols)

## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(date_cols)` instead of `date_cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

head(covid19_tidy_df)

## # A tibble: 6 x 7
##   `Province/State` `Country/Region`      Lat   Long Case      Date   Case_Number
##   <chr>            <chr>               <dbl>  <dbl> <chr>     <chr>        <dbl>
## 1 <NA>             Afghanistan          33.9  67.7  confirmed 1/22/…           0
## 2 <NA>             Albania              41.2  20.2  confirmed 1/22/…           0
## 3 <NA>             Algeria              28.0   1.66 confirmed 1/22/…           0
## 4 <NA>             Andorra              42.5   1.52 confirmed 1/22/…           0
## 5 <NA>             Angola              -11.2  17.9  confirmed 1/22/…           0
## 6 <NA>             Antigua and Barbuda  17.1 -61.8  confirmed 1/22/…           0

covid19_tidy_df$Date <- as.Date(covid19_tidy_df$Date, format = '%m/%d/%y')

head(covid19_tidy_df)

## # A tibble: 6 x 7
##   `Province/State` `Country/Region`     Lat   Long Case   Date       Case_Number
##   <chr>            <chr>              <dbl>  <dbl> <chr>  <date>           <dbl>
## 1 <NA>             Afghanistan         33.9  67.7  confi… 2020-01-22           0
## 2 <NA>             Albania             41.2  20.2  confi… 2020-01-22           0
## 3 <NA>             Algeria             28.0   1.66 confi… 2020-01-22           0
## 4 <NA>             Andorra             42.5   1.52 confi… 2020-01-22           0
## 5 <NA>             Angola             -11.2  17.9  confi… 2020-01-22           0
## 6 <NA>             Antigua and Barbu…  17.1 -61.8  confi… 2020-01-22           0

filtered_df <- covid19_tidy_df %>%
  filter(`Country/Region` %in% c('US', 'Taiwan*'))

g <-  ggplot(filtered_df)
#head(filtered_df)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

g + 
  aes(x = Date, y = Case_Number, color = factor(`Case`)) +
  geom_line() +
  facet_grid(`Country/Region` ~ `Case`) + 
  xlim(as.Date('2020-01-01'),as.Date('2020-09-01')) +
  scale_y_continuous(labels = scales::comma) + 
  scale_y_log10()

## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

## Warning: Transformation introduced infinite values in continuous y-axis

## Theme

library(showtext)

## Warning: package 'showtext' was built under R version 4.0.2

## Loading required package: sysfonts

## Warning: package 'sysfonts' was built under R version 4.0.2

## Loading required package: showtextdb

## Warning: package 'showtextdb' was built under R version 4.0.2

showtext_auto(enable = TRUE)
font_add("Microsoft Jhenghei", "Microsoft Jhenghei.ttf")
plot(runif(10), xlab = "橫軸", ylab = "縱軸", main = "中文標題",
  family = "Microsoft Jhenghei")
text(5, 0.7, "Jhenghei", cex = 3, family = "Microsoft Jhenghei")

g + 
  aes(x = Date, y = Case_Number, color = factor(`Case`)) +
  geom_line() +
  facet_grid(`Country/Region` ~ `Case`) + 
  xlim(as.Date('2020-01-01'),as.Date('2020-09-01')) +
  scale_y_continuous(labels = scales::comma) +
  xlab('日期') +
  ylab('數量') +
  ggtitle('台灣 v.s. 美國案例數量')

g + 
  aes(x = Date, y = Case_Number, color = factor(`Case`)) +
  geom_line() +
  facet_grid(`Country/Region` ~ `Case`) + 
  xlim(as.Date('2020-01-01'),as.Date('2020-09-01')) +
  scale_y_continuous(labels = scales::comma) +
  xlab('日期') +
  ylab('數量') +
  ggtitle('台灣 v.s. 美國案例數量') + 
  theme(axis.title.x = element_text(color = 'DarkGreen', size = 10),
        axis.title.y = element_text(color = 'Red', size = 10), 
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15)) + 
   theme(plot.title = element_text(size = 20, hjust = 0.5))

w <- ggplot(cdc)

w1 <- w + aes(x = weight, y = wtdesire, color = gender) + geom_point() + xlab('體重') + ylab('理想體重') + ggtitle('體重 v.s. 理想體重') + 
  scale_color_manual(name ='性別',labels = c("MALE", "FEMALE"), values = c("blue", "red")) +
  theme(legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        legend.position = c(1,1),
        legend.justification = c(1,1)) 
  
w1

ggsave(w1, file= 'test.png')

## Saving 7 x 5 in image

ggsave(w1, file="test2.png", width=6, height=4)


filtered_df <- covid19_tidy_df %>%
  filter(`Country/Region` == 'Taiwan*')

g2 <- g <- ggplot(filtered_df) + 
  aes(x = Date, y = Case_Number, color = `Case`) + 
  geom_line() + 
  facet_grid(`Case`~.) +
  xlab('日期') + 
  ylab('個案數量') +
  ggtitle('台灣近期新冠肺炎個案人數') + 
  scale_color_manual(name ='個案情況',labels = c("確診", "死亡", '康復'), values = c('blue', 'green', 'red')) +
  theme(plot.title = element_text(size = 20, hjust = 0.5)
        ) 


#g2
ggsave(g2, file = 'taiwan_covid19.png')

## Saving 7 x 5 in image

#install.packages("dostats")
#library(dostats)
user_case_staging <- data.frame(感染國家 = c('TW', 'US', NA, 'US'), 備註= c(NA, '加州', '美國', '費城'))

match('美國', user_case_staging$備註)

## [1] 3

user_case_staging$感染國家[grepl('美國', user_case_staging$備註)] = 'US'

user_case_production <- user_case_staging

user_case_staging <- data.frame(感染國家 = c('TW', 'US', NA, 'US'), 備註= c(NA, '加州', '美國', '費城'))
user_case_production <- user_case_staging %>% 
  mutate(個案國家 = ifelse(((`感染國家` == 'US') |  grepl('美國',`備註`)), 'US', `感染國家` ))
user_case_production

##   感染國家 備註 個案國家
## 1       TW <NA>       TW
## 2       US 加州       US
## 3     <NA> 美國       US
## 4       US 費城       US

##ggmap

#install.packages('ggmap')
#library(ggmap)
#?register_google
#map <- get_map(location = 'Taiwan', zoom = 7)
#ggmap(map)

leaftlet

#install.packages('leaflet')
library(leaflet)

## Warning: package 'leaflet' was built under R version 4.0.2

library(leaflet)

m <- leaflet() %>%
  addTiles() %>%  # Add default OpenStreetMap map tiles
  addMarkers(lng=121, lat=25, popup="Taiwan")

max(covid19_tidy_df$Date)

## [1] "2020-08-25"

covid19_lastest_confirmed_df <- 
covid19_tidy_df %>%
  filter(`Date` == '2020-08-25', `Case` == 'confirmed')

head(covid19_lastest_confirmed_df)

## # A tibble: 6 x 7
##   `Province/State` `Country/Region`     Lat   Long Case   Date       Case_Number
##   <chr>            <chr>              <dbl>  <dbl> <chr>  <date>           <dbl>
## 1 <NA>             Afghanistan         33.9  67.7  confi… 2020-08-25       38070
## 2 <NA>             Albania             41.2  20.2  confi… 2020-08-25        8759
## 3 <NA>             Algeria             28.0   1.66 confi… 2020-08-25       42228
## 4 <NA>             Andorra             42.5   1.52 confi… 2020-08-25        1060
## 5 <NA>             Angola             -11.2  17.9  confi… 2020-08-25        2283
## 6 <NA>             Antigua and Barbu…  17.1 -61.8  confi… 2020-08-25          94

?addCircleMarkers

leaflet(covid19_lastest_confirmed_df) %>% addTiles() %>%
  addCircleMarkers(
    lat = covid19_lastest_confirmed_df$Lat, 
    lng = covid19_lastest_confirmed_df$Long, 
    radius = log(covid19_lastest_confirmed_df$Case_Number),
    color = ifelse(covid19_lastest_confirmed_df$Case_Number >=1000000 , 'red', 'blue'),
    stroke = FALSE, 
    fillOpacity = 0.5
  )

dengue <- read_csv('https://od.cdc.gov.tw/eic/Dengue_Daily_last12m.csv')

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   發病日 = col_date(format = ""),
##   個案研判日 = col_date(format = ""),
##   通報日 = col_date(format = ""),
##   確定病例數 = col_double(),
##   內政部居住縣市代碼 = col_double(),
##   內政部居住鄉鎮代碼 = col_double()
## )

## See spec(...) for full column specifications.

head(dengue)

## # A tibble: 6 x 26
##   發病日     個案研判日 通報日     性別  年齡層 居住縣市 居住鄉鎮 居住村里
##   <date>     <date>     <date>     <chr> <chr>  <chr>    <chr>    <chr>   
## 1 2019-08-01 2019-08-03 2019-08-02 男    35-39  高雄市   苓雅區   福康里  
## 2 2019-08-01 2019-08-03 2019-08-02 男    15-19  桃園市   龜山區   陸光里  
## 3 2019-08-01 2019-08-03 2019-08-02 女    40-44  桃園市   觀音區   樹林里  
## 4 2019-08-01 2019-08-06 2019-08-06 男    45-49  高雄市   仁武區   八卦里  
## 5 2019-08-01 2019-08-13 2019-08-06 女    15-19  高雄市   左營區   新下里  
## 6 2019-08-02 2019-08-03 2019-08-02 女    45-49  桃園市   龜山區   陸光里  
## # … with 18 more variables: 最小統計區 <chr>, 最小統計區中心點X <chr>,
## #   最小統計區中心點Y <chr>, 一級統計區 <chr>, 二級統計區 <chr>,
## #   感染縣市 <chr>, 感染鄉鎮 <chr>, 感染村里 <chr>, 是否境外移入 <chr>,
## #   感染國家 <chr>, 確定病例數 <dbl>, 居住村里代碼 <chr>, 感染村里代碼 <chr>,
## #   血清型 <chr>, 內政部居住縣市代碼 <dbl>, 內政部居住鄉鎮代碼 <dbl>,
## #   內政部感染縣市代碼 <chr>, 內政部感染鄉鎮代碼 <chr>

dengue$最小統計區中心點X <- as.double(dengue$最小統計區中心點X)

## Warning: NAs introduced by coercion

dengue$最小統計區中心點Y <- as.double(dengue$最小統計區中心點Y)

## Warning: NAs introduced by coercion

formattext <- function(gender, age, type){
  sprintf('性別：%s<br>年紀：%s<br>型別：%s', gender, age, type)
}

leaflet(dengue) %>% addTiles() %>% addMarkers(
  lat =  dengue$最小統計區中心點Y,
  lng = dengue$最小統計區中心點X,
  label=as.character(dengue$血清型),
  popup = as.character(formattext(dengue$性別, dengue$年齡層,dengue$血清型)),
  clusterOptions = markerClusterOptions()
)

## Warning in validateCoords(lng, lat, funcName): Data contains 1 rows with either
## missing or invalid lat/lon values and will be ignored

leaflet(dengue) %>%  addProviderTiles(providers$CartoDB.Positron) %>% addMarkers(
  lat =  dengue$最小統計區中心點Y,
  lng = dengue$最小統計區中心點X,
  label=as.character(dengue$血清型),
  popup = as.character(formattext(dengue$性別, dengue$年齡層,dengue$血清型)),
  clusterOptions = markerClusterOptions()
)

## Warning in validateCoords(lng, lat, funcName): Data contains 1 rows with either
## missing or invalid lat/lon values and will be ignored

GeoJSON

download.file('https://od.cdc.gov.tw/eic/DengueCluster_Taoyuan.geojson', 'DengueCluster_Taoyuan.geojson')
#install.packages('geojsonio')

library(geojsonio)

## Warning: package 'geojsonio' was built under R version 4.0.2

## 
## Attaching package: 'geojsonio'

## The following object is masked from 'package:base':
## 
##     pretty

dengue_geo <- geojsonio::geojson_read("DengueCluster_Taoyuan.geojson", what = "sp")

leaflet(dengue_geo) %>%
  addTiles() %>%
  addPolygons(stroke = FALSE, smoothFactor = 0.3, fillOpacity = 0.5,
    fillColor = 'red')

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

taiwan_filtered_df <- covid19_tidy_df %>%
  filter((`Country/Region` == 'Taiwan*') & (`Case` == 'confirmed')  )

#taiwan_filtered_df

fig<-plot_ly(taiwan_filtered_df, x = ~Date, y = ~Case_Number, type = 'scatter', mode = 'lines+markers')
fig

## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

taiwan_filtered_df <- covid19_tidy_df %>%
  filter((`Country/Region` == 'Taiwan*')   )

head(taiwan_filtered_df)

## # A tibble: 6 x 7
##   `Province/State` `Country/Region`   Lat  Long Case      Date       Case_Number
##   <chr>            <chr>            <dbl> <dbl> <chr>     <date>           <dbl>
## 1 <NA>             Taiwan*           23.7   121 confirmed 2020-01-22           1
## 2 <NA>             Taiwan*           23.7   121 deaths    2020-01-22           0
## 3 <NA>             Taiwan*           23.7   121 recovered 2020-01-22           0
## 4 <NA>             Taiwan*           23.7   121 confirmed 2020-01-23           1
## 5 <NA>             Taiwan*           23.7   121 deaths    2020-01-23           0
## 6 <NA>             Taiwan*           23.7   121 recovered 2020-01-23           0

taiwan_spread_df<-taiwan_filtered_df %>%
  spread(key = `Case`, value = `Case_Number`)

fig<-plot_ly(taiwan_spread_df, x = ~Date, y = ~confirmed, type = 'scatter', mode = 'lines+markers', name='確診')
fig %>% 
  add_trace(y = ~recovered, mode = 'lines+markers', name = '康復') %>%
  add_trace(y = ~deaths, mode = 'lines+markers', name = '死亡')

#?spread


library(plotly)
ds <- data.frame(labels = c("A", "B", "C"),
        values = c(10, 20, 30))
plot_ly(ds, labels = ds$labels, values = ds$values, type = "pie", hole=0.6) %>%
layout(title = "Donut Chart Example")

library(plotly)
month<- c(1,2,3,4,5)
taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)

plot_ly(x = month, y = taipei, fill = "tozeroy", name="taipei",type='scatter', mode= 'markers') %>%
add_trace(x = month, y = tainan, fill = "tozeroy",name="tainan") %>%
layout(yaxis = list(title = 'rainfall') )

library(plotly)
month<- c(1,2,3,4,5)
taipei<- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)
total <- taipei + tainan

y <- list(title="Rainfall")

plot_ly(x = month, y = taipei, fill = "tozeroy", name="taipei",type='scatter', mode= 'markers') %>% 
  add_trace(x = month, y = total, fill = "tonexty", name="tainan")

data("diamonds")
head(diamonds)

## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

d <- diamonds[sample(nrow(diamonds), 1000), ]

plot_ly(d, x = d$carat, y= d$price, color = d$clarity, type='scatter', mode= 'markers', size = d$carat, text= paste("Clarity", d$clarity))

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

p <- subplot(
plot_ly(taiwan_spread_df, x = ~Date, y = ~confirmed, type='scatter', mode = 'lines'),
plot_ly(taiwan_spread_df, x = ~Date, y = ~recovered, type='scatter', mode = 'lines'),
plot_ly(taiwan_spread_df, x = ~Date, y = ~deaths, type='scatter', mode = 'lines'),
margin=0.05
)
p %>% layout(showlegend=FALSE)

p <- subplot(
plot_ly(taiwan_spread_df, x = ~Date, y = ~confirmed, type='scatter', mode = 'lines'),
plot_ly(taiwan_spread_df, x = ~Date, y = ~recovered, type='scatter', mode = 'lines'),
plot_ly(taiwan_spread_df, x = ~Date, y = ~deaths, type='scatter', mode = 'lines'),
margin=0.05, 
nrows = 3
)
p %>% layout(showlegend=FALSE)

filtered_df <- covid19_tidy_df %>%
  filter(`Country/Region` == 'Taiwan*')

g2 <- ggplot(filtered_df) + 
  aes(x = Date, y = Case_Number, color = `Case`) + 
  geom_line() + 
  facet_grid(`Case`~.) +
  xlab('日期') + 
  ylab('個案數量') +
  ggtitle('台灣近期新冠肺炎個案人數') + 
  scale_color_manual(name ='個案情況',labels = c("確診", "死亡", '康復'), values = c('blue', 'green', 'red')) +
  theme(plot.title = element_text(size = 20, hjust = 0.5)
        ) 

fig <- ggplotly(g2)
fig

top10 <- covid19_tidy_df %>% 
  filter((`Date` == '2020-08-25') & `Case` == 'confirmed') %>%
  arrange(desc(Case_Number)) %>%
  head(10)

library(scales)
g <- ggplot(top10) +
  geom_bar(aes(x=reorder(`Country/Region`, -Case_Number),y=Case_Number),stat="identity", fill= 'blue') +
  xlab('國家') +
  ylab('確診數') + 
  ggtitle('確診數排名前10國家') +
  scale_y_continuous(labels = scales::comma)

fig <- ggplotly(g)
fig

Demo20200826

David Chiu

8/26/2020

leaftlet

GeoJSON