tidyverse

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
help(package = 'tidyverse')
tidyverse_packages()
##  [1] "broom"      "cli"        "crayon"     "dbplyr"     "dplyr"     
##  [6] "forcats"    "ggplot2"    "haven"      "hms"        "httr"      
## [11] "jsonlite"   "lubridate"  "magrittr"   "modelr"     "pillar"    
## [16] "purrr"      "readr"      "readxl"     "reprex"     "rlang"     
## [21] "rstudioapi" "rvest"      "stringr"    "tibble"     "tidyr"     
## [26] "xml2"       "tidyverse"

dplyr

library(dplyr)
help(package = 'dplyr')

read covid19 data

#COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/07-28-2020.csv

setwd('~/Desktop/')

curl::curl_download('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/07-28-2020.csv', destfile = '07-28-2020.csv')


download.file('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/07-28-2020.csv', destfile = '07-28-2020.csv')


library(readr)
covid19 <- read_csv("07-28-2020.csv")
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
head(covid19)
## # A tibble: 6 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1 45001 Abbev… South Carolina US             2020-07-29 04:35:11  34.2  -82.5
## 2 22001 Acadia Louisiana      US             2020-07-29 04:35:11  30.3  -92.4
## 3 51001 Accom… Virginia       US             2020-07-29 04:35:11  37.8  -75.6
## 4 16001 Ada    Idaho          US             2020-07-29 04:35:11  43.5 -116. 
## 5 19001 Adair  Iowa           US             2020-07-29 04:35:11  41.3  -94.5
## 6 21001 Adair  Kentucky       US             2020-07-29 04:35:11  37.1  -85.3
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
#View(covid19)

filter

#covid19[covid19$Country_Region == 'US', ]

#covid19[covid19[,'Country_Region'] == 'US', ]

# filter(covid19, Country_Region  == 'US')

# AND
filter(covid19, Country_Region  == 'US' & Confirmed > 10000) %>% head()
## # A tibble: 6 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1  6001 Alame… California     US             2020-07-29 04:35:11  37.6 -122. 
## 2 24005 Balti… Maryland       US             2020-07-29 04:35:11  39.5  -76.6
## 3 24510 Balti… Maryland       US             2020-07-29 04:35:11  39.3  -76.6
## 4 34003 Bergen New Jersey     US             2020-07-29 04:35:11  41.0  -74.1
## 5 48029 Bexar  Texas          US             2020-07-29 04:35:11  29.4  -98.5
## 6 12011 Browa… Florida        US             2020-07-29 04:35:11  26.2  -80.5
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
filter(covid19, Country_Region  == 'US' , Confirmed > 10000) %>% head()
## # A tibble: 6 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1  6001 Alame… California     US             2020-07-29 04:35:11  37.6 -122. 
## 2 24005 Balti… Maryland       US             2020-07-29 04:35:11  39.5  -76.6
## 3 24510 Balti… Maryland       US             2020-07-29 04:35:11  39.3  -76.6
## 4 34003 Bergen New Jersey     US             2020-07-29 04:35:11  41.0  -74.1
## 5 48029 Bexar  Texas          US             2020-07-29 04:35:11  29.4  -98.5
## 6 12011 Browa… Florida        US             2020-07-29 04:35:11  26.2  -80.5
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
# OR
filter(covid19, Country_Region  == 'US' | Confirmed > 10000) %>% head()
## # A tibble: 6 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1 45001 Abbev… South Carolina US             2020-07-29 04:35:11  34.2  -82.5
## 2 22001 Acadia Louisiana      US             2020-07-29 04:35:11  30.3  -92.4
## 3 51001 Accom… Virginia       US             2020-07-29 04:35:11  37.8  -75.6
## 4 16001 Ada    Idaho          US             2020-07-29 04:35:11  43.5 -116. 
## 5 19001 Adair  Iowa           US             2020-07-29 04:35:11  41.3  -94.5
## 6 21001 Adair  Kentucky       US             2020-07-29 04:35:11  37.1  -85.3
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
# IN
filter(covid19, Province_State %in% c('New York', 'California') ) %>% head()
## # A tibble: 6 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1  6001 Alame… California     US             2020-07-29 04:35:11  37.6 -122. 
## 2 36001 Albany New York       US             2020-07-29 04:35:11  42.6  -74.0
## 3 36003 Alleg… New York       US             2020-07-29 04:35:11  42.3  -78.0
## 4  6003 Alpine California     US             2020-07-29 04:35:11  38.6 -120. 
## 5  6005 Amador California     US             2020-07-29 04:35:11  38.4 -121. 
## 6 36007 Broome New York       US             2020-07-29 04:35:11  42.2  -75.8
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>

select

head(covid19[,c('Country_Region', 'Confirmed')])
## # A tibble: 6 x 2
##   Country_Region Confirmed
##   <chr>              <dbl>
## 1 US                   269
## 2 US                  2229
## 3 US                  1069
## 4 US                  7378
## 5 US                    20
## 6 US                   185
select(covid19, Country_Region, Confirmed) %>% head()
## # A tibble: 6 x 2
##   Country_Region Confirmed
##   <chr>              <dbl>
## 1 US                   269
## 2 US                  2229
## 3 US                  1069
## 4 US                  7378
## 5 US                    20
## 6 US                   185
head(covid19)
## # A tibble: 6 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1 45001 Abbev… South Carolina US             2020-07-29 04:35:11  34.2  -82.5
## 2 22001 Acadia Louisiana      US             2020-07-29 04:35:11  30.3  -92.4
## 3 51001 Accom… Virginia       US             2020-07-29 04:35:11  37.8  -75.6
## 4 16001 Ada    Idaho          US             2020-07-29 04:35:11  43.5 -116. 
## 5 19001 Adair  Iowa           US             2020-07-29 04:35:11  41.3  -94.5
## 6 21001 Adair  Kentucky       US             2020-07-29 04:35:11  37.1  -85.3
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
covid19 %>% select(Country_Region:Confirmed) %>% head()
## # A tibble: 6 x 5
##   Country_Region Last_Update           Lat  Long_ Confirmed
##   <chr>          <dttm>              <dbl>  <dbl>     <dbl>
## 1 US             2020-07-29 04:35:11  34.2  -82.5       269
## 2 US             2020-07-29 04:35:11  30.3  -92.4      2229
## 3 US             2020-07-29 04:35:11  37.8  -75.6      1069
## 4 US             2020-07-29 04:35:11  43.5 -116.       7378
## 5 US             2020-07-29 04:35:11  41.3  -94.5        20
## 6 US             2020-07-29 04:35:11  37.1  -85.3       185
covid19 %>% select(- (FIPS:Country_Region)) %>% head()
## # A tibble: 6 x 10
##   Last_Update           Lat  Long_ Confirmed Deaths Recovered Active
##   <dttm>              <dbl>  <dbl>     <dbl>  <dbl>     <dbl>  <dbl>
## 1 2020-07-29 04:35:11  34.2  -82.5       269      6         0    263
## 2 2020-07-29 04:35:11  30.3  -92.4      2229     65         0   2164
## 3 2020-07-29 04:35:11  37.8  -75.6      1069     15         0   1054
## 4 2020-07-29 04:35:11  43.5 -116.       7378     50         0   7328
## 5 2020-07-29 04:35:11  41.3  -94.5        20      0         0     20
## 6 2020-07-29 04:35:11  37.1  -85.3       185     19         0    166
## # … with 3 more variables: Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
covid19 %>% select(starts_with('L')) %>% head()
## # A tibble: 6 x 3
##   Last_Update           Lat  Long_
##   <dttm>              <dbl>  <dbl>
## 1 2020-07-29 04:35:11  34.2  -82.5
## 2 2020-07-29 04:35:11  30.3  -92.4
## 3 2020-07-29 04:35:11  37.8  -75.6
## 4 2020-07-29 04:35:11  43.5 -116. 
## 5 2020-07-29 04:35:11  41.3  -94.5
## 6 2020-07-29 04:35:11  37.1  -85.3
california <- covid19[covid19$Province_State == 'California', ]
#california
sum(california[order(california[, c('Confirmed')], decreasing = TRUE), ]$Confirmed, na.rm=TRUE)
## [1] 470762
covid19 %>%
  filter(Province_State == 'California') %>%
  arrange(desc(Confirmed)) %>%
  select(Confirmed) %>%
  sum()
## [1] 470762
covid19 %>% filter(Province_State == 'California')  %>% select(Confirmed) %>% head()
## # A tibble: 6 x 1
##   Confirmed
##       <dbl>
## 1     10633
## 2         2
## 3        84
## 4       866
## 5        99
## 6       292
covid19 %>%
  filter(Province_State == 'California') %>%
  select(Confirmed) %>%
  head()
## # A tibble: 6 x 1
##   Confirmed
##       <dbl>
## 1     10633
## 2         2
## 3        84
## 4       866
## 5        99
## 6       292
covid19 %>% select(確診=Confirmed) %>% head()
## # A tibble: 6 x 1
##    確診
##   <dbl>
## 1   269
## 2  2229
## 3  1069
## 4  7378
## 5    20
## 6   185
covid19 %>% rename(確診=Confirmed) %>% head()
## # A tibble: 6 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1 45001 Abbev… South Carolina US             2020-07-29 04:35:11  34.2  -82.5
## 2 22001 Acadia Louisiana      US             2020-07-29 04:35:11  30.3  -92.4
## 3 51001 Accom… Virginia       US             2020-07-29 04:35:11  37.8  -75.6
## 4 16001 Ada    Idaho          US             2020-07-29 04:35:11  43.5 -116. 
## 5 19001 Adair  Iowa           US             2020-07-29 04:35:11  41.3  -94.5
## 6 21001 Adair  Kentucky       US             2020-07-29 04:35:11  37.1  -85.3
## # … with 7 more variables: 確診 <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>

practice

#table(covid19$Country_Region)
library(dplyr)

covid19 %>%
  filter(Country_Region == 'Taiwan*') %>%
  select(Country_Region, Confirmed, Deaths, Recovered)
## # A tibble: 1 x 4
##   Country_Region Confirmed Deaths Recovered
##   <chr>              <dbl>  <dbl>     <dbl>
## 1 Taiwan*              467      7       440

Arrange

covid19 %>%
  filter(Country_Region == 'US') %>%
  select(Province_State, Confirmed) %>%
  arrange(Confirmed) %>%
  head()
## # A tibble: 6 x 2
##   Province_State Confirmed
##   <chr>              <dbl>
## 1 Nebraska               0
## 2 Nebraska               0
## 3 Nebraska               0
## 4 New Mexico             0
## 5 Nevada                 0
## 6 Nebraska               0
covid19 %>%
  filter(Country_Region == 'US') %>%
  select(Admin2, Province_State, Confirmed) %>%
  arrange(desc(Confirmed)) %>%
  head(10)
## # A tibble: 10 x 3
##    Admin2        Province_State Confirmed
##    <chr>         <chr>              <dbl>
##  1 New York City New York          224249
##  2 Los Angeles   California        178642
##  3 Maricopa      Arizona           111446
##  4 Miami-Dade    Florida           110352
##  5 Cook          Illinois          103441
##  6 Harris        Texas              67655
##  7 Broward       Florida            51657
##  8 Dallas        Texas              48028
##  9 Nassau        New York           43059
## 10 Suffolk       New York           43024
covid19 %>%
  filter(Country_Region == 'US') %>%
  select(Admin2, Province_State, Confirmed) %>%
  arrange(desc(Confirmed)) %>%
  slice(0:10)
## # A tibble: 10 x 3
##    Admin2        Province_State Confirmed
##    <chr>         <chr>              <dbl>
##  1 New York City New York          224249
##  2 Los Angeles   California        178642
##  3 Maricopa      Arizona           111446
##  4 Miami-Dade    Florida           110352
##  5 Cook          Illinois          103441
##  6 Harris        Texas              67655
##  7 Broward       Florida            51657
##  8 Dallas        Texas              48028
##  9 Nassau        New York           43059
## 10 Suffolk       New York           43024
covid19 %>%
  filter(Country_Region == 'US') %>%
  select(Admin2, Province_State, Confirmed) %>%
  arrange(desc(Confirmed)) %>%
  slice(6:10)
## # A tibble: 5 x 3
##   Admin2  Province_State Confirmed
##   <chr>   <chr>              <dbl>
## 1 Harris  Texas              67655
## 2 Broward Florida            51657
## 3 Dallas  Texas              48028
## 4 Nassau  New York           43059
## 5 Suffolk New York           43024
covid19 %>%
  slice(6:10)
## # A tibble: 5 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1 21001 Adair  Kentucky       US             2020-07-29 04:35:11  37.1  -85.3
## 2 29001 Adair  Missouri       US             2020-07-29 04:35:11  40.2  -92.6
## 3 40001 Adair  Oklahoma       US             2020-07-29 04:35:11  35.9  -94.7
## 4  8001 Adams  Colorado       US             2020-07-29 04:35:11  39.9 -104. 
## 5 16003 Adams  Idaho          US             2020-07-29 04:35:11  44.9 -116. 
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
covid19 %>%
  slice_head(n=5)
## # A tibble: 5 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1 45001 Abbev… South Carolina US             2020-07-29 04:35:11  34.2  -82.5
## 2 22001 Acadia Louisiana      US             2020-07-29 04:35:11  30.3  -92.4
## 3 51001 Accom… Virginia       US             2020-07-29 04:35:11  37.8  -75.6
## 4 16001 Ada    Idaho          US             2020-07-29 04:35:11  43.5 -116. 
## 5 19001 Adair  Iowa           US             2020-07-29 04:35:11  41.3  -94.5
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
?slice_head

covid19 %>%
  slice_sample(prop = 0.01)
## # A tibble: 39 x 14
##     FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##    <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
##  1 26109 Menom… Michigan       US             2020-07-29 04:35:11  45.6  -87.6
##  2  2275 Wrang… Alaska         US             2020-07-29 04:35:11  56.3 -132. 
##  3 72049 Culeb… Puerto Rico    US             2020-07-29 04:35:11  18.3  -65.3
##  4 13003 Atkin… Georgia        US             2020-07-29 04:35:11  31.3  -82.9
##  5 54109 Wyomi… West Virginia  US             2020-07-29 04:35:11  37.6  -81.5
##  6  1105 Perry  Alabama        US             2020-07-29 04:35:11  32.6  -87.3
##  7 17151 Pope   Illinois       US             2020-07-29 04:35:11  37.4  -88.6
##  8 35025 Lea    New Mexico     US             2020-07-29 04:35:11  32.8 -103. 
##  9 16083 Twin … Idaho          US             2020-07-29 04:35:11  42.4 -115. 
## 10 19039 Clarke Iowa           US             2020-07-29 04:35:11  41.0  -93.8
## # … with 29 more rows, and 7 more variables: Confirmed <dbl>, Deaths <dbl>,
## #   Recovered <dbl>, Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
covid19 %>% 
  slice_max(Confirmed, n= 3)
## # A tibble: 3 x 14
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl> <dbl>
## 1    NA <NA>   Sao Paulo      Brazil         2020-07-29 04:35:11 -23.6 -46.6
## 2    NA <NA>   <NA>           South Africa   2020-07-29 04:35:11 -30.6  22.9
## 3    NA <NA>   Maharashtra    India          2020-07-29 04:35:11  19.4  76.1
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
covid19 %>% 
  slice_min(Confirmed, n= 3)
## # A tibble: 50 x 14
##     FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##    <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
##  1 31005 Arthur Nebraska       US             2020-07-29 04:35:11  41.6 -102. 
##  2 31009 Blaine Nebraska       US             2020-07-29 04:35:11  41.9 -100. 
##  3 31017 Brown  Nebraska       US             2020-07-29 04:35:11  42.4  -99.9
##  4 35011 De Ba… New Mexico     US             2020-07-29 04:35:11  34.3 -104. 
##  5 32009 Esmer… Nevada         US             2020-07-29 04:35:11  37.8 -118. 
##  6 31075 Grant  Nebraska       US             2020-07-29 04:35:11  41.9 -102. 
##  7 31085 Hayes  Nebraska       US             2020-07-29 04:35:11  40.5 -101. 
##  8 31103 Keya … Nebraska       US             2020-07-29 04:35:11  42.9  -99.7
##  9 31113 Logan  Nebraska       US             2020-07-29 04:35:11  41.6 -100. 
## 10 31115 Loup   Nebraska       US             2020-07-29 04:35:11  41.9  -99.5
## # … with 40 more rows, and 7 more variables: Confirmed <dbl>, Deaths <dbl>,
## #   Recovered <dbl>, Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>
covid19 %>%
  select(Admin2, Province_State, Country_Region, Confirmed, Deaths, `Case-Fatality_Ratio`) %>%
  mutate(portion = (Deaths / Confirmed) * 100 ) %>%
  head()
## # A tibble: 6 x 7
##   Admin2 Province_State Country_Region Confirmed Deaths `Case-Fatality_… portion
##   <chr>  <chr>          <chr>              <dbl>  <dbl>            <dbl>   <dbl>
## 1 Abbev… South Carolina US                   269      6            2.23    2.23 
## 2 Acadia Louisiana      US                  2229     65            2.92    2.92 
## 3 Accom… Virginia       US                  1069     15            1.40    1.40 
## 4 Ada    Idaho          US                  7378     50            0.678   0.678
## 5 Adair  Iowa           US                    20      0            0       0    
## 6 Adair  Kentucky       US                   185     19           10.3    10.3
covid19 %>%
  select(Admin2, Province_State, Country_Region, Confirmed, Deaths, `Case-Fatality_Ratio`) %>%
  mutate(portion = (Deaths / Confirmed) * 100 ) %>%
  select(portion, everything())
## # A tibble: 3,932 x 7
##    portion Admin2 Province_State Country_Region Confirmed Deaths
##      <dbl> <chr>  <chr>          <chr>              <dbl>  <dbl>
##  1   2.23  Abbev… South Carolina US                   269      6
##  2   2.92  Acadia Louisiana      US                  2229     65
##  3   1.40  Accom… Virginia       US                  1069     15
##  4   0.678 Ada    Idaho          US                  7378     50
##  5   0     Adair  Iowa           US                    20      0
##  6  10.3   Adair  Kentucky       US                   185     19
##  7   0     Adair  Missouri       US                   123      0
##  8   1.88  Adair  Oklahoma       US                   266      5
##  9   2.94  Adams  Colorado       US                  5713    168
## 10   0     Adams  Idaho          US                    17      0
## # … with 3,922 more rows, and 1 more variable: `Case-Fatality_Ratio` <dbl>
covid19 <- covid19 %>% mutate(portion = Deaths/Confirmed)


covid19 %>% head()
## # A tibble: 6 x 15
##    FIPS Admin2 Province_State Country_Region Last_Update           Lat  Long_
##   <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>  <dbl>
## 1 45001 Abbev… South Carolina US             2020-07-29 04:35:11  34.2  -82.5
## 2 22001 Acadia Louisiana      US             2020-07-29 04:35:11  30.3  -92.4
## 3 51001 Accom… Virginia       US             2020-07-29 04:35:11  37.8  -75.6
## 4 16001 Ada    Idaho          US             2020-07-29 04:35:11  43.5 -116. 
## 5 19001 Adair  Iowa           US             2020-07-29 04:35:11  41.3  -94.5
## 6 21001 Adair  Kentucky       US             2020-07-29 04:35:11  37.1  -85.3
## # … with 8 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>, portion <dbl>
covid19 %>%
 transmute(portion = Deaths/Confirmed) %>% 
  head()
## # A tibble: 6 x 1
##   portion
##     <dbl>
## 1 0.0223 
## 2 0.0292 
## 3 0.0140 
## 4 0.00678
## 5 0      
## 6 0.103
covid19 %>% relocate(FIPS:Long_, .after = Confirmed) %>% head()
## # A tibble: 6 x 15
##   Confirmed  FIPS Admin2 Province_State Country_Region Last_Update           Lat
##       <dbl> <dbl> <chr>  <chr>          <chr>          <dttm>              <dbl>
## 1       269 45001 Abbev… South Carolina US             2020-07-29 04:35:11  34.2
## 2      2229 22001 Acadia Louisiana      US             2020-07-29 04:35:11  30.3
## 3      1069 51001 Accom… Virginia       US             2020-07-29 04:35:11  37.8
## 4      7378 16001 Ada    Idaho          US             2020-07-29 04:35:11  43.5
## 5        20 19001 Adair  Iowa           US             2020-07-29 04:35:11  41.3
## 6       185 21001 Adair  Kentucky       US             2020-07-29 04:35:11  37.1
## # … with 8 more variables: Long_ <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## #   `Case-Fatality_Ratio` <dbl>, portion <dbl>
3/ 0
## [1] Inf
covid19 %>%
  filter(Confirmed > 0) %>%
  mutate(CFR = Deaths/ Confirmed) %>%
  arrange(desc(CFR)) %>%
  select(國家= Country_Region, 省份= Province_State, 
           確診數= Confirmed, 死亡數=Deaths, 康復數= Recovered,
           確診死亡率=CFR)
## # A tibble: 3,882 x 6
##    國家  省份         確診數 死亡數 康復數 確診死亡率
##    <chr> <chr>         <dbl>  <dbl>  <dbl>      <dbl>
##  1 US    Louisiana        10    112      0     11.2  
##  2 US    Puerto Rico     432    209      0      0.484
##  3 US    Rhode Island   2112   1005      0      0.476
##  4 Yemen <NA>           1703    484    840      0.284
##  5 US    New Mexico        4      1      0      0.25 
##  6 US    Texas             4      1      0      0.25 
##  7 US    Montana           4      1      0      0.25 
##  8 US    Maine            60     14      0      0.233
##  9 US    Texas             9      2      0      0.222
## 10 US    Kansas            9      2      0      0.222
## # … with 3,872 more rows
g <- c(1,1,2,2,1)
grades <- c(50,60,80,90,55)
tapply(grades, g, mean)
##  1  2 
## 55 85
head(tapply(covid19$Confirmed, covid19$Country_Region, sum))
##         Afghanistan             Albania             Algeria             Andorra 
##               36368                4997               28615                 907 
##              Angola Antigua and Barbuda 
##                1000                  86
covid19 %>%
  select(Country_Region, Confirmed) %>%
  group_by(Country_Region) %>%
  summarize(Confirmed_Total = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 188 x 2
##    Country_Region      Confirmed_Total
##    <chr>                         <dbl>
##  1 Afghanistan                   36368
##  2 Albania                        4997
##  3 Algeria                       28615
##  4 Andorra                         907
##  5 Angola                         1000
##  6 Antigua and Barbuda              86
##  7 Argentina                    173355
##  8 Armenia                       37629
##  9 Australia                     15582
## 10 Austria                       20677
## # … with 178 more rows
covid19 %>%
  select(Country_Region, Confirmed, Deaths, Recovered) %>%
  group_by(Country_Region) %>%
  summarize_each(funs(sum), Confirmed, Deaths, Recovered)
## Warning: `summarise_each_()` is deprecated as of dplyr 0.7.0.
## Please use `across()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## # A tibble: 188 x 4
##    Country_Region      Confirmed Deaths Recovered
##    <chr>                   <dbl>  <dbl>     <dbl>
##  1 Afghanistan             36368   1270     25358
##  2 Albania                  4997    148      2789
##  3 Algeria                 28615   1174     19233
##  4 Andorra                   907     52       803
##  5 Angola                   1000     47       266
##  6 Antigua and Barbuda        86      3        65
##  7 Argentina              173355   3179     75083
##  8 Armenia                 37629    719     27357
##  9 Australia               15582    176      9617
## 10 Austria                 20677    713     18379
## # … with 178 more rows
covid19 %>%
  select(Confirmed, Deaths, Recovered) %>%
  summarise_all(sum)
## # A tibble: 1 x 3
##   Confirmed Deaths Recovered
##       <dbl>  <dbl>     <dbl>
## 1  16682030 659374   9711187
659374 / 16682030
## [1] 0.039526
covid19 %>%
  summarise_if(is.numeric, sum)
## # A tibble: 1 x 10
##    FIPS   Lat Long_ Confirmed Deaths Recovered Active Incidence_Rate
##   <dbl> <dbl> <dbl>     <dbl>  <dbl>     <dbl>  <dbl>          <dbl>
## 1    NA    NA    NA  16682030 659374   9711187     NA             NA
## # … with 2 more variables: `Case-Fatality_Ratio` <dbl>, portion <dbl>
covid19 %>%
  select(Country_Region, Confirmed, Deaths, Recovered) %>%
  group_by(Country_Region) %>%
  summarise_each(funs(min,max,sum), Confirmed, Deaths, Recovered)
## # A tibble: 188 x 10
##    Country_Region Confirmed_min Deaths_min Recovered_min Confirmed_max
##    <chr>                  <dbl>      <dbl>         <dbl>         <dbl>
##  1 Afghanistan            36368       1270         25358         36368
##  2 Albania                 4997        148          2789          4997
##  3 Algeria                28615       1174         19233         28615
##  4 Andorra                  907         52           803           907
##  5 Angola                  1000         47           266          1000
##  6 Antigua and B…            86          3            65            86
##  7 Argentina             173355       3179         75083        173355
##  8 Armenia                37629        719         27357         37629
##  9 Australia                 31          0            30          9304
## 10 Austria                20677        713         18379         20677
## # … with 178 more rows, and 5 more variables: Deaths_max <dbl>,
## #   Recovered_max <dbl>, Confirmed_sum <dbl>, Deaths_sum <dbl>,
## #   Recovered_sum <dbl>
length(table(covid19$Country_Region))
## [1] 188
covid19 %>%
  select(Country_Region) %>%
  summarize_each(funs(n()))
## # A tibble: 1 x 1
##   Country_Region
##            <int>
## 1           3932
# SELECT DISTINCT(Country_Region) FROM covid19;
covid19 %>%
  select(Country_Region) %>%
    summarise_each(funs(n_distinct(Country_Region)))
## # A tibble: 1 x 1
##   Country_Region
##            <int>
## 1            188

practice 3

covid19 %>%
  group_by(Country_Region) %>%
  summarize(Confirmed_Total = sum(Confirmed) ) %>%
  arrange(desc(Confirmed_Total)) %>%
  head(10)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 10 x 2
##    Country_Region Confirmed_Total
##    <chr>                    <dbl>
##  1 US                     4351997
##  2 Brazil                 2483191
##  3 India                  1483156
##  4 Russia                  822060
##  5 South Africa            459761
##  6 Mexico                  402697
##  7 Peru                    389717
##  8 Chile                   349800
##  9 United Kingdom          302295
## 10 Iran                    296273
covid19 %>%
  group_by(Country_Region) %>%
  summarize_each(funs(sum), Confirmed, Deaths, Recovered ) %>%
  arrange(desc(Confirmed)) %>%
  head(10)
## # A tibble: 10 x 4
##    Country_Region Confirmed Deaths Recovered
##    <chr>              <dbl>  <dbl>     <dbl>
##  1 US               4351997 149256   1355363
##  2 Brazil           2483191  88539   1868749
##  3 India            1483156  33425    952743
##  4 Russia            822060  13483    611109
##  5 South Africa      459761   7257    287313
##  6 Mexico            402697  44876    308142
##  7 Peru              389717  18418    276452
##  8 Chile             349800   9240    322332
##  9 United Kingdom    302295  45963      1438
## 10 Iran              296273  16147    257019
#covid19 %>%
#  group_by(Country_Region) %>%
#  summarize_each(funs(sum), Confirmed, Deaths, Recovered ) %>%
#  mutate(CFR = Deaths/ Confirmed)
#  arrange(desc(CFR)) %>%
 # head(10)

Join Table

download.file('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/07-27-2020.csv', '07-27-2020.csv')

download.file('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/07-28-2020.csv', '07-28-2020.csv')


library(readr)
covid_0728 <- read_csv('07-28-2020.csv')
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
covid_0727 <- read_csv('07-27-2020.csv')
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
covid_0728 %>%
  filter(Country_Region == 'Taiwan*') %>%
  select(Country_Region, Confirmed, Deaths, Recovered)
## # A tibble: 1 x 4
##   Country_Region Confirmed Deaths Recovered
##   <chr>              <dbl>  <dbl>     <dbl>
## 1 Taiwan*              467      7       440
covid_0727%>%
  filter(Country_Region == 'Taiwan*') %>%
  select(Country_Region, Confirmed, Deaths, Recovered)
## # A tibble: 1 x 4
##   Country_Region Confirmed Deaths Recovered
##   <chr>              <dbl>  <dbl>     <dbl>
## 1 Taiwan*              462      7       440
stat_0728 <- covid_0728 %>% 
    group_by(Country_Region) %>%
    summarise(Confirmed_Total = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
stat_0727 <- covid_0727 %>% 
    group_by(Country_Region) %>%
    summarise(Confirmed_Total = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
#stat_0728
#stat_0727

# SELECT * FROM stat_0727 INNER JOIN stat_0728 ON stat_0727.Country_Region = stat_0728.Country_Region

m <- stat_0728 %>%
  inner_join(stat_0727, by = 'Country_Region')

m <- m %>%
  mutate(diff = Confirmed_Total.x - Confirmed_Total.y)

m %>% 
  arrange(desc(diff)) %>%
  head()
## # A tibble: 6 x 4
##   Country_Region Confirmed_Total.x Confirmed_Total.y  diff
##   <chr>                      <dbl>             <dbl> <dbl>
## 1 US                       4351997           4290337 61660
## 2 Brazil                   2483191           2442375 40816
## 3 Colombia                  267385            257101 10284
## 4 South Africa              459761            452529  7232
## 5 Mexico                    402697            395489  7208
## 6 Argentina                 173355            167416  5939
m <- m %>%
  mutate(increase_ratio = (Confirmed_Total.x / Confirmed_Total.y) - 1 )

m %>% 
  arrange(desc(increase_ratio)) %>%
  head()
## # A tibble: 6 x 5
##   Country_Region Confirmed_Total.x Confirmed_Total.y  diff increase_ratio
##   <chr>                      <dbl>             <dbl> <dbl>          <dbl>
## 1 Bahamas                      447               382    65         0.170 
## 2 Zambia                      5002              4552   450         0.0989
## 3 Nicaragua                   3672              3439   233         0.0678
## 4 Libya                       3017              2827   190         0.0672
## 5 Angola                      1000               950    50         0.0526
## 6 Ethiopia                   15200             14547   653         0.0449