tidyverse
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
help(package = 'tidyverse')
tidyverse_packages()
## [1] "broom" "cli" "crayon" "dbplyr" "dplyr"
## [6] "forcats" "ggplot2" "haven" "hms" "httr"
## [11] "jsonlite" "lubridate" "magrittr" "modelr" "pillar"
## [16] "purrr" "readr" "readxl" "reprex" "rlang"
## [21] "rstudioapi" "rvest" "stringr" "tibble" "tidyr"
## [26] "xml2" "tidyverse"
dplyr
library(dplyr)
help(package = 'dplyr')
read covid19 data
#COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/07-28-2020.csv
setwd('~/Desktop/')
curl::curl_download('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/07-28-2020.csv', destfile = '07-28-2020.csv')
download.file('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/07-28-2020.csv', destfile = '07-28-2020.csv')
library(readr)
covid19 <- read_csv("07-28-2020.csv")
## Parsed with column specification:
## cols(
## FIPS = col_double(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Last_Update = col_datetime(format = ""),
## Lat = col_double(),
## Long_ = col_double(),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Active = col_double(),
## Combined_Key = col_character(),
## Incidence_Rate = col_double(),
## `Case-Fatality_Ratio` = col_double()
## )
head(covid19)
## # A tibble: 6 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 45001 Abbev… South Carolina US 2020-07-29 04:35:11 34.2 -82.5
## 2 22001 Acadia Louisiana US 2020-07-29 04:35:11 30.3 -92.4
## 3 51001 Accom… Virginia US 2020-07-29 04:35:11 37.8 -75.6
## 4 16001 Ada Idaho US 2020-07-29 04:35:11 43.5 -116.
## 5 19001 Adair Iowa US 2020-07-29 04:35:11 41.3 -94.5
## 6 21001 Adair Kentucky US 2020-07-29 04:35:11 37.1 -85.3
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
#View(covid19)
filter
#covid19[covid19$Country_Region == 'US', ]
#covid19[covid19[,'Country_Region'] == 'US', ]
# filter(covid19, Country_Region == 'US')
# AND
filter(covid19, Country_Region == 'US' & Confirmed > 10000) %>% head()
## # A tibble: 6 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 6001 Alame… California US 2020-07-29 04:35:11 37.6 -122.
## 2 24005 Balti… Maryland US 2020-07-29 04:35:11 39.5 -76.6
## 3 24510 Balti… Maryland US 2020-07-29 04:35:11 39.3 -76.6
## 4 34003 Bergen New Jersey US 2020-07-29 04:35:11 41.0 -74.1
## 5 48029 Bexar Texas US 2020-07-29 04:35:11 29.4 -98.5
## 6 12011 Browa… Florida US 2020-07-29 04:35:11 26.2 -80.5
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
filter(covid19, Country_Region == 'US' , Confirmed > 10000) %>% head()
## # A tibble: 6 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 6001 Alame… California US 2020-07-29 04:35:11 37.6 -122.
## 2 24005 Balti… Maryland US 2020-07-29 04:35:11 39.5 -76.6
## 3 24510 Balti… Maryland US 2020-07-29 04:35:11 39.3 -76.6
## 4 34003 Bergen New Jersey US 2020-07-29 04:35:11 41.0 -74.1
## 5 48029 Bexar Texas US 2020-07-29 04:35:11 29.4 -98.5
## 6 12011 Browa… Florida US 2020-07-29 04:35:11 26.2 -80.5
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
# OR
filter(covid19, Country_Region == 'US' | Confirmed > 10000) %>% head()
## # A tibble: 6 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 45001 Abbev… South Carolina US 2020-07-29 04:35:11 34.2 -82.5
## 2 22001 Acadia Louisiana US 2020-07-29 04:35:11 30.3 -92.4
## 3 51001 Accom… Virginia US 2020-07-29 04:35:11 37.8 -75.6
## 4 16001 Ada Idaho US 2020-07-29 04:35:11 43.5 -116.
## 5 19001 Adair Iowa US 2020-07-29 04:35:11 41.3 -94.5
## 6 21001 Adair Kentucky US 2020-07-29 04:35:11 37.1 -85.3
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
# IN
filter(covid19, Province_State %in% c('New York', 'California') ) %>% head()
## # A tibble: 6 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 6001 Alame… California US 2020-07-29 04:35:11 37.6 -122.
## 2 36001 Albany New York US 2020-07-29 04:35:11 42.6 -74.0
## 3 36003 Alleg… New York US 2020-07-29 04:35:11 42.3 -78.0
## 4 6003 Alpine California US 2020-07-29 04:35:11 38.6 -120.
## 5 6005 Amador California US 2020-07-29 04:35:11 38.4 -121.
## 6 36007 Broome New York US 2020-07-29 04:35:11 42.2 -75.8
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
select
head(covid19[,c('Country_Region', 'Confirmed')])
## # A tibble: 6 x 2
## Country_Region Confirmed
## <chr> <dbl>
## 1 US 269
## 2 US 2229
## 3 US 1069
## 4 US 7378
## 5 US 20
## 6 US 185
select(covid19, Country_Region, Confirmed) %>% head()
## # A tibble: 6 x 2
## Country_Region Confirmed
## <chr> <dbl>
## 1 US 269
## 2 US 2229
## 3 US 1069
## 4 US 7378
## 5 US 20
## 6 US 185
head(covid19)
## # A tibble: 6 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 45001 Abbev… South Carolina US 2020-07-29 04:35:11 34.2 -82.5
## 2 22001 Acadia Louisiana US 2020-07-29 04:35:11 30.3 -92.4
## 3 51001 Accom… Virginia US 2020-07-29 04:35:11 37.8 -75.6
## 4 16001 Ada Idaho US 2020-07-29 04:35:11 43.5 -116.
## 5 19001 Adair Iowa US 2020-07-29 04:35:11 41.3 -94.5
## 6 21001 Adair Kentucky US 2020-07-29 04:35:11 37.1 -85.3
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
covid19 %>% select(Country_Region:Confirmed) %>% head()
## # A tibble: 6 x 5
## Country_Region Last_Update Lat Long_ Confirmed
## <chr> <dttm> <dbl> <dbl> <dbl>
## 1 US 2020-07-29 04:35:11 34.2 -82.5 269
## 2 US 2020-07-29 04:35:11 30.3 -92.4 2229
## 3 US 2020-07-29 04:35:11 37.8 -75.6 1069
## 4 US 2020-07-29 04:35:11 43.5 -116. 7378
## 5 US 2020-07-29 04:35:11 41.3 -94.5 20
## 6 US 2020-07-29 04:35:11 37.1 -85.3 185
covid19 %>% select(- (FIPS:Country_Region)) %>% head()
## # A tibble: 6 x 10
## Last_Update Lat Long_ Confirmed Deaths Recovered Active
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-07-29 04:35:11 34.2 -82.5 269 6 0 263
## 2 2020-07-29 04:35:11 30.3 -92.4 2229 65 0 2164
## 3 2020-07-29 04:35:11 37.8 -75.6 1069 15 0 1054
## 4 2020-07-29 04:35:11 43.5 -116. 7378 50 0 7328
## 5 2020-07-29 04:35:11 41.3 -94.5 20 0 0 20
## 6 2020-07-29 04:35:11 37.1 -85.3 185 19 0 166
## # … with 3 more variables: Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
covid19 %>% select(starts_with('L')) %>% head()
## # A tibble: 6 x 3
## Last_Update Lat Long_
## <dttm> <dbl> <dbl>
## 1 2020-07-29 04:35:11 34.2 -82.5
## 2 2020-07-29 04:35:11 30.3 -92.4
## 3 2020-07-29 04:35:11 37.8 -75.6
## 4 2020-07-29 04:35:11 43.5 -116.
## 5 2020-07-29 04:35:11 41.3 -94.5
## 6 2020-07-29 04:35:11 37.1 -85.3
california <- covid19[covid19$Province_State == 'California', ]
#california
sum(california[order(california[, c('Confirmed')], decreasing = TRUE), ]$Confirmed, na.rm=TRUE)
## [1] 470762
covid19 %>%
filter(Province_State == 'California') %>%
arrange(desc(Confirmed)) %>%
select(Confirmed) %>%
sum()
## [1] 470762
covid19 %>% filter(Province_State == 'California') %>% select(Confirmed) %>% head()
## # A tibble: 6 x 1
## Confirmed
## <dbl>
## 1 10633
## 2 2
## 3 84
## 4 866
## 5 99
## 6 292
covid19 %>%
filter(Province_State == 'California') %>%
select(Confirmed) %>%
head()
## # A tibble: 6 x 1
## Confirmed
## <dbl>
## 1 10633
## 2 2
## 3 84
## 4 866
## 5 99
## 6 292
covid19 %>% select(確診=Confirmed) %>% head()
## # A tibble: 6 x 1
## 確診
## <dbl>
## 1 269
## 2 2229
## 3 1069
## 4 7378
## 5 20
## 6 185
covid19 %>% rename(確診=Confirmed) %>% head()
## # A tibble: 6 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 45001 Abbev… South Carolina US 2020-07-29 04:35:11 34.2 -82.5
## 2 22001 Acadia Louisiana US 2020-07-29 04:35:11 30.3 -92.4
## 3 51001 Accom… Virginia US 2020-07-29 04:35:11 37.8 -75.6
## 4 16001 Ada Idaho US 2020-07-29 04:35:11 43.5 -116.
## 5 19001 Adair Iowa US 2020-07-29 04:35:11 41.3 -94.5
## 6 21001 Adair Kentucky US 2020-07-29 04:35:11 37.1 -85.3
## # … with 7 more variables: 確診 <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
practice
#table(covid19$Country_Region)
library(dplyr)
covid19 %>%
filter(Country_Region == 'Taiwan*') %>%
select(Country_Region, Confirmed, Deaths, Recovered)
## # A tibble: 1 x 4
## Country_Region Confirmed Deaths Recovered
## <chr> <dbl> <dbl> <dbl>
## 1 Taiwan* 467 7 440
Arrange
covid19 %>%
filter(Country_Region == 'US') %>%
select(Province_State, Confirmed) %>%
arrange(Confirmed) %>%
head()
## # A tibble: 6 x 2
## Province_State Confirmed
## <chr> <dbl>
## 1 Nebraska 0
## 2 Nebraska 0
## 3 Nebraska 0
## 4 New Mexico 0
## 5 Nevada 0
## 6 Nebraska 0
covid19 %>%
filter(Country_Region == 'US') %>%
select(Admin2, Province_State, Confirmed) %>%
arrange(desc(Confirmed)) %>%
head(10)
## # A tibble: 10 x 3
## Admin2 Province_State Confirmed
## <chr> <chr> <dbl>
## 1 New York City New York 224249
## 2 Los Angeles California 178642
## 3 Maricopa Arizona 111446
## 4 Miami-Dade Florida 110352
## 5 Cook Illinois 103441
## 6 Harris Texas 67655
## 7 Broward Florida 51657
## 8 Dallas Texas 48028
## 9 Nassau New York 43059
## 10 Suffolk New York 43024
covid19 %>%
filter(Country_Region == 'US') %>%
select(Admin2, Province_State, Confirmed) %>%
arrange(desc(Confirmed)) %>%
slice(0:10)
## # A tibble: 10 x 3
## Admin2 Province_State Confirmed
## <chr> <chr> <dbl>
## 1 New York City New York 224249
## 2 Los Angeles California 178642
## 3 Maricopa Arizona 111446
## 4 Miami-Dade Florida 110352
## 5 Cook Illinois 103441
## 6 Harris Texas 67655
## 7 Broward Florida 51657
## 8 Dallas Texas 48028
## 9 Nassau New York 43059
## 10 Suffolk New York 43024
covid19 %>%
filter(Country_Region == 'US') %>%
select(Admin2, Province_State, Confirmed) %>%
arrange(desc(Confirmed)) %>%
slice(6:10)
## # A tibble: 5 x 3
## Admin2 Province_State Confirmed
## <chr> <chr> <dbl>
## 1 Harris Texas 67655
## 2 Broward Florida 51657
## 3 Dallas Texas 48028
## 4 Nassau New York 43059
## 5 Suffolk New York 43024
covid19 %>%
slice(6:10)
## # A tibble: 5 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 21001 Adair Kentucky US 2020-07-29 04:35:11 37.1 -85.3
## 2 29001 Adair Missouri US 2020-07-29 04:35:11 40.2 -92.6
## 3 40001 Adair Oklahoma US 2020-07-29 04:35:11 35.9 -94.7
## 4 8001 Adams Colorado US 2020-07-29 04:35:11 39.9 -104.
## 5 16003 Adams Idaho US 2020-07-29 04:35:11 44.9 -116.
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
covid19 %>%
slice_head(n=5)
## # A tibble: 5 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 45001 Abbev… South Carolina US 2020-07-29 04:35:11 34.2 -82.5
## 2 22001 Acadia Louisiana US 2020-07-29 04:35:11 30.3 -92.4
## 3 51001 Accom… Virginia US 2020-07-29 04:35:11 37.8 -75.6
## 4 16001 Ada Idaho US 2020-07-29 04:35:11 43.5 -116.
## 5 19001 Adair Iowa US 2020-07-29 04:35:11 41.3 -94.5
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
?slice_head
covid19 %>%
slice_sample(prop = 0.01)
## # A tibble: 39 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 26109 Menom… Michigan US 2020-07-29 04:35:11 45.6 -87.6
## 2 2275 Wrang… Alaska US 2020-07-29 04:35:11 56.3 -132.
## 3 72049 Culeb… Puerto Rico US 2020-07-29 04:35:11 18.3 -65.3
## 4 13003 Atkin… Georgia US 2020-07-29 04:35:11 31.3 -82.9
## 5 54109 Wyomi… West Virginia US 2020-07-29 04:35:11 37.6 -81.5
## 6 1105 Perry Alabama US 2020-07-29 04:35:11 32.6 -87.3
## 7 17151 Pope Illinois US 2020-07-29 04:35:11 37.4 -88.6
## 8 35025 Lea New Mexico US 2020-07-29 04:35:11 32.8 -103.
## 9 16083 Twin … Idaho US 2020-07-29 04:35:11 42.4 -115.
## 10 19039 Clarke Iowa US 2020-07-29 04:35:11 41.0 -93.8
## # … with 29 more rows, and 7 more variables: Confirmed <dbl>, Deaths <dbl>,
## # Recovered <dbl>, Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
covid19 %>%
slice_max(Confirmed, n= 3)
## # A tibble: 3 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 NA <NA> Sao Paulo Brazil 2020-07-29 04:35:11 -23.6 -46.6
## 2 NA <NA> <NA> South Africa 2020-07-29 04:35:11 -30.6 22.9
## 3 NA <NA> Maharashtra India 2020-07-29 04:35:11 19.4 76.1
## # … with 7 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
covid19 %>%
slice_min(Confirmed, n= 3)
## # A tibble: 50 x 14
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 31005 Arthur Nebraska US 2020-07-29 04:35:11 41.6 -102.
## 2 31009 Blaine Nebraska US 2020-07-29 04:35:11 41.9 -100.
## 3 31017 Brown Nebraska US 2020-07-29 04:35:11 42.4 -99.9
## 4 35011 De Ba… New Mexico US 2020-07-29 04:35:11 34.3 -104.
## 5 32009 Esmer… Nevada US 2020-07-29 04:35:11 37.8 -118.
## 6 31075 Grant Nebraska US 2020-07-29 04:35:11 41.9 -102.
## 7 31085 Hayes Nebraska US 2020-07-29 04:35:11 40.5 -101.
## 8 31103 Keya … Nebraska US 2020-07-29 04:35:11 42.9 -99.7
## 9 31113 Logan Nebraska US 2020-07-29 04:35:11 41.6 -100.
## 10 31115 Loup Nebraska US 2020-07-29 04:35:11 41.9 -99.5
## # … with 40 more rows, and 7 more variables: Confirmed <dbl>, Deaths <dbl>,
## # Recovered <dbl>, Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>
covid19 %>%
select(Admin2, Province_State, Country_Region, Confirmed, Deaths, `Case-Fatality_Ratio`) %>%
mutate(portion = (Deaths / Confirmed) * 100 ) %>%
head()
## # A tibble: 6 x 7
## Admin2 Province_State Country_Region Confirmed Deaths `Case-Fatality_… portion
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Abbev… South Carolina US 269 6 2.23 2.23
## 2 Acadia Louisiana US 2229 65 2.92 2.92
## 3 Accom… Virginia US 1069 15 1.40 1.40
## 4 Ada Idaho US 7378 50 0.678 0.678
## 5 Adair Iowa US 20 0 0 0
## 6 Adair Kentucky US 185 19 10.3 10.3
covid19 %>%
select(Admin2, Province_State, Country_Region, Confirmed, Deaths, `Case-Fatality_Ratio`) %>%
mutate(portion = (Deaths / Confirmed) * 100 ) %>%
select(portion, everything())
## # A tibble: 3,932 x 7
## portion Admin2 Province_State Country_Region Confirmed Deaths
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 2.23 Abbev… South Carolina US 269 6
## 2 2.92 Acadia Louisiana US 2229 65
## 3 1.40 Accom… Virginia US 1069 15
## 4 0.678 Ada Idaho US 7378 50
## 5 0 Adair Iowa US 20 0
## 6 10.3 Adair Kentucky US 185 19
## 7 0 Adair Missouri US 123 0
## 8 1.88 Adair Oklahoma US 266 5
## 9 2.94 Adams Colorado US 5713 168
## 10 0 Adams Idaho US 17 0
## # … with 3,922 more rows, and 1 more variable: `Case-Fatality_Ratio` <dbl>
covid19 <- covid19 %>% mutate(portion = Deaths/Confirmed)
covid19 %>% head()
## # A tibble: 6 x 15
## FIPS Admin2 Province_State Country_Region Last_Update Lat Long_
## <dbl> <chr> <chr> <chr> <dttm> <dbl> <dbl>
## 1 45001 Abbev… South Carolina US 2020-07-29 04:35:11 34.2 -82.5
## 2 22001 Acadia Louisiana US 2020-07-29 04:35:11 30.3 -92.4
## 3 51001 Accom… Virginia US 2020-07-29 04:35:11 37.8 -75.6
## 4 16001 Ada Idaho US 2020-07-29 04:35:11 43.5 -116.
## 5 19001 Adair Iowa US 2020-07-29 04:35:11 41.3 -94.5
## 6 21001 Adair Kentucky US 2020-07-29 04:35:11 37.1 -85.3
## # … with 8 more variables: Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>, portion <dbl>
covid19 %>%
transmute(portion = Deaths/Confirmed) %>%
head()
## # A tibble: 6 x 1
## portion
## <dbl>
## 1 0.0223
## 2 0.0292
## 3 0.0140
## 4 0.00678
## 5 0
## 6 0.103
covid19 %>% relocate(FIPS:Long_, .after = Confirmed) %>% head()
## # A tibble: 6 x 15
## Confirmed FIPS Admin2 Province_State Country_Region Last_Update Lat
## <dbl> <dbl> <chr> <chr> <chr> <dttm> <dbl>
## 1 269 45001 Abbev… South Carolina US 2020-07-29 04:35:11 34.2
## 2 2229 22001 Acadia Louisiana US 2020-07-29 04:35:11 30.3
## 3 1069 51001 Accom… Virginia US 2020-07-29 04:35:11 37.8
## 4 7378 16001 Ada Idaho US 2020-07-29 04:35:11 43.5
## 5 20 19001 Adair Iowa US 2020-07-29 04:35:11 41.3
## 6 185 21001 Adair Kentucky US 2020-07-29 04:35:11 37.1
## # … with 8 more variables: Long_ <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Combined_Key <chr>, Incidence_Rate <dbl>,
## # `Case-Fatality_Ratio` <dbl>, portion <dbl>
3/ 0
## [1] Inf
covid19 %>%
filter(Confirmed > 0) %>%
mutate(CFR = Deaths/ Confirmed) %>%
arrange(desc(CFR)) %>%
select(國家= Country_Region, 省份= Province_State,
確診數= Confirmed, 死亡數=Deaths, 康復數= Recovered,
確診死亡率=CFR)
## # A tibble: 3,882 x 6
## 國家 省份 確診數 死亡數 康復數 確診死亡率
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 US Louisiana 10 112 0 11.2
## 2 US Puerto Rico 432 209 0 0.484
## 3 US Rhode Island 2112 1005 0 0.476
## 4 Yemen <NA> 1703 484 840 0.284
## 5 US New Mexico 4 1 0 0.25
## 6 US Texas 4 1 0 0.25
## 7 US Montana 4 1 0 0.25
## 8 US Maine 60 14 0 0.233
## 9 US Texas 9 2 0 0.222
## 10 US Kansas 9 2 0 0.222
## # … with 3,872 more rows
g <- c(1,1,2,2,1)
grades <- c(50,60,80,90,55)
tapply(grades, g, mean)
## 1 2
## 55 85
head(tapply(covid19$Confirmed, covid19$Country_Region, sum))
## Afghanistan Albania Algeria Andorra
## 36368 4997 28615 907
## Angola Antigua and Barbuda
## 1000 86
covid19 %>%
select(Country_Region, Confirmed) %>%
group_by(Country_Region) %>%
summarize(Confirmed_Total = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 188 x 2
## Country_Region Confirmed_Total
## <chr> <dbl>
## 1 Afghanistan 36368
## 2 Albania 4997
## 3 Algeria 28615
## 4 Andorra 907
## 5 Angola 1000
## 6 Antigua and Barbuda 86
## 7 Argentina 173355
## 8 Armenia 37629
## 9 Australia 15582
## 10 Austria 20677
## # … with 178 more rows
covid19 %>%
select(Country_Region, Confirmed, Deaths, Recovered) %>%
group_by(Country_Region) %>%
summarize_each(funs(sum), Confirmed, Deaths, Recovered)
## Warning: `summarise_each_()` is deprecated as of dplyr 0.7.0.
## Please use `across()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## # A tibble: 188 x 4
## Country_Region Confirmed Deaths Recovered
## <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 36368 1270 25358
## 2 Albania 4997 148 2789
## 3 Algeria 28615 1174 19233
## 4 Andorra 907 52 803
## 5 Angola 1000 47 266
## 6 Antigua and Barbuda 86 3 65
## 7 Argentina 173355 3179 75083
## 8 Armenia 37629 719 27357
## 9 Australia 15582 176 9617
## 10 Austria 20677 713 18379
## # … with 178 more rows
covid19 %>%
select(Confirmed, Deaths, Recovered) %>%
summarise_all(sum)
## # A tibble: 1 x 3
## Confirmed Deaths Recovered
## <dbl> <dbl> <dbl>
## 1 16682030 659374 9711187
659374 / 16682030
## [1] 0.039526
covid19 %>%
summarise_if(is.numeric, sum)
## # A tibble: 1 x 10
## FIPS Lat Long_ Confirmed Deaths Recovered Active Incidence_Rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NA NA NA 16682030 659374 9711187 NA NA
## # … with 2 more variables: `Case-Fatality_Ratio` <dbl>, portion <dbl>
covid19 %>%
select(Country_Region, Confirmed, Deaths, Recovered) %>%
group_by(Country_Region) %>%
summarise_each(funs(min,max,sum), Confirmed, Deaths, Recovered)
## # A tibble: 188 x 10
## Country_Region Confirmed_min Deaths_min Recovered_min Confirmed_max
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 36368 1270 25358 36368
## 2 Albania 4997 148 2789 4997
## 3 Algeria 28615 1174 19233 28615
## 4 Andorra 907 52 803 907
## 5 Angola 1000 47 266 1000
## 6 Antigua and B… 86 3 65 86
## 7 Argentina 173355 3179 75083 173355
## 8 Armenia 37629 719 27357 37629
## 9 Australia 31 0 30 9304
## 10 Austria 20677 713 18379 20677
## # … with 178 more rows, and 5 more variables: Deaths_max <dbl>,
## # Recovered_max <dbl>, Confirmed_sum <dbl>, Deaths_sum <dbl>,
## # Recovered_sum <dbl>
length(table(covid19$Country_Region))
## [1] 188
covid19 %>%
select(Country_Region) %>%
summarize_each(funs(n()))
## # A tibble: 1 x 1
## Country_Region
## <int>
## 1 3932
# SELECT DISTINCT(Country_Region) FROM covid19;
covid19 %>%
select(Country_Region) %>%
summarise_each(funs(n_distinct(Country_Region)))
## # A tibble: 1 x 1
## Country_Region
## <int>
## 1 188
practice 3
covid19 %>%
group_by(Country_Region) %>%
summarize(Confirmed_Total = sum(Confirmed) ) %>%
arrange(desc(Confirmed_Total)) %>%
head(10)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 10 x 2
## Country_Region Confirmed_Total
## <chr> <dbl>
## 1 US 4351997
## 2 Brazil 2483191
## 3 India 1483156
## 4 Russia 822060
## 5 South Africa 459761
## 6 Mexico 402697
## 7 Peru 389717
## 8 Chile 349800
## 9 United Kingdom 302295
## 10 Iran 296273
covid19 %>%
group_by(Country_Region) %>%
summarize_each(funs(sum), Confirmed, Deaths, Recovered ) %>%
arrange(desc(Confirmed)) %>%
head(10)
## # A tibble: 10 x 4
## Country_Region Confirmed Deaths Recovered
## <chr> <dbl> <dbl> <dbl>
## 1 US 4351997 149256 1355363
## 2 Brazil 2483191 88539 1868749
## 3 India 1483156 33425 952743
## 4 Russia 822060 13483 611109
## 5 South Africa 459761 7257 287313
## 6 Mexico 402697 44876 308142
## 7 Peru 389717 18418 276452
## 8 Chile 349800 9240 322332
## 9 United Kingdom 302295 45963 1438
## 10 Iran 296273 16147 257019
#covid19 %>%
# group_by(Country_Region) %>%
# summarize_each(funs(sum), Confirmed, Deaths, Recovered ) %>%
# mutate(CFR = Deaths/ Confirmed)
# arrange(desc(CFR)) %>%
# head(10)
Join Table
download.file('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/07-27-2020.csv', '07-27-2020.csv')
download.file('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/07-28-2020.csv', '07-28-2020.csv')
library(readr)
covid_0728 <- read_csv('07-28-2020.csv')
## Parsed with column specification:
## cols(
## FIPS = col_double(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Last_Update = col_datetime(format = ""),
## Lat = col_double(),
## Long_ = col_double(),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Active = col_double(),
## Combined_Key = col_character(),
## Incidence_Rate = col_double(),
## `Case-Fatality_Ratio` = col_double()
## )
covid_0727 <- read_csv('07-27-2020.csv')
## Parsed with column specification:
## cols(
## FIPS = col_double(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Last_Update = col_datetime(format = ""),
## Lat = col_double(),
## Long_ = col_double(),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Active = col_double(),
## Combined_Key = col_character(),
## Incidence_Rate = col_double(),
## `Case-Fatality_Ratio` = col_double()
## )
covid_0728 %>%
filter(Country_Region == 'Taiwan*') %>%
select(Country_Region, Confirmed, Deaths, Recovered)
## # A tibble: 1 x 4
## Country_Region Confirmed Deaths Recovered
## <chr> <dbl> <dbl> <dbl>
## 1 Taiwan* 467 7 440
covid_0727%>%
filter(Country_Region == 'Taiwan*') %>%
select(Country_Region, Confirmed, Deaths, Recovered)
## # A tibble: 1 x 4
## Country_Region Confirmed Deaths Recovered
## <chr> <dbl> <dbl> <dbl>
## 1 Taiwan* 462 7 440
stat_0728 <- covid_0728 %>%
group_by(Country_Region) %>%
summarise(Confirmed_Total = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
stat_0727 <- covid_0727 %>%
group_by(Country_Region) %>%
summarise(Confirmed_Total = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
#stat_0728
#stat_0727
# SELECT * FROM stat_0727 INNER JOIN stat_0728 ON stat_0727.Country_Region = stat_0728.Country_Region
m <- stat_0728 %>%
inner_join(stat_0727, by = 'Country_Region')
m <- m %>%
mutate(diff = Confirmed_Total.x - Confirmed_Total.y)
m %>%
arrange(desc(diff)) %>%
head()
## # A tibble: 6 x 4
## Country_Region Confirmed_Total.x Confirmed_Total.y diff
## <chr> <dbl> <dbl> <dbl>
## 1 US 4351997 4290337 61660
## 2 Brazil 2483191 2442375 40816
## 3 Colombia 267385 257101 10284
## 4 South Africa 459761 452529 7232
## 5 Mexico 402697 395489 7208
## 6 Argentina 173355 167416 5939
m <- m %>%
mutate(increase_ratio = (Confirmed_Total.x / Confirmed_Total.y) - 1 )
m %>%
arrange(desc(increase_ratio)) %>%
head()
## # A tibble: 6 x 5
## Country_Region Confirmed_Total.x Confirmed_Total.y diff increase_ratio
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Bahamas 447 382 65 0.170
## 2 Zambia 5002 4552 450 0.0989
## 3 Nicaragua 3672 3439 233 0.0678
## 4 Libya 3017 2827 190 0.0672
## 5 Angola 1000 950 50 0.0526
## 6 Ethiopia 15200 14547 653 0.0449
Growing Trends
download.file('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv', 'time_series_covid19_confirmed_global.csv')
confirmed_global <- read_csv('time_series_covid19_confirmed_global.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
confirmed_global %>%
head()
## # A tibble: 6 x 193
## `Province/State` `Country/Region` Lat Long `1/22/20` `1/23/20` `1/24/20`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0
## 6 <NA> Antigua and Bar… 17.1 -61.8 0 0 0
## # … with 186 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## # `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## # `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## # `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## # `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## # `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## # `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## # `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## # `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## # `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## # `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## # `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## # `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## # `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## # `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## # `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## # `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## # `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## # `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## # `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## # `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## # `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## # `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## # `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## # `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## # `5/3/20` <dbl>, …