Tibble
# Build a tibble with one character column and one numeric column,
# then show the classes it carries.
t <- tibble(
  a = c("A", "B", "C"),
  b = c(1, 1, 2)
)
class(t)
## [1] "tbl_df" "tbl" "data.frame"
# Row-wise tibble construction: the ~ headers name the columns and the
# remaining values fill the rows in order.
tribble(
  ~x, ~y,
  1, "A",
  2, "B"
)
## # A tibble: 2 x 2
## x y
## <dbl> <chr>
## 1 1 A
## 2 2 B
# Convert a base data.frame to a tibble and back, checking the class each time.
df <- data.frame(
  a = c("A", "B", "C"),
  b = c(1, 1, 2)
)
tf <- as_tibble(df)
class(df)
## [1] "data.frame"
class(tf)
## [1] "tbl_df" "tbl" "data.frame"
df <- as.data.frame(tf)
# install.packages("tidyr")
library(tidyr)
help(package = "tidyr")

# Download the global confirmed-cases time series from the
# CSSEGISandData/COVID-19 GitHub repository and read it with readr.
url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
filename <- "time_series_covid19_confirmed_global.csv"
download.file(url = url, destfile = filename)
library(readr)
df <- read_csv(file = filename)
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
# Preview the first rows of the wide table.
head(df)
## # A tibble: 6 x 200
## `Province/State` `Country/Region` Lat Long `1/22/20` `1/23/20` `1/24/20`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0
## 6 <NA> Antigua and Bar… 17.1 -61.8 0 0 0
## # … with 193 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## # `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## # `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## # `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## # `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## # `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## # `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## # `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## # `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## # `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## # `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## # `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## # `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## # `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## # `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## # `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## # `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## # `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## # `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## # `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## # `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## # `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## # `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## # `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## # `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## # `5/3/20` <dbl>, …
library(tidyr)
# Columns 1-4 describe the region; every column from the fifth onward is a date.
col_names <- colnames(df)
date_cols <- col_names[5:length(col_names)]
date_cols
## [1] "1/22/20" "1/23/20" "1/24/20" "1/25/20" "1/26/20" "1/27/20" "1/28/20"
## [8] "1/29/20" "1/30/20" "1/31/20" "2/1/20" "2/2/20" "2/3/20" "2/4/20"
## [15] "2/5/20" "2/6/20" "2/7/20" "2/8/20" "2/9/20" "2/10/20" "2/11/20"
## [22] "2/12/20" "2/13/20" "2/14/20" "2/15/20" "2/16/20" "2/17/20" "2/18/20"
## [29] "2/19/20" "2/20/20" "2/21/20" "2/22/20" "2/23/20" "2/24/20" "2/25/20"
## [36] "2/26/20" "2/27/20" "2/28/20" "2/29/20" "3/1/20" "3/2/20" "3/3/20"
## [43] "3/4/20" "3/5/20" "3/6/20" "3/7/20" "3/8/20" "3/9/20" "3/10/20"
## [50] "3/11/20" "3/12/20" "3/13/20" "3/14/20" "3/15/20" "3/16/20" "3/17/20"
## [57] "3/18/20" "3/19/20" "3/20/20" "3/21/20" "3/22/20" "3/23/20" "3/24/20"
## [64] "3/25/20" "3/26/20" "3/27/20" "3/28/20" "3/29/20" "3/30/20" "3/31/20"
## [71] "4/1/20" "4/2/20" "4/3/20" "4/4/20" "4/5/20" "4/6/20" "4/7/20"
## [78] "4/8/20" "4/9/20" "4/10/20" "4/11/20" "4/12/20" "4/13/20" "4/14/20"
## [85] "4/15/20" "4/16/20" "4/17/20" "4/18/20" "4/19/20" "4/20/20" "4/21/20"
## [92] "4/22/20" "4/23/20" "4/24/20" "4/25/20" "4/26/20" "4/27/20" "4/28/20"
## [99] "4/29/20" "4/30/20" "5/1/20" "5/2/20" "5/3/20" "5/4/20" "5/5/20"
## [106] "5/6/20" "5/7/20" "5/8/20" "5/9/20" "5/10/20" "5/11/20" "5/12/20"
## [113] "5/13/20" "5/14/20" "5/15/20" "5/16/20" "5/17/20" "5/18/20" "5/19/20"
## [120] "5/20/20" "5/21/20" "5/22/20" "5/23/20" "5/24/20" "5/25/20" "5/26/20"
## [127] "5/27/20" "5/28/20" "5/29/20" "5/30/20" "5/31/20" "6/1/20" "6/2/20"
## [134] "6/3/20" "6/4/20" "6/5/20" "6/6/20" "6/7/20" "6/8/20" "6/9/20"
## [141] "6/10/20" "6/11/20" "6/12/20" "6/13/20" "6/14/20" "6/15/20" "6/16/20"
## [148] "6/17/20" "6/18/20" "6/19/20" "6/20/20" "6/21/20" "6/22/20" "6/23/20"
## [155] "6/24/20" "6/25/20" "6/26/20" "6/27/20" "6/28/20" "6/29/20" "6/30/20"
## [162] "7/1/20" "7/2/20" "7/3/20" "7/4/20" "7/5/20" "7/6/20" "7/7/20"
## [169] "7/8/20" "7/9/20" "7/10/20" "7/11/20" "7/12/20" "7/13/20" "7/14/20"
## [176] "7/15/20" "7/16/20" "7/17/20" "7/18/20" "7/19/20" "7/20/20" "7/21/20"
## [183] "7/22/20" "7/23/20" "7/24/20" "7/25/20" "7/26/20" "7/27/20" "7/28/20"
## [190] "7/29/20" "7/30/20" "7/31/20" "8/1/20" "8/2/20" "8/3/20" "8/4/20"
# Reshape from wide (one column per date) to long (one row per region and date).
# all_of() states explicitly that date_cols is an external character vector of
# column names rather than a column of df, so tidyselect no longer emits its
# "external vector in selections is ambiguous" note.
df_long <- df %>%
  gather(key = Date, value = Confirmed, all_of(date_cols))

# Column headers look like "1/22/20": the year has two digits, so it must be
# parsed with %y. %Y would read "20" as the year 0020.
df_long$Date <- as.Date(df_long$Date, format = "%m/%d/%y")

library(dplyr)
# Confirmed counts for Taiwan, ordered by date.
taiwan_stat <- df_long %>%
  filter(`Country/Region` == "Taiwan*") %>%
  arrange(Date) %>%
  select(Date, Confirmed)
plot(taiwan_stat$Date, taiwan_stat$Confirmed, type = "o")

練習題
# Download the global deaths time series and read it with readr.
url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
filename <- "time_series_covid19_deaths_global.csv"
download.file(url = url, destfile = filename)
library(readr)
deaths <- read_csv(file = filename)
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
# Preview the first rows of the deaths table.
deaths %>% head()
## # A tibble: 6 x 200
## `Province/State` `Country/Region` Lat Long `1/22/20` `1/23/20` `1/24/20`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0
## 6 <NA> Antigua and Bar… 17.1 -61.8 0 0 0
## # … with 193 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## # `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## # `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## # `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## # `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## # `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## # `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## # `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## # `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## # `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## # `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## # `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## # `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## # `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## # `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## # `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## # `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## # `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## # `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## # `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## # `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## # `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## # `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## # `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## # `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## # `5/3/20` <dbl>, …
library(tidyr)
# Date columns start at position 5, after Province/State, Country/Region,
# Lat and Long.
col_names <- colnames(deaths)
date_col <- col_names[5:length(col_names)]

# all_of() marks date_col as an external vector of column names, which avoids
# tidyselect's "external vector in selections is ambiguous" note.
# Headers such as "1/22/20" carry a two-digit year, so the format must use %y;
# %Y would parse "20" as the year 0020.
deaths_df <- deaths %>%
  gather(key = Date, value = Deaths, all_of(date_col)) %>%
  mutate(Date = as.Date(Date, format = "%m/%d/%y"))
# head(deaths_df)

# Death counts over time for the rows whose Country/Region is "US".
deaths_stat <- deaths_df %>%
  filter(`Country/Region` == "US") %>%
  select(Date, Deaths)
plot(
  Deaths ~ Date,
  data = deaths_stat,
  type = "o", col = "red", main = "US Deaths"
)

Spread
# Small long-format example: one row per (idx, types) pair.
tf <- tribble(
  ~idx, ~types, ~val,
  1, "A", 5,
  2, "B", 6,
  3, "A", 7,
  4, "B", 8
)
tf
## # A tibble: 4 x 3
## idx types val
## <dbl> <chr> <dbl>
## 1 1 A 5
## 2 2 B 6
## 3 3 A 7
## 4 4 B 8
# Turn each distinct value of `types` into its own column filled from `val`.
spread(tf, key = types, value = val)
## # A tibble: 4 x 3
## idx A B
## <dbl> <dbl> <dbl>
## 1 1 5 NA
## 2 2 NA 6
## 3 3 7 NA
## 4 4 NA 8
# Download the confirmed, deaths and recovered series in that order.
# Each file is saved under the last path component of its URL; after the loop
# `url` and `filename` hold the values for the recovered series.
for (url in c(
  "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
  "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
  "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
)) {
  filename <- basename(url)
  download.file(url, filename)
}
library(readr)
deaths <- read_csv(file = "time_series_covid19_deaths_global.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
# Read the recovered and confirmed series from their local CSV files.
recovered <- read_csv(file = "time_series_covid19_recovered_global.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
confirmed <- read_csv(file = "time_series_covid19_confirmed_global.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
# Compare the shapes of the three tables before stacking them.
dim(deaths)
## [1] 266 200
dim(recovered)
## [1] 253 200
dim(confirmed)
## [1] 266 200
# Label every row with the table it came from, then bind the tables row-wise.
deaths[["case_type"]] <- "deaths"
recovered[["case_type"]] <- "recovered"
confirmed[["case_type"]] <- "confirmed"
m <- rbind(deaths, recovered, confirmed)
dim(m)
## [1] 785 201
# Preview the stacked table.
m %>% head()
## # A tibble: 6 x 201
## `Province/State` `Country/Region` Lat Long `1/22/20` `1/23/20` `1/24/20`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0
## 6 <NA> Antigua and Bar… 17.1 -61.8 0 0 0
## # … with 194 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## # `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## # `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## # `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## # `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## # `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## # `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## # `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## # `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## # `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## # `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## # `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## # `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## # `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## # `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## # `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## # `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## # `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## # `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## # `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## # `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## # `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## # `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## # `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## # `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## # `5/3/20` <dbl>, …
# The first four columns identify the region and the last column is case_type;
# everything in between is a date column.
col_names <- colnames(m)
date_cols <- col_names[5:(length(col_names) - 1)]
# all_of() states explicitly that date_cols is an external character vector of
# column names, not a column of m, which is the form tidyselect asks for.
m_long <- m %>%
  gather(key = Date, value = Case, all_of(date_cols))
head(m_long)
## # A tibble: 6 x 7
## `Province/State` `Country/Region` Lat Long case_type Date Case
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 deaths 1/22/20 0
## 2 <NA> Albania 41.2 20.2 deaths 1/22/20 0
## 3 <NA> Algeria 28.0 1.66 deaths 1/22/20 0
## 4 <NA> Andorra 42.5 1.52 deaths 1/22/20 0
## 5 <NA> Angola -11.2 17.9 deaths 1/22/20 0
## 6 <NA> Antigua and Barbuda 17.1 -61.8 deaths 1/22/20 0
# Number of long-format rows per case type.
table(m_long[["case_type"]])
##
## confirmed deaths recovered
## 52136 52136 49588
# Give each case type its own column, then parse Date from its
# month/day/two-digit-year text form.
m_wide <- m_long %>%
  spread(key = case_type, value = Case) %>%
  mutate(Date = as.Date(Date, format = "%m/%d/%y"))
taiwan_stat <- filter(m_wide, `Country/Region` == "Taiwan*")
head(taiwan_stat)
## # A tibble: 6 x 8
## `Province/State` `Country/Region` Lat Long Date confirmed deaths
## <chr> <chr> <dbl> <dbl> <date> <dbl> <dbl>
## 1 <NA> Taiwan* 23.7 121 2020-01-22 1 0
## 2 <NA> Taiwan* 23.7 121 2020-01-23 1 0
## 3 <NA> Taiwan* 23.7 121 2020-01-24 3 0
## 4 <NA> Taiwan* 23.7 121 2020-01-25 3 0
## 5 <NA> Taiwan* 23.7 121 2020-01-26 4 0
## 6 <NA> Taiwan* 23.7 121 2020-01-27 5 0
## # … with 1 more variable: recovered <dbl>
# Overlay the three Taiwan series on a single plot: confirmed opens the plot,
# deaths and recovered are added on top.
plot(confirmed ~ Date, data = taiwan_stat, type = "l", col = "blue")
lines(deaths ~ Date, data = taiwan_stat, col = "red")
lines(recovered ~ Date, data = taiwan_stat, col = "green")

# Draw the three Taiwan series in separate panels stacked in three rows.
# par() returns the settings it replaces; they are restored at the end so the
# 3-row layout does not leak into plots drawn later on the same device.
old_par <- par(mfrow = c(3, 1))
plot(confirmed ~ Date, taiwan_stat, col = "blue", type = "l")
plot(deaths ~ Date, taiwan_stat, col = "red", type = "l")
plot(recovered ~ Date, taiwan_stat, col = "green", type = "l")
par(old_par)

# US rows of the wide table, with all three series drawn on one plot.
us_stat <- filter(m_wide, `Country/Region` == "US")
# head(us_stat)
plot(confirmed ~ Date, data = us_stat, type = "o", col = "blue")
lines(recovered ~ Date, data = us_stat, type = "o", col = "green")
lines(deaths ~ Date, data = us_stat, type = "o", col = "red")

# China has several rows per date, so sum confirmed cases per date.
china_stat <- m_wide %>%
  filter(`Country/Region` == "China") %>%
  select(`Country/Region`, Date, confirmed) %>%
  group_by(`Country/Region`, Date) %>%
  summarize(confirmed = sum(confirmed, na.rm = TRUE))
## `summarise()` regrouping output by 'Country/Region' (override with `.groups` argument)
# Plot the US confirmed curve and overlay the curve held in china_stat.
us_stat <- filter(m_wide, `Country/Region` == "US")
plot(confirmed ~ Date, data = us_stat, type = "l", col = "blue")
lines(confirmed ~ Date, data = china_stat, type = "l", col = "red")
library(ggplot2)
g <- ggplot()

# separate() splits Date on "/" into three new columns: m, d and y.
m_long2 <- separate(m_long, col = Date, into = c("m", "d", "y"), sep = "/")
# m_long2
# separate_rows() instead places each "/"-delimited piece on its own row.
head(separate_rows(m_long, Date, sep = "/"))
## # A tibble: 6 x 7
## `Province/State` `Country/Region` Lat Long case_type Date Case
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 deaths 1 0
## 2 <NA> Afghanistan 33.9 67.7 deaths 22 0
## 3 <NA> Afghanistan 33.9 67.7 deaths 20 0
## 4 <NA> Albania 41.2 20.2 deaths 1 0
## 5 <NA> Albania 41.2 20.2 deaths 22 0
## 6 <NA> Albania 41.2 20.2 deaths 20 0
# Prefix the two-digit year with "20", then glue year and month together
# into a single "yearmonth" column separated by "/".
m_long2 %>%
  mutate(year = paste0("20", y)) %>%
  unite(col = "yearmonth", year, m, sep = "/")
## # A tibble: 153,860 x 9
## `Province/State` `Country/Region` Lat Long case_type yearmonth d
## <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 <NA> Afghanistan 33.9 67.7 deaths 2020/1 22
## 2 <NA> Albania 41.2 20.2 deaths 2020/1 22
## 3 <NA> Algeria 28.0 1.66 deaths 2020/1 22
## 4 <NA> Andorra 42.5 1.52 deaths 2020/1 22
## 5 <NA> Angola -11.2 17.9 deaths 2020/1 22
## 6 <NA> Antigua and Bar… 17.1 -61.8 deaths 2020/1 22
## 7 <NA> Argentina -38.4 -63.6 deaths 2020/1 22
## 8 <NA> Armenia 40.1 45.0 deaths 2020/1 22
## 9 Australian Capi… Australia -35.5 149. deaths 2020/1 22
## 10 New South Wales Australia -33.9 151. deaths 2020/1 22
## # … with 153,850 more rows, and 2 more variables: y <chr>, Case <dbl>
# For the US rows, keep the observation on the latest date of each month,
# separately for each case type, and show the first six results.
m_long %>%
# Parse the text date into a real Date (two-digit year, hence %y) before the
# Date column is consumed by separate() below.
mutate(FDate = as.Date(Date, '%m/%d/%y')) %>%
# Split Date on "/" into month (m), day (d) and year (y) columns.
separate(Date, into=c('m','d','y'), sep = '/') %>%
filter(`Country/Region` == 'US') %>%
select(FDate, case_type, m, Case) %>%
group_by(case_type, m) %>%
# max_date is the latest FDate within each (case_type, m) group; Case and FDate
# are passed through unchanged, so this summarize() keeps every row of the group.
# NOTE(review): newer dplyr releases reportedly deprecate multi-row summarise()
# results in favour of reframe() - confirm against the installed version.
summarize(max_date = max(FDate), Case, FDate) %>%
# Keep only the row that falls on the group's latest date.
filter(max_date == FDate) %>%
head()
## `summarise()` regrouping output by 'case_type', 'm' (override with `.groups` argument)
## # A tibble: 6 x 5
## # Groups: case_type, m [6]
## case_type m max_date Case FDate
## <chr> <chr> <date> <dbl> <date>
## 1 confirmed 1 2020-01-31 7 2020-01-31
## 2 confirmed 2 2020-02-29 24 2020-02-29
## 3 confirmed 3 2020-03-31 188724 2020-03-31
## 4 confirmed 4 2020-04-30 1072667 2020-04-30
## 5 confirmed 5 2020-05-31 1799124 2020-05-31
## 6 confirmed 6 2020-06-30 2636414 2020-06-30
Missing Value
dim(confirmed)
## [1] 266 201
# Count the missing values in each column.
confirmed %>%
  is.na() %>%
  colSums()
## Province/State Country/Region Lat Long 1/22/20
## 185 0 0 0 0
## 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20
## 0 0 0 0 0
## 1/28/20 1/29/20 1/30/20 1/31/20 2/1/20
## 0 0 0 0 0
## 2/2/20 2/3/20 2/4/20 2/5/20 2/6/20
## 0 0 0 0 0
## 2/7/20 2/8/20 2/9/20 2/10/20 2/11/20
## 0 0 0 0 0
## 2/12/20 2/13/20 2/14/20 2/15/20 2/16/20
## 0 0 0 0 0
## 2/17/20 2/18/20 2/19/20 2/20/20 2/21/20
## 0 0 0 0 0
## 2/22/20 2/23/20 2/24/20 2/25/20 2/26/20
## 0 0 0 0 0
## 2/27/20 2/28/20 2/29/20 3/1/20 3/2/20
## 0 0 0 0 0
## 3/3/20 3/4/20 3/5/20 3/6/20 3/7/20
## 0 0 0 0 0
## 3/8/20 3/9/20 3/10/20 3/11/20 3/12/20
## 0 0 0 0 0
## 3/13/20 3/14/20 3/15/20 3/16/20 3/17/20
## 0 0 0 0 0
## 3/18/20 3/19/20 3/20/20 3/21/20 3/22/20
## 0 0 0 0 0
## 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20
## 0 0 0 0 0
## 3/28/20 3/29/20 3/30/20 3/31/20 4/1/20
## 0 0 0 0 0
## 4/2/20 4/3/20 4/4/20 4/5/20 4/6/20
## 0 0 0 0 0
## 4/7/20 4/8/20 4/9/20 4/10/20 4/11/20
## 0 0 0 0 0
## 4/12/20 4/13/20 4/14/20 4/15/20 4/16/20
## 0 0 0 0 0
## 4/17/20 4/18/20 4/19/20 4/20/20 4/21/20
## 0 0 0 0 0
## 4/22/20 4/23/20 4/24/20 4/25/20 4/26/20
## 0 0 0 0 0
## 4/27/20 4/28/20 4/29/20 4/30/20 5/1/20
## 0 0 0 0 0
## 5/2/20 5/3/20 5/4/20 5/5/20 5/6/20
## 0 0 0 0 0
## 5/7/20 5/8/20 5/9/20 5/10/20 5/11/20
## 0 0 0 0 0
## 5/12/20 5/13/20 5/14/20 5/15/20 5/16/20
## 0 0 0 0 0
## 5/17/20 5/18/20 5/19/20 5/20/20 5/21/20
## 0 0 0 0 0
## 5/22/20 5/23/20 5/24/20 5/25/20 5/26/20
## 0 0 0 0 0
## 5/27/20 5/28/20 5/29/20 5/30/20 5/31/20
## 0 0 0 0 0
## 6/1/20 6/2/20 6/3/20 6/4/20 6/5/20
## 0 0 0 0 0
## 6/6/20 6/7/20 6/8/20 6/9/20 6/10/20
## 0 0 0 0 0
## 6/11/20 6/12/20 6/13/20 6/14/20 6/15/20
## 0 0 0 0 0
## 6/16/20 6/17/20 6/18/20 6/19/20 6/20/20
## 0 0 0 0 0
## 6/21/20 6/22/20 6/23/20 6/24/20 6/25/20
## 0 0 0 0 0
## 6/26/20 6/27/20 6/28/20 6/29/20 6/30/20
## 0 0 0 0 0
## 7/1/20 7/2/20 7/3/20 7/4/20 7/5/20
## 0 0 0 0 0
## 7/6/20 7/7/20 7/8/20 7/9/20 7/10/20
## 0 0 0 0 0
## 7/11/20 7/12/20 7/13/20 7/14/20 7/15/20
## 0 0 0 0 0
## 7/16/20 7/17/20 7/18/20 7/19/20 7/20/20
## 0 0 0 0 0
## 7/21/20 7/22/20 7/23/20 7/24/20 7/25/20
## 0 0 0 0 0
## 7/26/20 7/27/20 7/28/20 7/29/20 7/30/20
## 0 0 0 0 0
## 7/31/20 8/1/20 8/2/20 8/3/20 8/4/20
## 0 0 0 0 0
## case_type
## 0
# Total number of missing cells in the table.
sum(is.na(confirmed))
## [1] 185
# After drop_na() removes incomplete rows, recount missing values per column.
colSums(is.na(drop_na(confirmed)))
## Province/State Country/Region Lat Long 1/22/20
## 0 0 0 0 0
## 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20
## 0 0 0 0 0
## 1/28/20 1/29/20 1/30/20 1/31/20 2/1/20
## 0 0 0 0 0
## 2/2/20 2/3/20 2/4/20 2/5/20 2/6/20
## 0 0 0 0 0
## 2/7/20 2/8/20 2/9/20 2/10/20 2/11/20
## 0 0 0 0 0
## 2/12/20 2/13/20 2/14/20 2/15/20 2/16/20
## 0 0 0 0 0
## 2/17/20 2/18/20 2/19/20 2/20/20 2/21/20
## 0 0 0 0 0
## 2/22/20 2/23/20 2/24/20 2/25/20 2/26/20
## 0 0 0 0 0
## 2/27/20 2/28/20 2/29/20 3/1/20 3/2/20
## 0 0 0 0 0
## 3/3/20 3/4/20 3/5/20 3/6/20 3/7/20
## 0 0 0 0 0
## 3/8/20 3/9/20 3/10/20 3/11/20 3/12/20
## 0 0 0 0 0
## 3/13/20 3/14/20 3/15/20 3/16/20 3/17/20
## 0 0 0 0 0
## 3/18/20 3/19/20 3/20/20 3/21/20 3/22/20
## 0 0 0 0 0
## 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20
## 0 0 0 0 0
## 3/28/20 3/29/20 3/30/20 3/31/20 4/1/20
## 0 0 0 0 0
## 4/2/20 4/3/20 4/4/20 4/5/20 4/6/20
## 0 0 0 0 0
## 4/7/20 4/8/20 4/9/20 4/10/20 4/11/20
## 0 0 0 0 0
## 4/12/20 4/13/20 4/14/20 4/15/20 4/16/20
## 0 0 0 0 0
## 4/17/20 4/18/20 4/19/20 4/20/20 4/21/20
## 0 0 0 0 0
## 4/22/20 4/23/20 4/24/20 4/25/20 4/26/20
## 0 0 0 0 0
## 4/27/20 4/28/20 4/29/20 4/30/20 5/1/20
## 0 0 0 0 0
## 5/2/20 5/3/20 5/4/20 5/5/20 5/6/20
## 0 0 0 0 0
## 5/7/20 5/8/20 5/9/20 5/10/20 5/11/20
## 0 0 0 0 0
## 5/12/20 5/13/20 5/14/20 5/15/20 5/16/20
## 0 0 0 0 0
## 5/17/20 5/18/20 5/19/20 5/20/20 5/21/20
## 0 0 0 0 0
## 5/22/20 5/23/20 5/24/20 5/25/20 5/26/20
## 0 0 0 0 0
## 5/27/20 5/28/20 5/29/20 5/30/20 5/31/20
## 0 0 0 0 0
## 6/1/20 6/2/20 6/3/20 6/4/20 6/5/20
## 0 0 0 0 0
## 6/6/20 6/7/20 6/8/20 6/9/20 6/10/20
## 0 0 0 0 0
## 6/11/20 6/12/20 6/13/20 6/14/20 6/15/20
## 0 0 0 0 0
## 6/16/20 6/17/20 6/18/20 6/19/20 6/20/20
## 0 0 0 0 0
## 6/21/20 6/22/20 6/23/20 6/24/20 6/25/20
## 0 0 0 0 0
## 6/26/20 6/27/20 6/28/20 6/29/20 6/30/20
## 0 0 0 0 0
## 7/1/20 7/2/20 7/3/20 7/4/20 7/5/20
## 0 0 0 0 0
## 7/6/20 7/7/20 7/8/20 7/9/20 7/10/20
## 0 0 0 0 0
## 7/11/20 7/12/20 7/13/20 7/14/20 7/15/20
## 0 0 0 0 0
## 7/16/20 7/17/20 7/18/20 7/19/20 7/20/20
## 0 0 0 0 0
## 7/21/20 7/22/20 7/23/20 7/24/20 7/25/20
## 0 0 0 0 0
## 7/26/20 7/27/20 7/28/20 7/29/20 7/30/20
## 0 0 0 0 0
## 7/31/20 8/1/20 8/2/20 8/3/20 8/4/20
## 0 0 0 0 0
## case_type
## 0
# With .direction = "up", fill() replaces each NA with the next non-missing
# value found further down the column.
df <- data.frame(
  idx = c(1, 2, 3, 4, 5),
  col = c(1, NA, NA, 2, 3)
)
fill(df, col, .direction = "up")
## idx col
## 1 1 1
## 2 2 2
## 3 3 2
## 4 4 2
## 5 5 3
# Replace missing province names with a placeholder string.
replace_na(confirmed, list(`Province/State` = "NoPROVINCE"))
## # A tibble: 266 x 201
## `Province/State` `Country/Region` Lat Long `1/22/20` `1/23/20` `1/24/20`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NoPROVINCE Afghanistan 33.9 67.7 0 0 0
## 2 NoPROVINCE Albania 41.2 20.2 0 0 0
## 3 NoPROVINCE Algeria 28.0 1.66 0 0 0
## 4 NoPROVINCE Andorra 42.5 1.52 0 0 0
## 5 NoPROVINCE Angola -11.2 17.9 0 0 0
## 6 NoPROVINCE Antigua and Bar… 17.1 -61.8 0 0 0
## 7 NoPROVINCE Argentina -38.4 -63.6 0 0 0
## 8 NoPROVINCE Armenia 40.1 45.0 0 0 0
## 9 Australian Capi… Australia -35.5 149. 0 0 0
## 10 New South Wales Australia -33.9 151. 0 0 0
## # … with 256 more rows, and 194 more variables: `1/25/20` <dbl>,
## # `1/26/20` <dbl>, `1/27/20` <dbl>, `1/28/20` <dbl>, `1/29/20` <dbl>,
## # `1/30/20` <dbl>, `1/31/20` <dbl>, `2/1/20` <dbl>, `2/2/20` <dbl>,
## # `2/3/20` <dbl>, `2/4/20` <dbl>, `2/5/20` <dbl>, `2/6/20` <dbl>,
## # `2/7/20` <dbl>, `2/8/20` <dbl>, `2/9/20` <dbl>, `2/10/20` <dbl>,
## # `2/11/20` <dbl>, `2/12/20` <dbl>, `2/13/20` <dbl>, `2/14/20` <dbl>,
## # `2/15/20` <dbl>, `2/16/20` <dbl>, `2/17/20` <dbl>, `2/18/20` <dbl>,
## # `2/19/20` <dbl>, `2/20/20` <dbl>, `2/21/20` <dbl>, `2/22/20` <dbl>,
## # `2/23/20` <dbl>, `2/24/20` <dbl>, `2/25/20` <dbl>, `2/26/20` <dbl>,
## # `2/27/20` <dbl>, `2/28/20` <dbl>, `2/29/20` <dbl>, `3/1/20` <dbl>,
## # `3/2/20` <dbl>, `3/3/20` <dbl>, `3/4/20` <dbl>, `3/5/20` <dbl>,
## # `3/6/20` <dbl>, `3/7/20` <dbl>, `3/8/20` <dbl>, `3/9/20` <dbl>,
## # `3/10/20` <dbl>, `3/11/20` <dbl>, `3/12/20` <dbl>, `3/13/20` <dbl>,
## # `3/14/20` <dbl>, `3/15/20` <dbl>, `3/16/20` <dbl>, `3/17/20` <dbl>,
## # `3/18/20` <dbl>, `3/19/20` <dbl>, `3/20/20` <dbl>, `3/21/20` <dbl>,
## # `3/22/20` <dbl>, `3/23/20` <dbl>, `3/24/20` <dbl>, `3/25/20` <dbl>,
## # `3/26/20` <dbl>, `3/27/20` <dbl>, `3/28/20` <dbl>, `3/29/20` <dbl>,
## # `3/30/20` <dbl>, `3/31/20` <dbl>, `4/1/20` <dbl>, `4/2/20` <dbl>,
## # `4/3/20` <dbl>, `4/4/20` <dbl>, `4/5/20` <dbl>, `4/6/20` <dbl>,
## # `4/7/20` <dbl>, `4/8/20` <dbl>, `4/9/20` <dbl>, `4/10/20` <dbl>,
## # `4/11/20` <dbl>, `4/12/20` <dbl>, `4/13/20` <dbl>, `4/14/20` <dbl>,
## # `4/15/20` <dbl>, `4/16/20` <dbl>, `4/17/20` <dbl>, `4/18/20` <dbl>,
## # `4/19/20` <dbl>, `4/20/20` <dbl>, `4/21/20` <dbl>, `4/22/20` <dbl>,
## # `4/23/20` <dbl>, `4/24/20` <dbl>, `4/25/20` <dbl>, `4/26/20` <dbl>,
## # `4/27/20` <dbl>, `4/28/20` <dbl>, `4/29/20` <dbl>, `4/30/20` <dbl>,
## # `5/1/20` <dbl>, `5/2/20` <dbl>, `5/3/20` <dbl>, …