#install.packages('RSQLite')
library(RSQLite)
## Warning: package 'RSQLite' was built under R version 4.0.2
## 確診案例
confirmed_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed_file <- 'time_series_covid19_confirmed_global.csv'
download.file(confirmed_url, confirmed_file)
##死亡案例
deaths_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
deaths_file <- 'time_series_covid19_deaths_global.csv'
download.file(deaths_url, deaths_file)
## 康復案例
recovered_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
recovered_file <- 'time_series_covid19_recovered_global.csv'
download.file(recovered_url, recovered_file)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
recovered_df <- read_csv('time_series_covid19_recovered_global.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
recovered_df$Case <- 'recovered'
dim(recovered_df)
## [1] 267 208
deaths_df <- read_csv('time_series_covid19_deaths_global.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
deaths_df$Case <- 'deaths'
dim(deaths_df)
## [1] 280 208
confirmed_df <- read_csv('time_series_covid19_confirmed_global.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
confirmed_df$Case <- 'confirmed'
dim(confirmed_df)
## [1] 280 208
merged_df <- list(confirmed = confirmed_df, deaths = deaths_df, recovered = recovered_df)
covid19_ts_df <- do.call(rbind, merged_df)
dim(covid19_ts_df)
## [1] 827 208
col_names <- colnames(covid19_ts_df)
#col_names
date_cols <- col_names[5:(length(col_names) - 1) ]
#date_cols
covid19_tidy_df <- covid19_ts_df %>%
gather(Date, Case_Number, date_cols)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(date_cols)` instead of `date_cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
covid19_tidy_df$Date <- as.Date(covid19_tidy_df$Date, format = '%m/%d/%y')
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
covid19_tidy_df$Date <- format(as_date(covid19_tidy_df$Date), format="%Y-%m-%d")
#head(covid19_tidy_df)
寫入資料庫
library(DBI)
library(RSQLite)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
dbWriteTable(conn = con, name = 'covid19', value = covid19_tidy_df, overwrite=TRUE)
dbDisconnect(conn = con)
讀取資料庫
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
df <- dbReadTable(conn = con, name = 'covid19')
head(df)
## Province.State Country.Region Lat Long Case Date
## 1 <NA> Afghanistan 33.93911 67.70995 confirmed 2020-01-22
## 2 <NA> Albania 41.15330 20.16830 confirmed 2020-01-22
## 3 <NA> Algeria 28.03390 1.65960 confirmed 2020-01-22
## 4 <NA> Andorra 42.50630 1.52180 confirmed 2020-01-22
## 5 <NA> Angola -11.20270 17.87390 confirmed 2020-01-22
## 6 <NA> Antigua and Barbuda 17.06080 -61.79640 confirmed 2020-01-22
## Case_Number
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
df2 <- dbGetQuery(conn = con, statement = 'SELECT * FROM covid19;')
head(df2)
## Province/State Country/Region Lat Long Case Date
## 1 <NA> Afghanistan 33.93911 67.70995 confirmed 2020-01-22
## 2 <NA> Albania 41.15330 20.16830 confirmed 2020-01-22
## 3 <NA> Algeria 28.03390 1.65960 confirmed 2020-01-22
## 4 <NA> Andorra 42.50630 1.52180 confirmed 2020-01-22
## 5 <NA> Angola -11.20270 17.87390 confirmed 2020-01-22
## 6 <NA> Antigua and Barbuda 17.06080 -61.79640 confirmed 2020-01-22
## Case_Number
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
dbDisconnect(conn = con)
使用SELECT 查詢資料
library(DBI)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
#df <- dbGetQuery(conn = con, statement = 'SELECT * FROM covid19;')
#df <- dbGetQuery(conn = con, statement = 'SELECT Lat, Long FROM covid19;')
#df <- dbGetQuery(conn = con, statement = 'SELECT `Country/Region`, `Case`, Date, Case_Number FROM covid19;')
df <- dbGetQuery(conn = con, statement = 'SELECT `Country/Region` AS Country, `Case`, Date, Case_Number FROM covid19;')
dbDisconnect(conn = con)
head(df)
## Country Case Date Case_Number
## 1 Afghanistan confirmed 2020-01-22 0
## 2 Albania confirmed 2020-01-22 0
## 3 Algeria confirmed 2020-01-22 0
## 4 Andorra confirmed 2020-01-22 0
## 5 Angola confirmed 2020-01-22 0
## 6 Antigua and Barbuda confirmed 2020-01-22 0
使用WHERE 篩選資料
library(DBI)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
#df <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Country/Region` = 'Taiwan*';")
#df <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Country/Region` = 'Taiwan*' AND `Case` = 'confirmed' ;")
df <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Country/Region` = 'Taiwan*' OR `Country/Region` = 'US' ;")
df <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Country/Region` IN ('Taiwan*', 'US') ;")
dbDisconnect(conn = con)
head(df)
## Province/State Country/Region Lat Long Case Date Case_Number
## 1 <NA> Taiwan* 23.7 121 confirmed 2020-01-22 1
## 2 <NA> US 40.0 -100 confirmed 2020-01-22 1
## 3 <NA> Taiwan* 23.7 121 deaths 2020-01-22 0
## 4 <NA> US 40.0 -100 deaths 2020-01-22 0
## 5 <NA> Taiwan* 23.7 121 recovered 2020-01-22 0
## 6 <NA> US 40.0 -100 recovered 2020-01-22 0
資料排序
library(DBI)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
#df <- dbGetQuery(conn = con, statement = 'SELECT * FROM covid19 ORDER BY Case_Number;')
#df <- dbGetQuery(conn = con, statement = 'SELECT * FROM covid19 ORDER BY Case_Number DESC;')
df <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Case` = 'confirmed' AND Date = '2020-08-11' ORDER BY Case_Number DESC LIMIT 10;")
dbDisconnect(conn = con)
head(df,10)
## Province/State Country/Region Lat Long Case Date
## 1 <NA> US 40.00000 -100.00000 confirmed 2020-08-11
## 2 <NA> Brazil -14.23500 -51.92530 confirmed 2020-08-11
## 3 <NA> India 20.59368 78.96288 confirmed 2020-08-11
## 4 <NA> Russia 61.52401 105.31876 confirmed 2020-08-11
## 5 <NA> South Africa -30.55950 22.93750 confirmed 2020-08-11
## 6 <NA> Mexico 23.63450 -102.55280 confirmed 2020-08-11
## 7 <NA> Peru -9.19000 -75.01520 confirmed 2020-08-11
## 8 <NA> Colombia 4.57090 -74.29730 confirmed 2020-08-11
## 9 <NA> Chile -35.67510 -71.54300 confirmed 2020-08-11
## 10 <NA> Iran 32.42791 53.68805 confirmed 2020-08-11
## Case_Number
## 1 5141208
## 2 3057470
## 3 2329638
## 4 895691
## 5 566109
## 6 492522
## 7 483133
## 8 410453
## 9 376616
## 10 331189
限制回傳筆數
library(DBI)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
#df <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 LIMIT 3;")
df <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 LIMIT 3 OFFSET 3; ")
dbDisconnect(conn = con)
head(df,10)
## Province/State Country/Region Lat Long Case Date
## 1 <NA> Andorra 42.5063 1.5218 confirmed 2020-01-22
## 2 <NA> Angola -11.2027 17.8739 confirmed 2020-01-22
## 3 <NA> Antigua and Barbuda 17.0608 -61.7964 confirmed 2020-01-22
## Case_Number
## 1 0
## 2 0
## 3 0
練習題
library(DBI)
select_statement <- "
SELECT `Country/Region`, `Case`, Date, Case_Number
FROM covid19
WHERE `Country/Region` = 'Taiwan*' AND `Case` = 'confirmed'
ORDER BY Case_Number DESC
LIMIT 10
"
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = select_statement)
dbDisconnect(conn = con)
head(df)
## Country/Region Case Date Case_Number
## 1 Taiwan* confirmed 2020-08-11 480
## 2 Taiwan* confirmed 2020-08-08 479
## 3 Taiwan* confirmed 2020-08-06 477
## 4 Taiwan* confirmed 2020-08-07 477
## 5 Taiwan* confirmed 2020-08-09 477
## 6 Taiwan* confirmed 2020-08-10 477
#confirmed_df[confirmed_df$`Country/Region` == 'Taiwan*',]
聚合資料
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
#df <- dbGetQuery(conn = con, statement = 'SELECT COUNT(*) AS CNT FROM covid19;')
#df <- dbGetQuery(conn = con, statement = 'SELECT MAX(Case_Number) AS CNT FROM covid19;')
#df <- dbGetQuery(conn = con, statement = 'SELECT MIN(Case_Number) AS CNT FROM covid19;')
#df <- dbGetQuery(conn = con, statement = 'SELECT AVG(Case_Number) AS CNT FROM covid19;')
#df <- dbGetQuery(conn = con, statement = 'SELECT `Country/Region`, MAX(Case_Number) AS CNT FROM covid19 GROUP BY `Country/Region`;')
#df <- dbGetQuery(conn = con, statement = 'SELECT `Country/Region`, `Case`, MAX(Case_Number) AS CNT FROM covid19 GROUP BY `Country/Region`, `Case`;')
#df <- dbGetQuery(conn = con, statement = "SELECT `Country/Region`, `Case`, MAX(Case_Number) AS case_number FROM covid19 WHERE `Case` = 'confirmed' GROUP BY `Country/Region`, `Case` HAVING case_number > 1000000;")
df <- dbGetQuery(conn = con, statement = "SELECT `Country/Region`, `Case`, MAX(Case_Number) AS case_number FROM covid19 WHERE `Case` = 'confirmed' GROUP BY `Country/Region`, `Case` HAVING case_number > 1000000 ORDER BY case_number DESC;")
dbDisconnect(conn = con)
head(df)
## Country/Region Case case_number
## 1 US confirmed 5141208
## 2 Brazil confirmed 3057470
## 3 India confirmed 2329638
建立資料表
create_statement <- "
CREATE TABLE country_stat(
country VARCHAR(128) NOT NULL,
cnt_sum INTEGER
);
"
drop_statement <- "
DROP TABLE country_stat;
"
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = drop_statement)
## [1] 0
dbExecute(conn = con, statement = create_statement)
## [1] 0
dbDisconnect(con)
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbListTables(conn = con)
## [1] "country_cnt" "country_stat" "covid19"
dbDisconnect(con)
alter_statement <- '
ALTER TABLE country_stat ADD COLUMN dt DATE;
'
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = alter_statement)
## [1] 0
dbDisconnect(con)
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM country_stat;')
dbDisconnect(con)
str(df)
## 'data.frame': 0 obs. of 3 variables:
## $ country: chr
## $ cnt_sum: int
## $ dt : num
新增資料
insert_statement <- "insert into country_stat(country, cnt_sum, dt) values('Taiwan', 0, '2020-08-10');"
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = insert_statement)
## [1] 1
dbDisconnect(con)
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM country_stat;')
dbDisconnect(con)
df
## country cnt_sum dt
## 1 Taiwan 0 2020-08-10
update_statement <- "UPDATE country_stat SET cnt_sum = 100 WHERE country = 'Taiwan';"
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = update_statement)
## [1] 1
dbDisconnect(con)
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM country_stat;')
dbDisconnect(con)
df
## country cnt_sum dt
## 1 Taiwan 100 2020-08-10
資料刪除
delete_statement <- "delete from country_stat where country ='Taiwan'; "
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = delete_statement)
## [1] 1
dbDisconnect(con)
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM country_stat;')
dbDisconnect(con)
df
## [1] country cnt_sum dt
## <0 rows> (or 0-length row.names)
select_into_statement <- "
INSERT INTO country_stat(country, cnt_sum, dt)
SELECT `Country/Region`, MAX(`Case_Number`) AS cnt, MAX(`Date`) FROM covid19
WHERE `Case` = 'confirmed' GROUP BY `Country/Region`;
"
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = select_into_statement)
## [1] 188
dbDisconnect(con)
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM country_stat;')
dbDisconnect(con)
head(df)
## country cnt_sum dt
## 1 Afghanistan 37269 2020-08-11
## 2 Albania 6676 2020-08-11
## 3 Algeria 36204 2020-08-11
## 4 Andorra 963 2020-08-11
## 5 Angola 1735 2020-08-11
## 6 Antigua and Barbuda 92 2020-08-11
合併表格
join_statement <- "
SELECT * FROM covid19 INNER JOIN country_stat ON covid19.`Country/Region` = country_stat.country;
"
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = join_statement)
dbDisconnect(con)
head(df)
## Province/State Country/Region Lat Long Case Date
## 1 <NA> Afghanistan 33.93911 67.70995 confirmed 2020-01-22
## 2 <NA> Albania 41.15330 20.16830 confirmed 2020-01-22
## 3 <NA> Algeria 28.03390 1.65960 confirmed 2020-01-22
## 4 <NA> Andorra 42.50630 1.52180 confirmed 2020-01-22
## 5 <NA> Angola -11.20270 17.87390 confirmed 2020-01-22
## 6 <NA> Antigua and Barbuda 17.06080 -61.79640 confirmed 2020-01-22
## Case_Number country cnt_sum dt
## 1 0 Afghanistan 37269 2020-08-11
## 2 0 Albania 6676 2020-08-11
## 3 0 Algeria 36204 2020-08-11
## 4 0 Andorra 963 2020-08-11
## 5 0 Angola 1735 2020-08-11
## 6 0 Antigua and Barbuda 92 2020-08-11
練習題
select_statement <- "
SELECT `Country/Region` as country, `Case`, max(Case_Number) as Case_Number FROM covid19 WHERE Date = '2020-08-11' GROUP BY `Country/Region`, `Case`
"
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = select_statement)
dbDisconnect(con)
#head(df)
#df
m <- df %>%
spread(key = `Case`, value = Case_Number)
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbWriteTable(conn = con, name = 'country_cnt',value = m, overwrite=TRUE)
dbDisconnect(con)
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df2 <- dbGetQuery(conn = con, statement = "select * from country_cnt;")
dbDisconnect(con)
head(df2)
## country confirmed deaths recovered
## 1 Afghanistan 37269 1344 26415
## 2 Albania 6676 205 3480
## 3 Algeria 36204 1322 25263
## 4 Andorra 963 52 839
## 5 Angola 1735 80 575
## 6 Antigua and Barbuda 92 3 76
RPostgres
#install.packages('RPostgres’)
con <- dbConnect(drv = RPostgres::Postgres(),
user = 'test', password = 'test', dbname = 'postgres',
host = 'localhost')
dbWriteTable(conn = con, name = 'covid19', value = covid19_tidy_df, overwrite=TRUE)
dbDisconnect(conn = con)
con <- dbConnect(drv = RPostgres::Postgres(),
user = 'test', password = 'test', dbname = 'postgres',
host = 'localhost')
df <- dbGetQuery(con, "SELECT * FROM covid19 ;")
head(df)
## Province/State Country/Region Lat Long Case Date
## 1 <NA> Afghanistan 33.93911 67.70995 confirmed 2020-01-22
## 2 <NA> Albania 41.15330 20.16830 confirmed 2020-01-22
## 3 <NA> Algeria 28.03390 1.65960 confirmed 2020-01-22
## 4 <NA> Andorra 42.50630 1.52180 confirmed 2020-01-22
## 5 <NA> Angola -11.20270 17.87390 confirmed 2020-01-22
## 6 <NA> Antigua and Barbuda 17.06080 -61.79640 confirmed 2020-01-22
## Case_Number
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
dbDisconnect(conn= con)