#install.packages('RSQLite')

library(RSQLite)
## Warning: package 'RSQLite' was built under R version 4.0.2
## 確診案例
confirmed_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed_file <- 'time_series_covid19_confirmed_global.csv'
download.file(confirmed_url, confirmed_file)


##死亡案例
deaths_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
deaths_file <- 'time_series_covid19_deaths_global.csv'
download.file(deaths_url, deaths_file)

## 康復案例
recovered_url <- 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
recovered_file <- 'time_series_covid19_recovered_global.csv'
download.file(recovered_url, recovered_file)


library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
recovered_df <- read_csv('time_series_covid19_recovered_global.csv')
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
recovered_df$Case <- 'recovered'
dim(recovered_df)
## [1] 267 208
deaths_df <- read_csv('time_series_covid19_deaths_global.csv')
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
deaths_df$Case <- 'deaths'
dim(deaths_df)
## [1] 280 208
confirmed_df <- read_csv('time_series_covid19_confirmed_global.csv')
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
confirmed_df$Case <- 'confirmed'
dim(confirmed_df)
## [1] 280 208
merged_df <- list(confirmed = confirmed_df, deaths = deaths_df, recovered = recovered_df)

covid19_ts_df <- do.call(rbind, merged_df)
dim(covid19_ts_df)
## [1] 827 208
col_names <- colnames(covid19_ts_df)
#col_names
date_cols <- col_names[5:(length(col_names) - 1) ]
#date_cols

covid19_tidy_df <- covid19_ts_df %>%
   gather(Date, Case_Number, date_cols)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(date_cols)` instead of `date_cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
covid19_tidy_df$Date <- as.Date(covid19_tidy_df$Date, format = '%m/%d/%y')

library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
covid19_tidy_df$Date <- format(as_date(covid19_tidy_df$Date), format="%Y-%m-%d")

#head(covid19_tidy_df)

寫入資料庫

library(DBI)
library(RSQLite)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
dbWriteTable(conn = con, name = 'covid19', value = covid19_tidy_df, overwrite=TRUE)
dbDisconnect(conn = con)

讀取資料庫

con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
df <- dbReadTable(conn = con, name = 'covid19')

head(df)
##   Province.State      Country.Region       Lat      Long      Case       Date
## 1           <NA>         Afghanistan  33.93911  67.70995 confirmed 2020-01-22
## 2           <NA>             Albania  41.15330  20.16830 confirmed 2020-01-22
## 3           <NA>             Algeria  28.03390   1.65960 confirmed 2020-01-22
## 4           <NA>             Andorra  42.50630   1.52180 confirmed 2020-01-22
## 5           <NA>              Angola -11.20270  17.87390 confirmed 2020-01-22
## 6           <NA> Antigua and Barbuda  17.06080 -61.79640 confirmed 2020-01-22
##   Case_Number
## 1           0
## 2           0
## 3           0
## 4           0
## 5           0
## 6           0
df2 <- dbGetQuery(conn = con, statement = 'SELECT * FROM covid19;')
head(df2)
##   Province/State      Country/Region       Lat      Long      Case       Date
## 1           <NA>         Afghanistan  33.93911  67.70995 confirmed 2020-01-22
## 2           <NA>             Albania  41.15330  20.16830 confirmed 2020-01-22
## 3           <NA>             Algeria  28.03390   1.65960 confirmed 2020-01-22
## 4           <NA>             Andorra  42.50630   1.52180 confirmed 2020-01-22
## 5           <NA>              Angola -11.20270  17.87390 confirmed 2020-01-22
## 6           <NA> Antigua and Barbuda  17.06080 -61.79640 confirmed 2020-01-22
##   Case_Number
## 1           0
## 2           0
## 3           0
## 4           0
## 5           0
## 6           0
dbDisconnect(conn = con)

使用SELECT 查詢資料

library(DBI)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
#df  <- dbGetQuery(conn = con, statement = 'SELECT * FROM covid19;')
#df  <- dbGetQuery(conn = con, statement = 'SELECT Lat, Long FROM covid19;')
#df  <- dbGetQuery(conn = con, statement = 'SELECT `Country/Region`, `Case`, Date, Case_Number FROM covid19;')

df  <- dbGetQuery(conn = con, statement = 'SELECT `Country/Region` AS Country, `Case`, Date, Case_Number FROM covid19;')

dbDisconnect(conn = con)

head(df)
##               Country      Case       Date Case_Number
## 1         Afghanistan confirmed 2020-01-22           0
## 2             Albania confirmed 2020-01-22           0
## 3             Algeria confirmed 2020-01-22           0
## 4             Andorra confirmed 2020-01-22           0
## 5              Angola confirmed 2020-01-22           0
## 6 Antigua and Barbuda confirmed 2020-01-22           0

使用WHERE 篩選資料

library(DBI)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')

#df  <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Country/Region` = 'Taiwan*';")

#df  <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Country/Region` = 'Taiwan*' AND `Case` = 'confirmed' ;")

df  <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Country/Region` = 'Taiwan*' OR `Country/Region` = 'US' ;")


df  <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Country/Region` IN ('Taiwan*', 'US') ;")

dbDisconnect(conn = con)

head(df)
##   Province/State Country/Region  Lat Long      Case       Date Case_Number
## 1           <NA>        Taiwan* 23.7  121 confirmed 2020-01-22           1
## 2           <NA>             US 40.0 -100 confirmed 2020-01-22           1
## 3           <NA>        Taiwan* 23.7  121    deaths 2020-01-22           0
## 4           <NA>             US 40.0 -100    deaths 2020-01-22           0
## 5           <NA>        Taiwan* 23.7  121 recovered 2020-01-22           0
## 6           <NA>             US 40.0 -100 recovered 2020-01-22           0

資料排序

library(DBI)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
#df  <- dbGetQuery(conn = con, statement = 'SELECT * FROM covid19 ORDER BY Case_Number;')

#df  <- dbGetQuery(conn = con, statement = 'SELECT * FROM covid19 ORDER BY Case_Number DESC;')

df  <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 WHERE `Case` = 'confirmed' AND Date = '2020-08-11' ORDER BY Case_Number DESC LIMIT 10;")

dbDisconnect(conn = con)

head(df,10)
##    Province/State Country/Region       Lat       Long      Case       Date
## 1            <NA>             US  40.00000 -100.00000 confirmed 2020-08-11
## 2            <NA>         Brazil -14.23500  -51.92530 confirmed 2020-08-11
## 3            <NA>          India  20.59368   78.96288 confirmed 2020-08-11
## 4            <NA>         Russia  61.52401  105.31876 confirmed 2020-08-11
## 5            <NA>   South Africa -30.55950   22.93750 confirmed 2020-08-11
## 6            <NA>         Mexico  23.63450 -102.55280 confirmed 2020-08-11
## 7            <NA>           Peru  -9.19000  -75.01520 confirmed 2020-08-11
## 8            <NA>       Colombia   4.57090  -74.29730 confirmed 2020-08-11
## 9            <NA>          Chile -35.67510  -71.54300 confirmed 2020-08-11
## 10           <NA>           Iran  32.42791   53.68805 confirmed 2020-08-11
##    Case_Number
## 1      5141208
## 2      3057470
## 3      2329638
## 4       895691
## 5       566109
## 6       492522
## 7       483133
## 8       410453
## 9       376616
## 10      331189

限制回傳筆數

library(DBI)
con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
#df  <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 LIMIT 3;")
df  <- dbGetQuery(conn = con, statement = "SELECT * FROM covid19 LIMIT 3 OFFSET 3; ")
dbDisconnect(conn = con)

head(df,10)
##   Province/State      Country/Region      Lat     Long      Case       Date
## 1           <NA>             Andorra  42.5063   1.5218 confirmed 2020-01-22
## 2           <NA>              Angola -11.2027  17.8739 confirmed 2020-01-22
## 3           <NA> Antigua and Barbuda  17.0608 -61.7964 confirmed 2020-01-22
##   Case_Number
## 1           0
## 2           0
## 3           0

練習題

library(DBI)

select_statement <- "
SELECT `Country/Region`, `Case`, Date, Case_Number 
FROM covid19 
WHERE `Country/Region` = 'Taiwan*' AND `Case` = 'confirmed'
ORDER BY Case_Number DESC 
LIMIT 10
"

con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
df  <- dbGetQuery(conn = con, statement = select_statement)
dbDisconnect(conn = con)

head(df)
##   Country/Region      Case       Date Case_Number
## 1        Taiwan* confirmed 2020-08-11         480
## 2        Taiwan* confirmed 2020-08-08         479
## 3        Taiwan* confirmed 2020-08-06         477
## 4        Taiwan* confirmed 2020-08-07         477
## 5        Taiwan* confirmed 2020-08-09         477
## 6        Taiwan* confirmed 2020-08-10         477
#confirmed_df[confirmed_df$`Country/Region`  == 'Taiwan*',]

聚合資料

con <- dbConnect(RSQLite::SQLite(), dbname = 'demo.db')
#df  <- dbGetQuery(conn = con, statement = 'SELECT COUNT(*) AS CNT FROM covid19;')
#df  <- dbGetQuery(conn = con, statement = 'SELECT MAX(Case_Number) AS CNT FROM covid19;')
#df  <- dbGetQuery(conn = con, statement = 'SELECT MIN(Case_Number) AS CNT FROM covid19;')
#df  <- dbGetQuery(conn = con, statement = 'SELECT AVG(Case_Number) AS CNT FROM covid19;')

#df  <- dbGetQuery(conn = con, statement = 'SELECT `Country/Region`, MAX(Case_Number) AS CNT FROM covid19 GROUP BY `Country/Region`;')

#df  <- dbGetQuery(conn = con, statement = 'SELECT `Country/Region`, `Case`, MAX(Case_Number) AS CNT FROM covid19 GROUP BY `Country/Region`, `Case`;')


#df  <- dbGetQuery(conn = con, statement = "SELECT `Country/Region`, `Case`, MAX(Case_Number) AS case_number FROM covid19 WHERE `Case` = 'confirmed' GROUP BY `Country/Region`, `Case` HAVING case_number > 1000000;")
df  <- dbGetQuery(conn = con, statement = "SELECT `Country/Region`, `Case`, MAX(Case_Number) AS case_number FROM covid19 WHERE `Case` = 'confirmed' GROUP BY `Country/Region`, `Case` HAVING case_number > 1000000 ORDER BY case_number DESC;")

dbDisconnect(conn = con)

head(df)
##   Country/Region      Case case_number
## 1             US confirmed     5141208
## 2         Brazil confirmed     3057470
## 3          India confirmed     2329638

建立資料表

create_statement <- "
CREATE TABLE country_stat(
  country VARCHAR(128) NOT NULL,
  cnt_sum INTEGER
);
"

drop_statement <- "
DROP TABLE country_stat;
"

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = drop_statement)
## [1] 0
dbExecute(conn = con, statement = create_statement)
## [1] 0
dbDisconnect(con)

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbListTables(conn = con)
## [1] "country_cnt"  "country_stat" "covid19"
dbDisconnect(con)

alter_statement <- '
ALTER TABLE country_stat ADD COLUMN dt DATE;
'

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = alter_statement)
## [1] 0
dbDisconnect(con)

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM  country_stat;')
dbDisconnect(con)

str(df)
## 'data.frame':    0 obs. of  3 variables:
##  $ country: chr 
##  $ cnt_sum: int 
##  $ dt     : num

新增資料

insert_statement <- "insert into country_stat(country, cnt_sum, dt) values('Taiwan', 0, '2020-08-10');"

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = insert_statement)
## [1] 1
dbDisconnect(con)

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM  country_stat;')
dbDisconnect(con)
df
##   country cnt_sum         dt
## 1  Taiwan       0 2020-08-10
update_statement <- "UPDATE country_stat SET cnt_sum = 100 WHERE country = 'Taiwan';"
con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = update_statement)
## [1] 1
dbDisconnect(con)

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM  country_stat;')
dbDisconnect(con)
df
##   country cnt_sum         dt
## 1  Taiwan     100 2020-08-10

資料刪除

delete_statement <- "delete from country_stat  where country ='Taiwan'; "

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = delete_statement)
## [1] 1
dbDisconnect(con)

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM  country_stat;')
dbDisconnect(con)
df
## [1] country cnt_sum dt     
## <0 rows> (or 0-length row.names)
select_into_statement <- "
INSERT INTO country_stat(country, cnt_sum, dt) 
SELECT `Country/Region`, MAX(`Case_Number`) AS cnt, MAX(`Date`) FROM covid19 
WHERE `Case` = 'confirmed' GROUP BY `Country/Region`;
"

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbExecute(conn = con, statement = select_into_statement)
## [1] 188
dbDisconnect(con)

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = 'SELECT * FROM  country_stat;')
dbDisconnect(con)
head(df)
##               country cnt_sum         dt
## 1         Afghanistan   37269 2020-08-11
## 2             Albania    6676 2020-08-11
## 3             Algeria   36204 2020-08-11
## 4             Andorra     963 2020-08-11
## 5              Angola    1735 2020-08-11
## 6 Antigua and Barbuda      92 2020-08-11

合併表格

join_statement <- "
SELECT * FROM covid19 INNER JOIN country_stat ON covid19.`Country/Region` = country_stat.country;
"


con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = join_statement)
dbDisconnect(con)
head(df)
##   Province/State      Country/Region       Lat      Long      Case       Date
## 1           <NA>         Afghanistan  33.93911  67.70995 confirmed 2020-01-22
## 2           <NA>             Albania  41.15330  20.16830 confirmed 2020-01-22
## 3           <NA>             Algeria  28.03390   1.65960 confirmed 2020-01-22
## 4           <NA>             Andorra  42.50630   1.52180 confirmed 2020-01-22
## 5           <NA>              Angola -11.20270  17.87390 confirmed 2020-01-22
## 6           <NA> Antigua and Barbuda  17.06080 -61.79640 confirmed 2020-01-22
##   Case_Number             country cnt_sum         dt
## 1           0         Afghanistan   37269 2020-08-11
## 2           0             Albania    6676 2020-08-11
## 3           0             Algeria   36204 2020-08-11
## 4           0             Andorra     963 2020-08-11
## 5           0              Angola    1735 2020-08-11
## 6           0 Antigua and Barbuda      92 2020-08-11

練習題

select_statement <- "
SELECT `Country/Region` as country, `Case`, max(Case_Number) as Case_Number FROM covid19 WHERE Date = '2020-08-11' GROUP BY  `Country/Region`, `Case`
"

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df <- dbGetQuery(conn = con, statement = select_statement)
dbDisconnect(con)
#head(df)
#df
m <- df %>% 
  spread(key = `Case`, value = Case_Number)

con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
dbWriteTable(conn = con, name = 'country_cnt',value = m, overwrite=TRUE)
dbDisconnect(con)


con <- dbConnect(drv = RSQLite::SQLite(), dbname = 'demo.db')
df2 <- dbGetQuery(conn = con, statement = "select * from country_cnt;")
dbDisconnect(con)
head(df2)
##               country confirmed deaths recovered
## 1         Afghanistan     37269   1344     26415
## 2             Albania      6676    205      3480
## 3             Algeria     36204   1322     25263
## 4             Andorra       963     52       839
## 5              Angola      1735     80       575
## 6 Antigua and Barbuda        92      3        76

RPostgres

#install.packages('RPostgres’)


con <- dbConnect(drv = RPostgres::Postgres(), 
                 user = 'test', password = 'test', dbname = 'postgres',
                 host = 'localhost')

dbWriteTable(conn = con, name = 'covid19', value = covid19_tidy_df, overwrite=TRUE)
dbDisconnect(conn = con)

con <- dbConnect(drv = RPostgres::Postgres(), 
                 user = 'test', password = 'test', dbname = 'postgres',
                 host = 'localhost')
df <- dbGetQuery(con, "SELECT * FROM covid19 ;")
head(df)
##   Province/State      Country/Region       Lat      Long      Case       Date
## 1           <NA>         Afghanistan  33.93911  67.70995 confirmed 2020-01-22
## 2           <NA>             Albania  41.15330  20.16830 confirmed 2020-01-22
## 3           <NA>             Algeria  28.03390   1.65960 confirmed 2020-01-22
## 4           <NA>             Andorra  42.50630   1.52180 confirmed 2020-01-22
## 5           <NA>              Angola -11.20270  17.87390 confirmed 2020-01-22
## 6           <NA> Antigua and Barbuda  17.06080 -61.79640 confirmed 2020-01-22
##   Case_Number
## 1           0
## 2           0
## 3           0
## 4           0
## 5           0
## 6           0
dbDisconnect(conn= con)