I am using DBI library for database access and tidyverse library for data manipulation.
library(DBI)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
I will create ODBC connection object
con <- dbConnect(odbc::odbc(),
.connection_string = "Driver={SQL Server};Server=(local);Database=tb;Trusted_Connection=Yes;")
I will create three tables with foreign key references and primary keys. I will use a “wide” structure for tables.
rs = dbSendStatement(con, "DROP TABLE IF EXISTS dbo.AirCurrent;
DROP TABLE IF EXISTS dbo.Airline;
DROP TABLE IF EXISTS dbo.AirStatus;
CREATE TABLE dbo.Airline (
AirlineID INT NOT NULL PRIMARY KEY,
Airline VARCHAR(30) NOT NULL );
CREATE TABLE dbo.AirStatus (
AirStatusID INT NOT NULL PRIMARY KEY,
AirStatus VARCHAR(30) NOT NULL );
CREATE TABLE dbo.AirCurrent (
AirlineID INT NOT NULL FOREIGN KEY REFERENCES Airline(AirlineID),
AirStatusID INT NOT NULL FOREIGN KEY REFERENCES AirStatus(AirStatusID),
[Los_Angeles] INT NOT NULL,
Phoenix INT NOT NULL,
[San Diego] INT NOT NULL,
[San Francisco] INT NOT NULL,
Seattle INT NOT NULL,
CONSTRAINT PK_AirCurrent PRIMARY KEY (AirlineID,AirStatusID));")
dbClearResult(rs)
Because number of records is small I will use INSERT statement instead of bulk load from csv files.
rs = dbSendStatement(con, "INSERT INTO dbo.Airline (AirlineID, Airline) VALUES
(1, 'ALASKA'),
(2, 'AM WEST');
INSERT INTO dbo.AirStatus (AirStatusID, AirStatus) VALUES
(1, 'on time'),
(2, 'delayed');
INSERT INTO dbo.AirCurrent(AirlineID,AirStatusID,Los_Angeles,Phoenix,[San Diego],[San Francisco],Seattle)
VALUES
(1, 1, 497,221,212,503,1841),
(1,2,62,12,20,102,305),
(2,1,694,4840,383,320,201),
(2,2,117,416,65,129,61);")
dbClearResult(rs)
I will create a SELECT statement which will join three tables.
rs <- dbSendQuery(con, "SELECT Airline.Airline, AirStatus.AirStatus, Los_Angeles,Phoenix,[San Diego],[San Francisco],Seattle
FROM dbo.AirCurrent AirCurrent
JOIN dbo.Airline Airline ON AirCurrent.AirlineID = Airline.AirlineID
JOIN dbo.AirStatus AirStatus ON AirCurrent.AirStatusID = AirStatus.AirStatusID;")
df = data.frame(dbFetch(rs))
dbClearResult(rs)
head(df)
## Airline AirStatus Los_Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 416 65 129 61
tbl = tibble(df)
tbl
## # A tibble: 4 x 7
## Airline AirStatus Los_Angeles Phoenix San.Diego San.Francisco Seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 416 65 129 61
I will use pivot_wider to pivoting original tibble to wide format
tbl2 = tbl %>% pivot_wider(names_from = AirStatus, values_from = c(Los_Angeles,Phoenix,San.Diego,San.Francisco,Seattle))
tbl3 =mutate(tbl2, arrival_delays = Los_Angeles_delayed+Phoenix_delayed+San.Diego_delayed+San.Francisco_delayed+Seattle_delayed)
select(tbl3,Airline, arrival_delays)
## # A tibble: 2 x 2
## Airline arrival_delays
## <chr> <int>
## 1 ALASKA 501
## 2 AM WEST 788