The chart below describes arrival delays for two airlines across five destinations. Counts of status - ‘on time’ and ‘delayed’ is given in the table for each airlines. Key objective is to tidying and transforming this data. Acceptance criteria and solution approach is described in next sections.
Two approach are implemented in this assignment
Libraries
* dplyr
* tidyr
* ggplot
* aws.s3
* RMySQL
Capabilities
* AWS RDS MySQL
* AWS S3
* RPubs.com
* Gitbub
# Load required libraries
library(tidyverse)
library(ggplot2)
library(dplyr)
library (readr)
library(aws.s3)
library(RMySQL)airlines : To retain airlines code and name
cities : To retain city code and name
status : Airlines flight status - on time, delayed are maintained in this table
airlines_status : This table is used maintain status count and relationship with other tables
clearpage
Using AWS S3 to save .CSV and .SQL files to initialize the database and use in the future
get_bucket(bucket = "msds-data607")## Bucket: msds-data607
##
## $Contents
## Key: airlines.csv
## LastModified: 2021-03-08T01:41:06.000Z
## ETag: "e5487f7cf62df09beee73036d19755d9"
## Size (B): 68
## Owner: eb3c34b660dd281cf73e5ddf8f1e26dc2872513ec7496aa57a7439ed44770b10
## Storage class: STANDARD
##
## $Contents
## Key: airlines_status.csv
## LastModified: 2021-03-08T01:41:07.000Z
## ETag: "5af483fab0d94327be7601a8fce85d3b"
## Size (B): 313
## Owner: eb3c34b660dd281cf73e5ddf8f1e26dc2872513ec7496aa57a7439ed44770b10
## Storage class: STANDARD
##
## $Contents
## Key: airlines_status_cities.csv
## LastModified: 2021-03-08T01:41:06.000Z
## ETag: "0dc676b7194fa1b2734ec35293c1bb2e"
## Size (B): 210
## Owner: eb3c34b660dd281cf73e5ddf8f1e26dc2872513ec7496aa57a7439ed44770b10
## Storage class: STANDARD
##
## $Contents
## Key: cities.csv
## LastModified: 2021-03-08T01:41:08.000Z
## ETag: "c06f72e1a8137ca11237f8e90f738090"
## Size (B): 109
## Owner: eb3c34b660dd281cf73e5ddf8f1e26dc2872513ec7496aa57a7439ed44770b10
## Storage class: STANDARD
##
## $Contents
## Key: loadflights-denormalized.sql
## LastModified: 2021-03-08T01:41:07.000Z
## ETag: "3a70ecb79a7e49ce2d7a8a7bfbd65129"
## Size (B): 567
## Owner: eb3c34b660dd281cf73e5ddf8f1e26dc2872513ec7496aa57a7439ed44770b10
## Storage class: STANDARD
##
## $Contents
## Key: loadflights-normalized.sql
## LastModified: 2021-03-08T01:41:07.000Z
## ETag: "f1e2b935d5c0e00f4602da6f9949aeb3"
## Size (B): 1810
## Owner: eb3c34b660dd281cf73e5ddf8f1e26dc2872513ec7496aa57a7439ed44770b10
## Storage class: STANDARD
##
## $Contents
## Key: loadflights.sql
## LastModified: 2021-03-08T02:15:46.000Z
## ETag: "f0f3a1ea9f34314d7eb8143208ecf76a"
## Size (B): 60
## Owner: eb3c34b660dd281cf73e5ddf8f1e26dc2872513ec7496aa57a7439ed44770b10
## Storage class: STANDARD
##
## $Contents
## Key: status.csv
## LastModified: 2021-03-08T01:41:07.000Z
## ETag: "53c54260ec1b74833a092ee51feed2a7"
## Size (B): 38
## Owner: eb3c34b660dd281cf73e5ddf8f1e26dc2872513ec7496aa57a7439ed44770b10
## Storage class: STANDARD
# Display contentsloadflights.sql
DROP DATABASE IF EXISTS flights;
CREATE DATABASE `flights`
USE flights;
DROP TABLE IF EXISTS airlines_status;
DROP TABLE IF EXISTS airlines_status_cities;
DROP TABLE IF EXISTS airlines;
DROP TABLE IF EXISTS cities;
DROP TABLE IF EXISTS status;
CREATE TABLE `airlines_status_cities` (
`airlines` varchar(25) NOT NULL,
`status` varchar(45) NOT NULL,
`los_angeles` int NOT NULL,
`phoenix` int NOT NULL,
`san_diego` int NOT NULL,
`san_francisco` int NOT NULL,
`seattle` int NOT NULL
);
LOAD DATA INFILE '<<Location>>\\airlines_status_cities.csv'
INTO TABLE airlines_status_cities
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
CREATE TABLE `airlines` (
`id_airlines` int NOT NULL,
`code_airlines` varchar(45) NOT NULL,
`name_airlines` varchar(30) NOT NULL,
PRIMARY KEY (`id_airlines`)
);
LOAD DATA INFILE '<<Location>>\\airlines.csv'
INTO TABLE airlines
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
CREATE TABLE `cities` (
`id_city` int NOT NULL,
`code_city` varchar(45) NOT NULL,
`name_city` varchar(45) NOT NULL,
PRIMARY KEY (`id_city`)
);
LOAD DATA INFILE '<<Location>>\\cities.csv'
INTO TABLE cities
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
CREATE TABLE `status` (
`id_status` int NOT NULL,
`status` varchar(45) NOT NULL,
PRIMARY KEY (`id_status`)
) ;
LOAD DATA INFILE '<<Location>>\\status.csv'
INTO TABLE status
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
CREATE TABLE `airlines_status` (
`id` int NOT NULL,
`id_airlines` int NOT NULL,
`id_city` int NOT NULL,
`id_status` int NOT NULL,
`count_status` int NOT NULL,
PRIMARY KEY (`id`),
KEY `FK_ID_AIRLINES_idx` (`id_airlines`),
KEY `FK_ID_CITY_idx` (`id_city`),
KEY `FK_ID_STATUS_idx` (`id_status`),
CONSTRAINT `FK_ID_AIRLINES` FOREIGN KEY (`id_airlines`) REFERENCES `airlines` (`id_airlines`),
CONSTRAINT `FK_ID_CITY` FOREIGN KEY (`id_city`) REFERENCES `cities` (`id_city`),
CONSTRAINT `FK_ID_STATUS` FOREIGN KEY (`id_status`) REFERENCES `status` (`id_status`)
);
LOAD DATA INFILE '<<Location>>\\airlines_status.csv'
INTO TABLE airlines_status
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
Load data into different normalized tables
airlines.csv
id_airlines,code_airlines,name_airlines
1,ALS,Alaska
2,AMW,AM West
cities.csv
id_city,code_city,name_city
1,LA,Los Angeles
2,PN,Phoenix
3,SD,San Diego
4,SF,San Francisco
5,ST,Seattle
status.csv
id_status,status
1,on time
2,delayed
airlines_status.csv
id,id_airlines,id_city,id_status,count_status
1,1,1,1,497
2,1,2,1,221
3,1,3,1,212
4,1,4,1,503
5,1,5,1,1841
6,1,1,2,62
7,1,2,2,12
8,1,3,2,20
9,1,4,2,102
10,1,5,2,305
11,2,1,1,694
12,2,2,1,4840
13,2,3,1,383
14,2,4,1,320
15,2,5,1,201
16,2,1,2,117
17,2,2,2,415
18,2,3,2,65
19,2,4,2,129
20,2,5,2,61
# Function to return data from a table
user <- rstudioapi::askForPassword("Database username")
password <- rstudioapi::askForPassword("Database password")
connection = dbConnect(MySQL(), user = user, password = password, dbname = 'flights', host = 'msds.cbs1lxtno2zh.us-east-2.rds.amazonaws.com')get_data<- function(table_name) {
dataset <- tbl(connection,dbplyr::in_schema("flights", table_name))
return(dataset)
}# Using dplyr to source data into R
airlines_data<-as.data.frame(get_data('airlines'))
status_data<-as.data.frame(get_data('status'))
cities_data<-as.data.frame(get_data('cities'))
airlines_status_data<-as.data.frame(get_data('airlines_status'))
# Join these data sets together
airlines_status_data <-(left_join(airlines_status_data,status_data,by = "id_status") %>%
left_join(cities_data,by = "id_city"))%>%
left_join(airlines_data,by = "id_airlines")airlines_status_data<-select(airlines_status_data,code_airlines,
name_airlines,status,code_city,name_city,count_status)
# This dataset is ready to tidy
print(airlines_status_data)## code_airlines name_airlines status code_city name_city count_status
## 1 ALS Alaska on time LA Los Angeles 497
## 2 ALS Alaska on time PN Phoenix 221
## 3 ALS Alaska on time SD San Diego 212
## 4 ALS Alaska on time SF San Francisco 503
## 5 ALS Alaska on time ST Seattle 1841
## 6 ALS Alaska delayed LA Los Angeles 62
## 7 ALS Alaska delayed PN Phoenix 12
## 8 ALS Alaska delayed SD San Diego 20
## 9 ALS Alaska delayed SF San Francisco 102
## 10 ALS Alaska delayed ST Seattle 305
## 11 AMW AM West on time LA Los Angeles 694
## 12 AMW AM West on time PN Phoenix 4840
## 13 AMW AM West on time SD San Diego 383
## 14 AMW AM West on time SF San Francisco 320
## 15 AMW AM West on time ST Seattle 201
## 16 AMW AM West delayed LA Los Angeles 117
## 17 AMW AM West delayed PN Phoenix 415
## 18 AMW AM West delayed SD San Diego 65
## 19 AMW AM West delayed SF San Francisco 129
## 20 AMW AM West delayed ST Seattle 61
Lets spread and transform data into required data format
# Lets spread to split status column into 'on time' and 'delayed' columns
airlines_status_data<-airlines_status_data %>%
spread(status,count_status)
names(airlines_status_data)[names(airlines_status_data) == "on time"] <- "on_time"
print(airlines_status_data)## code_airlines name_airlines code_city name_city delayed on_time
## 1 ALS Alaska LA Los Angeles 62 497
## 2 ALS Alaska PN Phoenix 12 221
## 3 ALS Alaska SD San Diego 20 212
## 4 ALS Alaska SF San Francisco 102 503
## 5 ALS Alaska ST Seattle 305 1841
## 6 AMW AM West LA Los Angeles 117 694
## 7 AMW AM West PN Phoenix 415 4840
## 8 AMW AM West SD San Diego 65 383
## 9 AMW AM West SF San Francisco 129 320
## 10 AMW AM West ST Seattle 61 201
airlines_status_cities : This table is used maintain tabular structure of origional datas
Create denormalized table
-- loadflights-denormalized.sql
DROP DATABASE IF EXISTS flights;
CREATE DATABASE `flights`
USE flights;
DROP TABLE IF EXISTS airlines_status_cities;
CREATE TABLE `airlines_status_cities` (
`airlines` varchar(25) NOT NULL,
`status` varchar(45) NOT NULL,
`los_angeles` int NOT NULL,
`phoenix` int NOT NULL,
`san_diego` int NOT NULL,
`san_francisco` int NOT NULL,
`seattle` int NOT NULL
);
LOAD DATA INFILE 'airlines_status_cities.csv'
INTO TABLE airlines_status_cities
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
Load data into denomalized table
airlines,status,los_angeles,phoenix,san_diego,san_francisco,seattle
Alaska,on time,497,221,212,503,1841
Alaska,delayed,62,12,20,102,305
AM West,on time,694,4840,383,320,201
AM West,delayed,117,415,65,129,61
# Using dplyr to source data into R
airlines_status_cities<-as.data.frame(get_data('airlines_status_cities'))
print(airlines_status_cities)## airlines status los_angeles phoenix san_diego san_francisco seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
airlines_status_cities<-(airlines_status_cities %>%
gather('los_angeles', 'phoenix', 'san_diego', 'san_francisco', 'seattle', key = "cities", value = "count_status"))
print(airlines_status_cities)## airlines status cities count_status
## 1 Alaska on time los_angeles 497
## 2 Alaska delayed los_angeles 62
## 3 AM West on time los_angeles 694
## 4 AM West delayed los_angeles 117
## 5 Alaska on time phoenix 221
## 6 Alaska delayed phoenix 12
## 7 AM West on time phoenix 4840
## 8 AM West delayed phoenix 415
## 9 Alaska on time san_diego 212
## 10 Alaska delayed san_diego 20
## 11 AM West on time san_diego 383
## 12 AM West delayed san_diego 65
## 13 Alaska on time san_francisco 503
## 14 Alaska delayed san_francisco 102
## 15 AM West on time san_francisco 320
## 16 AM West delayed san_francisco 129
## 17 Alaska on time seattle 1841
## 18 Alaska delayed seattle 305
## 19 AM West on time seattle 201
## 20 AM West delayed seattle 61
airlines_status_cities<-airlines_status_cities %>%
spread(status,count_status)
names(airlines_status_cities)[names(airlines_status_cities) == "on time"] <- "on_time"
print(airlines_status_cities)## airlines cities delayed on_time
## 1 Alaska los_angeles 62 497
## 2 Alaska phoenix 12 221
## 3 Alaska san_diego 20 212
## 4 Alaska san_francisco 102 503
## 5 Alaska seattle 305 1841
## 6 AM West los_angeles 117 694
## 7 AM West phoenix 415 4840
## 8 AM West san_diego 65 383
## 9 AM West san_francisco 129 320
## 10 AM West seattle 61 201
# This dataset is ready for data analysis
ggplot(data=airlines_status_data, aes(x=name_airlines, y =delayed, fill= name_airlines))+
geom_bar(stat="identity")+facet_wrap(~name_city)+
labs(title="Delayed flights by city", x= "Airlines", y = "Delay Count")+
theme(axis.text.x = element_blank(),plot.title = element_text(hjust=1),legend.position = "right")ggplot(data=airlines_status_data, aes(x=name_airlines, y =on_time, fill= name_airlines))+
geom_bar(stat="identity")+facet_wrap(~name_city)+
labs(title="On time flights by city", x= "Airlines", y = "On time Count")+
theme(axis.text.x = element_blank(),plot.title = element_text(hjust=1),legend.position = "right")Lets make more use of tidyr and dplyr
# Lets mutate to add 3 ore columns - total, percent delay, percent on time
airlines_status_data<-as.data.frame(airlines_status_data %>%
group_by(name_airlines, name_city) %>%
mutate(total = sum(on_time, delayed), precent_on_time = on_time / total, precent_delay = delayed / total))
airlines_status_data## code_airlines name_airlines code_city name_city delayed on_time total
## 1 ALS Alaska LA Los Angeles 62 497 559
## 2 ALS Alaska PN Phoenix 12 221 233
## 3 ALS Alaska SD San Diego 20 212 232
## 4 ALS Alaska SF San Francisco 102 503 605
## 5 ALS Alaska ST Seattle 305 1841 2146
## 6 AMW AM West LA Los Angeles 117 694 811
## 7 AMW AM West PN Phoenix 415 4840 5255
## 8 AMW AM West SD San Diego 65 383 448
## 9 AMW AM West SF San Francisco 129 320 449
## 10 AMW AM West ST Seattle 61 201 262
## precent_on_time precent_delay
## 1 0.8890877 0.11091234
## 2 0.9484979 0.05150215
## 3 0.9137931 0.08620690
## 4 0.8314050 0.16859504
## 5 0.8578751 0.14212488
## 6 0.8557337 0.14426634
## 7 0.9210276 0.07897241
## 8 0.8549107 0.14508929
## 9 0.7126949 0.28730512
## 10 0.7671756 0.23282443
ggplot(data = airlines_status_data,
aes(x = name_city, y = precent_delay*100, color=name_airlines,
shape=name_city)) +
geom_point()+xlab("Cities")+ylab("% Delay")+
ggtitle("Plot - % Delay for each city")ggplot(data = airlines_status_data,
aes(x = name_city, y = precent_on_time*100, color=name_airlines,
shape=name_city)) +
geom_point()+xlab("Cities")+ylab("% On time")+
ggtitle("Plot - % On time for each city")Following functions were used from dplyr and tidyr for tidying and transforming data
- tbl, in_schema
- gathering, spread
- joins, mutate,select, filter, group by