# Good Practise: Basic house keeping: cleanup the env before you start new work
rm(list=ls())
# Libraries
library(tidyverse)## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(RMySQL)## Loading required package: DBI
library(ggplot2)
library(knitr)/*
tidydata tables
DROP TABLE IF EXISTS tidydata;
# Create new table
CREATE TABLE tidydata
(
airline varchar(20) ,
status varchar(20) NOT NULL,
LosAngeles integer NOT NULL,
Phoenix integer NOT NULL,
SanDiego integer NOT NULL,
SanFrancisco integer NOT NULL,
Seattle integer NOT NULL
);
insert into tidydata values ("Alaska", "on time", 497, 221, 212, 503, 1841);
insert into tidydata values ("Alaska", "delayed", 62, 12, 20, 102, 305);
insert into tidydata values ("AM West", "on time", 694, 4840, 383, 320, 201);
insert into tidydata values ("AM West", "delayed", 117, 415, 65, 129, 61) ;
*/
##
# Create a database connection
##
mydb = dbConnect(MySQL(), user='root', password='mysql57', host='localhost', dbname="tb")##
# Create a Database connection
##
untidywidedata <- fetch(dbSendQuery(mydb, "select * from tidydata;"))
head(untidywidedata)## airline status LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
##
# Use gather() to gather all the city names
##
tidylongdata <- gather(untidywidedata, city, flights, 3:7)
head(tidylongdata)## airline status city flights
## 1 Alaska on time LosAngeles 497
## 2 Alaska delayed LosAngeles 62
## 3 AM West on time LosAngeles 694
## 4 AM West delayed LosAngeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
##
# Use spread() to spread the data based on flight status
##
tidydata <- spread(tidylongdata, status, flights)
head(tidydata)## airline city delayed on time
## 1 Alaska LosAngeles 62 497
## 2 Alaska Phoenix 12 221
## 3 Alaska SanDiego 20 212
## 4 Alaska SanFrancisco 102 503
## 5 Alaska Seattle 305 1841
## 6 AM West LosAngeles 117 694
tidydata## airline city delayed on time
## 1 Alaska LosAngeles 62 497
## 2 Alaska Phoenix 12 221
## 3 Alaska SanDiego 20 212
## 4 Alaska SanFrancisco 102 503
## 5 Alaska Seattle 305 1841
## 6 AM West LosAngeles 117 694
## 7 AM West Phoenix 415 4840
## 8 AM West SanDiego 65 383
## 9 AM West SanFrancisco 129 320
## 10 AM West Seattle 61 201
##
# Group by airline and summarize and then mutate
##
airline_flights <- group_by(tidydata, airline) %>%
summarise(total_delay=sum(delayed), total_ontime=sum(`on time`), total=sum(delayed+`on time`))
head(airline_flights)## # A tibble: 2 x 4
## airline total_delay total_ontime total
## <chr> <int> <int> <int>
## 1 Alaska 501 3274 3775
## 2 AM West 787 6438 7225
summary_by_airline <- group_by(tidydata, airline) %>%
summarise(avg_delay = mean(delayed), avg_on_time = mean(`on time`)) %>%
mutate(percent_delay=100*(avg_delay/(avg_delay+avg_on_time)))
summary_by_airline <- inner_join(summary_by_airline, airline_flights, by="airline")
kable(summary_by_airline)| airline | avg_delay | avg_on_time | percent_delay | total_delay | total_ontime | total |
|---|---|---|---|---|---|---|
| Alaska | 100.2 | 654.8 | 13.27152 | 501 | 3274 | 3775 |
| AM West | 157.4 | 1287.6 | 10.89273 | 787 | 6438 | 7225 |
##
# Group by city and summarize and then mutate
##
summary_by_city <- group_by(tidydata, city) %>%
summarise(avg_delay = mean(delayed), avg_on_time = mean(`on time`)) %>%
mutate(percent_delay=100*(avg_delay/(avg_delay+avg_on_time)))
kable(summary_by_city)| city | avg_delay | avg_on_time | percent_delay |
|---|---|---|---|
| LosAngeles | 89.5 | 595.5 | 13.065693 |
| Phoenix | 213.5 | 2530.5 | 7.780612 |
| SanDiego | 42.5 | 297.5 | 12.500000 |
| SanFrancisco | 115.5 | 411.5 | 21.916508 |
| Seattle | 183.0 | 1021.0 | 15.199336 |
##
# Total Delays by Airline
##
ggplot(data=tidydata, aes(airline, delayed))+
geom_boxplot()+
theme_bw()+
labs(title="Total Delays by Airline", x="Airline Name", y="Total Delays")+
theme(plot.title = element_text(hjust = 0.5)) ##### This graph is misleading as it makes one believe that AM West had higher delays. While AM West did have higher number of delays but since they had greater number of flights, the percent of delays AM West was lower than Alaska airlines.
ggplot(data=summary_by_city, aes(city, percent_delay, color=city))+
geom_point()+
theme_bw()+
labs(title="Percent Delays by City", x="City", y="Percent Delay")+
theme(plot.title = element_text(hjust = 0.5))Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.