library(tidyverse)
library(RMySQL)
library(ggplot2)
host<- 'cunydata607sql.mysql.database.azure.com'
user<-Sys.getenv('user')
pw<- Sys.getenv('pw')
conn<- dbConnect(MySQL(),
user=user,
password=pw,
host=host,
dbname=user)
df<- dbGetQuery(conn, 'select * from delays')
print(df)
## Index Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 1 Alaska On Time 497 221 212 503 1841
## 2 2 Alaska Delayed 62 12 20 102 305
## 3 3 AM West On Time 694 4840 383 320 201
## 4 4 AM West Delayed 117 415 65 129 61
dropping the index column
df<- df %>%
subset(select=c(2:8))
df
pivotlonger
df1<-pivot_longer(
df,
cols=c('Los Angeles', 'Phoenix', 'San Diego', 'San Francisco', 'Seattle'),
names_to='Destination',
values_to='Number of flights')
df1
ggplot(df1, aes(x=Destination, y= `Number of flights`, fill=Status))+
geom_bar(stat='identity', position='dodge') +
facet_wrap(~Airline) +
theme(
axis.text = element_text(size = 6)
)
df2<- df1 %>%
pivot_wider(
names_from = Status,
values_from =`Number of flights`
)
df2<- df2 %>%
rename('Number of On time flights'='On Time', 'Number of Delayed flights' = 'Delayed')
print(df2)
## # A tibble: 10 × 4
## Airline Destination `Number of On time flights` `Number of Delayed flights`
## <chr> <chr> <int> <int>
## 1 Alaska Los Angeles 497 62
## 2 Alaska Phoenix 221 12
## 3 Alaska San Diego 212 20
## 4 Alaska San Francisco 503 102
## 5 Alaska Seattle 1841 305
## 6 AM West Los Angeles 694 117
## 7 AM West Phoenix 4840 415
## 8 AM West San Diego 383 65
## 9 AM West San Francisco 320 129
## 10 AM West Seattle 201 61
#Adding a column of percentage of delay
df2<- df2 %>%
mutate(`Probability of a delay`=
floor(100*`Number of Delayed flights`/(`Number of On time flights`+`Number of Delayed flights`)))
df2
ggplot(df2, aes(x=Destination, y=`Probability of a delay`, fill = Airline)) +
geom_bar(stat='identity', position = 'dodge') +
scale_fill_manual(values=c('Alaska'='blue', `AM West`='red'))