library(DBI)
library(odbc)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(scales)
library(ggpubr)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Note that the echo = FALSE
parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
#create the vector #create dataframe #write dataframe in .csv file #read the .csv file
data<- data.frame( “airline_name” = c(“Alaska”,““,”“,”AMWEST”,““),”status” = c(“on time”,“delayed”,““,”on time”,“delayed”), “Los.Abgeles” = c(“497”,“62”,“0”,“694”,“117”), “Phoenix” = c(“221”,“12”,“0”,“4840”,“415”), “San.Diego” = c(“212”,“20”,“0”,“383”,“65”), “San.Francisco” = c(“503”,“102”,“0”,“320”,“129”), “Seattle” = c(“1841”,“305”,“0”,“201”,“61”) ) write.csv(data, “data.csv”)
airline_name = c("Alaska","","","AMWEST","")
status <- c("on time","delayed","","on time","delayed")
Los.Abgeles <-c("497","62","0","694","117")
Phoenix <- c("221","12","0","4840","415")
San.Diego <-c("212","20","0","383","65")
San.Francisco <- c("503","102","0","320","129")
Seattle<-c("1841","305","0","201","61")
#create dataframe
data <- data.frame(airline_name,status,Los.Abgeles,Phoenix,San.Diego,San.Francisco,Seattle)
# create .csv file
write.csv(data, "data.csv")
#read .csv file
data <- read.csv("data.csv")
print(data)
## X airline_name status Los.Abgeles Phoenix San.Diego San.Francisco Seattle
## 1 1 Alaska on time 497 221 212 503 1841
## 2 2 delayed 62 12 20 102 305
## 3 3 0 0 0 0 0
## 4 4 AMWEST on time 694 4840 383 320 201
## 5 5 delayed 117 415 65 129 61
# Creating connection
mysqldbTBConn = dbConnect(RMySQL::MySQL(),
dbname= 'md.asaduzzaman39',
host= 'cunydata607sql.mysql.database.azure.com',
port=3306,
user='md.asaduzzaman39',
password='c1706f410226ffca')
dbListTables(mysqldbTBConn)
## [1] "airline_data" "color" "manufacturer" "model" "product"
result = dbSendQuery(mysqldbTBConn, "select * from airline_data")
data <- fetch(result)
print(data)
## airline_name status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 0 0 0 0 0
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
#used na.omit function to remove values # filled airline_name used previous values. # used gather function gather key pair values #used mutate function rename variable # used spread function spread the data #Calculate the deay rate.
data <- na.omit(data) %>%
mutate(airline_name = na_if(airline_name,'')) %>%
# fill(airline_name, .direction = 'up')
fill(airline_name, .direction = 'down') %>%
gather('Location','flight', 3:7) %>%
mutate(status = str_replace(status, 'on time','On_time'),
status = str_replace(status, 'delayed','Delayed')) %>%
spread(status, flight)%>%
mutate(flight = Delayed+On_time, delay_rate= Delayed/ (Delayed+On_time))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tidyr package.
## Please report the issue at <https://github.com/tidyverse/tidyr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
data
## airline_name Location V1 Delayed On_time flight delay_rate
## 1 ALASKA Los.Angeles 0 62 497 559 0.11091234
## 2 ALASKA Phoenix 0 12 221 233 0.05150215
## 3 ALASKA San.Diego 0 20 212 232 0.08620690
## 4 ALASKA San.Francisco 0 102 503 605 0.16859504
## 5 ALASKA Seattle 0 305 1841 2146 0.14212488
## 6 AM WEST Los.Angeles NA 117 694 811 0.14426634
## 7 AM WEST Phoenix NA 415 4840 5255 0.07897241
## 8 AM WEST San.Diego NA 65 383 448 0.14508929
## 9 AM WEST San.Francisco NA 129 320 449 0.28730512
## 10 AM WEST Seattle NA 61 201 262 0.23282443
Delays_data <- data %>%
select(airline_name, flight, Delayed, On_time) %>%
group_by(airline_name) %>%
summarize(Delayed = sum(Delayed), Flight = sum(flight), Delay.Rate = sum(Delayed)/sum(flight))
Delays_data
## # A tibble: 2 × 4
## airline_name Delayed Flight Delay.Rate
## <chr> <int> <int> <dbl>
## 1 ALASKA 501 3775 0.133
## 2 AM WEST 787 7225 0.109
plot1 <- ggplot(data = Delays_data, aes(x=airline_name, y=Flight, fill=airline_name))+
geom_bar(stat='identity')+
ggtitle('Total Flights by Air Line')+
geom_text(aes(label=Flight),vjust=-0.5, hjust=0.5)
plot2 <- ggplot(data = Delays_data, aes(x=airline_name, y=Delayed, fill=airline_name))+
geom_bar(stat='identity')+
ggtitle('Delay Rate by Air Line')+
geom_text(aes(label=percent(Delay.Rate)),vjust=-0.5, hjust=0.5)
ggarrange(plot1,plot2)
Conclusions: Overall Alaska Airline had 3775 flights, and delayed rate
is 13.3% and AM West Airline had 7225 flights,and delayed rate is 10.9%
but We compare the flight’s delay rate of the airline by
destinations(cities). Alaska airline flights have better performence
than AM West. So it is better to fly with Alaska airline’s flight.