library(DBI) 
library(odbc)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(scales)
library(ggpubr)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Create a .csv file.

#create the vector #create dataframe #write dataframe in .csv file #read the .csv file

data<- data.frame( “airline_name” = c(“Alaska”,““,”“,”AMWEST”,““),”status” = c(“on time”,“delayed”,““,”on time”,“delayed”), “Los.Abgeles” = c(“497”,“62”,“0”,“694”,“117”), “Phoenix” = c(“221”,“12”,“0”,“4840”,“415”), “San.Diego” = c(“212”,“20”,“0”,“383”,“65”), “San.Francisco” = c(“503”,“102”,“0”,“320”,“129”), “Seattle” = c(“1841”,“305”,“0”,“201”,“61”) ) write.csv(data, “data.csv”)

airline_name = c("Alaska","","","AMWEST","")
status <- c("on time","delayed","","on time","delayed")
Los.Abgeles <-c("497","62","0","694","117")
Phoenix <- c("221","12","0","4840","415")
San.Diego <-c("212","20","0","383","65")
San.Francisco <- c("503","102","0","320","129")
Seattle<-c("1841","305","0","201","61")

#create dataframe

data <- data.frame(airline_name,status,Los.Abgeles,Phoenix,San.Diego,San.Francisco,Seattle)

# create .csv file
write.csv(data, "data.csv")

#read .csv file

data <- read.csv("data.csv")

print(data)
##   X airline_name  status Los.Abgeles Phoenix San.Diego San.Francisco Seattle
## 1 1       Alaska on time         497     221       212           503    1841
## 2 2              delayed          62      12        20           102     305
## 3 3                                0       0         0             0       0
## 4 4       AMWEST on time         694    4840       383           320     201
## 5 5              delayed         117     415        65           129      61

connect to Azure database

# Creating connection


mysqldbTBConn = dbConnect(RMySQL::MySQL(),
                            dbname= 'md.asaduzzaman39',
                            host= 'cunydata607sql.mysql.database.azure.com',
                            port=3306,
                            user='md.asaduzzaman39',
                            password='c1706f410226ffca')
dbListTables(mysqldbTBConn)
## [1] "airline_data" "color"        "manufacturer" "model"        "product"

read table’s data from Azure database(table called airline_data)

result = dbSendQuery(mysqldbTBConn, "select * from airline_data")
data <- fetch(result)

print(data)
##   airline_name  status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1       ALASKA on time         497     221       212           503    1841
## 2              delayed          62      12        20           102     305
## 3                                0       0         0             0       0
## 4      AM WEST on time         694    4840       383           320     201
## 5              delayed         117     415        65           129      61

Analysis

#used na.omit function to remove values # filled airline_name used previous values. # used gather function gather key pair values #used mutate function rename variable # used spread function spread the data #Calculate the deay rate.

data <- na.omit(data) %>%
  mutate(airline_name = na_if(airline_name,'')) %>%
 # fill(airline_name, .direction = 'up')
  fill(airline_name, .direction = 'down') %>%
  gather('Location','flight', 3:7) %>%
   mutate(status = str_replace(status, 'on time','On_time'),
          status = str_replace(status, 'delayed','Delayed')) %>%
   spread(status, flight)%>%
  mutate(flight = Delayed+On_time, delay_rate= Delayed/ (Delayed+On_time))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tidyr package.
##   Please report the issue at <https://github.com/tidyverse/tidyr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
 data 
##    airline_name      Location V1 Delayed On_time flight delay_rate
## 1        ALASKA   Los.Angeles  0      62     497    559 0.11091234
## 2        ALASKA       Phoenix  0      12     221    233 0.05150215
## 3        ALASKA     San.Diego  0      20     212    232 0.08620690
## 4        ALASKA San.Francisco  0     102     503    605 0.16859504
## 5        ALASKA       Seattle  0     305    1841   2146 0.14212488
## 6       AM WEST   Los.Angeles NA     117     694    811 0.14426634
## 7       AM WEST       Phoenix NA     415    4840   5255 0.07897241
## 8       AM WEST     San.Diego NA      65     383    448 0.14508929
## 9       AM WEST San.Francisco NA     129     320    449 0.28730512
## 10      AM WEST       Seattle NA      61     201    262 0.23282443
  1. Perform analysis to compare the arrival delays for the two airlines.
Delays_data <- data %>%
  select(airline_name, flight, Delayed, On_time) %>%
  group_by(airline_name) %>%
  summarize(Delayed = sum(Delayed), Flight = sum(flight), Delay.Rate = sum(Delayed)/sum(flight))
Delays_data
## # A tibble: 2 × 4
##   airline_name Delayed Flight Delay.Rate
##   <chr>          <int>  <int>      <dbl>
## 1 ALASKA           501   3775      0.133
## 2 AM WEST          787   7225      0.109

plot two graphs to see the total flight by airline and delay rate by airline

plot1 <- ggplot(data = Delays_data, aes(x=airline_name, y=Flight, fill=airline_name))+
  geom_bar(stat='identity')+
  ggtitle('Total Flights by Air Line')+
  geom_text(aes(label=Flight),vjust=-0.5, hjust=0.5)

plot2 <- ggplot(data = Delays_data, aes(x=airline_name, y=Delayed, fill=airline_name))+
  geom_bar(stat='identity')+
  ggtitle('Delay Rate by Air Line')+
   geom_text(aes(label=percent(Delay.Rate)),vjust=-0.5, hjust=0.5)
   

ggarrange(plot1,plot2)

Conclusions: Overall Alaska Airline had 3775 flights, and delayed rate is 13.3% and AM West Airline had 7225 flights,and delayed rate is 10.9% but We compare the flight’s delay rate of the airline by destinations(cities). Alaska airline flights have better performence than AM West. So it is better to fly with Alaska airline’s flight.