Business case: Analyze flight data of two Airlines two five citities and compare perrformance of airlines.
# Preparation of initial file
airline<-read.csv("C:/Users/Arindam/Documents/Data Science/Cuny/Data 607/Assignments/Airline.csv",header=TRUE)
airline## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska On Time 497 221 212 503 1841
## 2 Delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM West On Time 694 4840 383 320 201
## 5 Delayed 117 415 65 129 61
# File formating
airline<-airline[airline$X.1!="",]
airline$X[airline$X==""]<-NA
colnames(airline)[1]<-"Airline_Name"
colnames(airline)[2]<-"Arrival_status"
airline## Airline_Name Arrival_status Los.Angeles Phoenix San.Diego San.Francisco
## 1 Alaska On Time 497 221 212 503
## 2 <NA> Delayed 62 12 20 102
## 4 AM West On Time 694 4840 383 320
## 5 <NA> Delayed 117 415 65 129
## Seattle
## 1 1841
## 2 305
## 4 201
## 5 61
# creating file for analysis
library(tidyr)## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)## Warning: package 'dplyr' was built under R version 3.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
airline<-airline %>% fill(Airline_Name)
airline## Airline_Name Arrival_status Los.Angeles Phoenix San.Diego San.Francisco
## 1 Alaska On Time 497 221 212 503
## 2 Alaska Delayed 62 12 20 102
## 4 AM West On Time 694 4840 383 320
## 5 AM West Delayed 117 415 65 129
## Seattle
## 1 1841
## 2 305
## 4 201
## 5 61
# Analysis of Airline performance
cbind(airline[,(1:2)],(prop.table(as.matrix(select(airline, Airline_Name, Arrival_status,Los.Angeles,Phoenix,San.Diego,San.Francisco, Seattle)[,-(1:2)]),2)))## Airline_Name Arrival_status Los.Angeles Phoenix San.Diego
## 1 Alaska On Time 0.36277372 0.040269679 0.31176471
## 2 Alaska Delayed 0.04525547 0.002186589 0.02941176
## 4 AM West On Time 0.50656934 0.881924198 0.56323529
## 5 AM West Delayed 0.08540146 0.075619534 0.09558824
## San.Francisco Seattle
## 1 0.47722960 0.76453488
## 2 0.09677419 0.12666113
## 4 0.30360531 0.08347176
## 5 0.12239089 0.02533223
# Analysis-Los Angeles AM West has slightly higher number of flights but also had larger share of delayed flights
#Phoenix has mainly flights operated by AM West.
# for Sandiego , AM West has higher number of fligts delayed and more number of flights too
# In San Franscisco Alaska airline has performed better than AM West
# for Seattle Alaska Airline is the main operator # Comparative performance of airlines
airline_alaska<-subset(airline, Airline_Name=="Alaska")
airline_AM_West<-subset(airline, Airline_Name=="AM West")
airline_alaska<-cbind(airline_alaska[,c(1,2)],prop.table(as.matrix(airline_alaska[,-(1:2)]),2))
airline_AM_West<-cbind(airline_AM_West[,c(1,2)],prop.table(as.matrix(airline_AM_West[,-(1:2)]),2))
Airline_final<-rbind(airline_AM_West,airline_alaska)library(ggplot2)## Warning: package 'ggplot2' was built under R version 3.2.3
library(reshape2)## Warning: package 'reshape2' was built under R version 3.2.3
frame<-melt(Airline_final,id.var=c("Airline_Name","Arrival_status"))
ggplot(frame, aes(x = variable, y = value)) + geom_bar(stat = "identity") +
facet_wrap(Arrival_status ~Airline_Name)# Analysis:
# Looking at the chart it appears for Los Angeles, Phoenix, San Diego both the airlines are performed almost similar
# for San Francisco and Seattle Alaska has performed better