Business case: Analyze flight data of two Airlines two five citities and compare perrformance of airlines.

  1. File creation and data transformation for data analysis
# Preparation of initial file

airline<-read.csv("C:/Users/Arindam/Documents/Data Science/Cuny/Data 607/Assignments/Airline.csv",header=TRUE)

airline
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska On Time         497     221       212           503    1841
## 2         Delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM West On Time         694    4840       383           320     201
## 5         Delayed         117     415        65           129      61
# File formating

airline<-airline[airline$X.1!="",]
airline$X[airline$X==""]<-NA
colnames(airline)[1]<-"Airline_Name"
colnames(airline)[2]<-"Arrival_status"
airline
##   Airline_Name Arrival_status Los.Angeles Phoenix San.Diego San.Francisco
## 1       Alaska        On Time         497     221       212           503
## 2         <NA>        Delayed          62      12        20           102
## 4      AM West        On Time         694    4840       383           320
## 5         <NA>        Delayed         117     415        65           129
##   Seattle
## 1    1841
## 2     305
## 4     201
## 5      61
# creating file for analysis 
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
airline<-airline %>% fill(Airline_Name)
airline
##   Airline_Name Arrival_status Los.Angeles Phoenix San.Diego San.Francisco
## 1       Alaska        On Time         497     221       212           503
## 2       Alaska        Delayed          62      12        20           102
## 4      AM West        On Time         694    4840       383           320
## 5      AM West        Delayed         117     415        65           129
##   Seattle
## 1    1841
## 2     305
## 4     201
## 5      61
  1. Summarize data to do analysis of data
# Analysis of Airline performance

cbind(airline[,(1:2)],(prop.table(as.matrix(select(airline, Airline_Name, Arrival_status,Los.Angeles,Phoenix,San.Diego,San.Francisco, Seattle)[,-(1:2)]),2)))
##   Airline_Name Arrival_status Los.Angeles     Phoenix  San.Diego
## 1       Alaska        On Time  0.36277372 0.040269679 0.31176471
## 2       Alaska        Delayed  0.04525547 0.002186589 0.02941176
## 4      AM West        On Time  0.50656934 0.881924198 0.56323529
## 5      AM West        Delayed  0.08540146 0.075619534 0.09558824
##   San.Francisco    Seattle
## 1    0.47722960 0.76453488
## 2    0.09677419 0.12666113
## 4    0.30360531 0.08347176
## 5    0.12239089 0.02533223
# Analysis-Los Angeles AM West has slightly higher number of flights but also had larger share of delayed flights
#Phoenix has mainly flights operated by AM West.
# for Sandiego , AM West has higher number of fligts delayed and more number of flights too

# In San Franscisco Alaska airline has performed better than AM West
# for Seattle Alaska Airline is the main operator 
  1. Create a file with proportions of data group by Airlines
# Comparative performance of airlines

airline_alaska<-subset(airline, Airline_Name=="Alaska")
airline_AM_West<-subset(airline, Airline_Name=="AM West")

airline_alaska<-cbind(airline_alaska[,c(1,2)],prop.table(as.matrix(airline_alaska[,-(1:2)]),2))

airline_AM_West<-cbind(airline_AM_West[,c(1,2)],prop.table(as.matrix(airline_AM_West[,-(1:2)]),2))

Airline_final<-rbind(airline_AM_West,airline_alaska)
  1. Prepare chart to present comparative data
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.2.3
frame<-melt(Airline_final,id.var=c("Airline_Name","Arrival_status"))

ggplot(frame, aes(x = variable, y = value)) + geom_bar(stat = "identity") +
  facet_wrap(Arrival_status ~Airline_Name)

# Analysis:
# Looking at the chart it appears for Los Angeles, Phoenix, San Diego both the airlines are performed almost similar
# for San Francisco and Seattle Alaska has performed better