library(psych)
library(lattice)
library(corrplot)
## corrplot 0.84 loaded
library(corrgram)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Load the raw flight records from the local data directory.
# NOTE(review): setwd() with a machine-specific absolute path makes this script
# non-portable; it is kept here because code outside this section may rely on
# the working directory. Prefer a project-relative path when that is confirmed.
setwd("C:/Users/vaibhav/Desktop/DataSets")
flights.df <- read.csv("flights.csv")

# Columns carried into the analysis data frame, in their original order.
analysis_cols <- c(
  "YEAR", "MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "FLIGHT_NUMBER",
  "ORIGIN_AIRPORT", "DESTINATION_AIRPORT", "DEPARTURE_DELAY", "AIR_TIME",
  "DISTANCE", "ARRIVAL_DELAY", "DIVERTED", "CANCELLED",
  "CANCELLATION_REASON", "WEATHER_DELAY", "LATE_AIRCRAFT_DELAY",
  "AIRLINE_DELAY", "SECURITY_DELAY", "AIR_SYSTEM_DELAY"
)

# Selecting by name fails loudly on a missing or misspelled column, whereas
# `flights.df$X` returns NULL and cbind.data.frame() would silently drop it.
flights_new.df <- flights.df[, analysis_cols, drop = FALSE]

# Keep the `flights.df$<COLUMN>` column names that the rest of the script
# refers to with backticks.
names(flights_new.df) <- paste0("flights.df$", analysis_cols)

# Keep only cancelled flights (CANCELLED == 1). which() drops rows whose flag
# is NA, matching what subset() did.
is_cancelled <- flights_new.df[["flights.df$CANCELLED"]] == 1
cancelled_flights.df <- flights_new.df[which(is_cancelled), , drop = FALSE]

# Percentage of all flights that were cancelled.
nrow(cancelled_flights.df) / nrow(flights_new.df) * 100
## [1] 1.544643
Percentage of flights cancelled: 1.54%
### Finding which reasons lead to the most cancellations
# Count cancelled flights per cancellation-reason code, then print each code's
# share of all cancellations as a percentage.
mytable1 <- table(cancelled_flights.df[["flights.df$CANCELLATION_REASON"]])
100 * prop.table(mytable1)
##
## A B C D
## 0.00000000 28.10511326 54.34893863 17.52147212 0.02447599
# Bar chart of the number of cancelled flights per cancellation-reason code.
# The counts are tabulated explicitly and no `data` argument is passed:
# lattice ignores `data` for table input and warned about it
# ("explicit 'data' specification ignored").
barchart(
  table(cancelled_flights.df[["flights.df$CANCELLATION_REASON"]]),
  col = "black"
)
- 28% of cancellations were due to airline or carrier reasons (code A).
- 54% of cancellations were due to weather (code B).
- 17% of cancellations were due to National Air System reasons (code C).
- 0.02% of cancellations were due to security reasons (code D).

Hence the largest share of cancellations (54%) is due to weather.
### Getting into details: analyzing which month had the most cancellations
# Count cancelled flights per calendar month, then print each month's share of
# all cancellations as a percentage.
mytable2 <- table(cancelled_flights.df[["flights.df$MONTH"]])
100 * prop.table(mytable2)
##
## 1 2 3 4 5 6 7
## 13.330515 22.826087 12.240221 5.028704 6.334832 10.146411 5.346892
## 8 9 10 11 12
## 5.620578 2.308531 2.730186 5.116595 8.970451
# Histogram of cancelled flights by calendar month (month codes 1 to 12).
# Breaks sit on the half-integers so each month gets exactly one bar; the
# default breaks are not aligned to the integer month codes.
# Months belong on the x-axis and counts on the y-axis (the original labels
# were swapped).
hist(
  cancelled_flights.df[["flights.df$MONTH"]],
  breaks = seq(0.5, 12.5, by = 1),
  xlab = "Month",
  ylab = "Count",
  xlim = c(0.5, 12.5),
  main = "Cancellations per month",
  col = "red"
)
The top 5 months by number of cancellations are: 1) February, 2) January, 3) March, 4) June, 5) December. Cancellations are clearly highest in the winter months, when heavy snowfall is the likely cause. The following table of cancellation reasons by month supports this:
# Cross-tabulate cancellation reason (rows) against month (columns), then print
# the table as column percentages: each reason's share within a month.
mytable3 <- table(
  cancelled_flights.df[["flights.df$CANCELLATION_REASON"]],
  cancelled_flights.df[["flights.df$MONTH"]]
)
100 * prop.table(mytable3, margin = 2)
##
## 1 2 3 4 5
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## A 23.985978968 13.720329483 22.668605708 39.734513274 35.247629083
## B 58.587881823 75.288784910 62.388656608 39.579646018 48.823322796
## C 17.417793357 10.986011600 14.897291402 20.685840708 15.911485774
## D 0.008345852 0.004874007 0.045446282 0.000000000 0.017562346
##
## 6 7 8 9 10
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## A 39.682017544 53.849354973 46.793349169 52.433734940 38.834555827
## B 36.458333333 18.352059925 25.930324624 24.337349398 39.812550937
## C 23.859649123 27.777777778 27.236737926 23.228915663 21.352893236
## D 0.000000000 0.020807324 0.039588282 0.000000000 0.000000000
##
## 11 12
## 0.000000000 0.000000000
## A 23.592085236 19.583281657
## B 50.858882366 69.614287486
## C 25.309849967 10.802430857
## D 0.239182431 0.000000000
# Box plot of cancellation month grouped by cancellation-reason code.
# as.factor() keeps the grouped display when the reason column is character
# rather than factor (the read.csv default since R 4.0); plot() cannot place a
# character x on a numeric axis. An existing factor is passed through unchanged.
plot(
  as.factor(cancelled_flights.df[["flights.df$CANCELLATION_REASON"]]),
  cancelled_flights.df[["flights.df$MONTH"]],
  xlab = "Cancellation reason",
  ylab = "Month"
)
Analyzing which airlines had the most cancellations
# Count cancelled flights per airline code, then print each airline's share of
# all cancellations as a percentage.
mytable4 <- table(cancelled_flights.df[["flights.df$AIRLINE"]])
100 * prop.table(mytable4)
##
## AA AS B6 DL EV F9
## 12.1478795 0.7442926 4.7572427 4.2543723 16.9451738 0.6541765
## HA MQ NK OO UA US
## 0.1902452 16.7159895 2.2295403 11.0809488 7.3127587 4.5247208
## VX WN
## 0.5940991 17.8485604
The top 5 airlines by number of cancellations are: 1) WN: Southwest Airlines Co., 2) EV: Atlantic Southeast Airlines, 3) MQ: American Eagle Airlines Inc., 4) AA: American Airlines Inc., 5) OO: Skywest Airlines Inc.
Analyzing which day of the week had the most cancellations
# Count cancelled flights per day-of-week code, then print each day's share of
# all cancellations as a percentage.
mytable5 <- table(cancelled_flights.df[["flights.df$DAY_OF_WEEK"]])
100 * prop.table(mytable5)
##
## 1 2 3 4 5 6 7
## 23.444662 16.768279 11.936496 13.674291 9.795959 9.733657 14.646656
# Histogram of cancelled flights by day-of-week code (1 to 7).
# Breaks sit on the half-integers so each day gets exactly one bar; the default
# breaks are not aligned to the integer day codes.
# An explicit x-axis label replaces the default, which is the raw column
# expression.
hist(
  cancelled_flights.df[["flights.df$DAY_OF_WEEK"]],
  breaks = seq(0.5, 7.5, by = 1),
  xlab = "Day of week",
  main = "Cancellation of flights based on which day of week",
  col = "red"
)
In decreasing order of cancellations: 1) Monday, 2) Tuesday, 3) Sunday, 4) Thursday, 5) Wednesday, 6) Friday, 7) Saturday.