## [1] 1048575 10
## [1] "X" "FlightNumber" "DepDelayMinutes" "DepDelay"
## [5] "Duration" "Distance" "DepTime" "DepDay"
## [9] "Airline" "OriginStateName"
## 'data.frame': 1048575 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ FlightNumber : int 1674 1674 1674 1674 1674 1674 1674 1674 1674 1674 ...
## $ DepDelayMinutes: int 4 0 0 0 2 0 1 0 0 0 ...
## $ DepDelay : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Duration : int 287 287 287 287 287 287 287 287 287 287 ...
## $ Distance : int 2125 2125 2125 2125 2125 2125 2125 2125 2125 2125 ...
## $ DepTime : chr "AM" "AM" "AM" "AM" ...
## $ DepDay : chr "Weekend" "Weekday" "Weekday" "Weekday" ...
## $ Airline : chr "American" "American" "American" "American" ...
## $ OriginStateName: chr "California" "California" "California" "California" ...
# Convert the identifier-like and categorical columns of airDelay.df to
# factors in a single pass instead of one assignment per column.
factor_cols <- c(
  "FlightNumber", "DepDelay", "DepTime",
  "DepDay", "Airline", "OriginStateName"
)
airDelay.df[factor_cols] <- lapply(airDelay.df[factor_cols], as.factor)
## [1] 11.33995
## [1] 41.05392
## [1] 0
## [1] 2109
## [1] 0 2109
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 11.34 5.00 2109.00
## X FlightNumber DepDelayMinutes DepDelay
## Min. : 1 511 : 575 Min. : 0.00 0:877147
## 1st Qu.: 264329 34 : 564 1st Qu.: 0.00 1:171428
## Median : 528996 352 : 551 Median : 0.00
## Mean : 529263 423 : 525 Mean : 11.34
## 3rd Qu.: 793788 566 : 509 3rd Qu.: 5.00
## Max. :1059843 1905 : 509 Max. :2109.00
## (Other):1045342
## Duration Distance DepTime DepDay
## Min. : 21.0 Min. : 31.0 AM:438993 Weekday:788004
## 1st Qu.: 88.0 1st Qu.: 358.0 PM:609582 Weekend:260571
## Median :120.0 Median : 628.0
## Mean :140.1 Mean : 796.8
## 3rd Qu.:170.0 3rd Qu.:1024.0
## Max. :675.0 Max. :4983.0
##
## Airline OriginStateName
## American :152027 California:110633
## Delta :157957 Texas :109364
## Others :569196 Florida : 80479
## Southwest:169395 Georgia : 64433
## Illinois : 63093
## New York : 60224
## (Other) :560349
## vars n mean sd min max range se
## X 1 1048575 529263.23 305867.50 1 1059843 1059842 298.70
## FlightNumber 2 1048575 NaN NA Inf -Inf -Inf NA
## DepDelayMinutes 3 1048575 11.34 41.05 0 2109 2109 0.04
## DepDelay 4 1048575 NaN NA Inf -Inf -Inf NA
## Duration 5 1048575 140.06 73.11 21 675 654 0.07
## Distance 6 1048575 796.78 599.01 31 4983 4952 0.58
## DepTime 7 1048575 NaN NA Inf -Inf -Inf NA
## DepDay 8 1048575 NaN NA Inf -Inf -Inf NA
## Airline 9 1048575 NaN NA Inf -Inf -Inf NA
## OriginStateName 10 1048575 NaN NA Inf -Inf -Inf NA
# Descriptive statistics using the aggregate() function:
# mean of DepDelayMinutes, Duration and Distance within each Airline group.
# The grouping list is left unnamed, so the group column is labelled "Group.1".
aggregate(
  airDelay.df[, c("DepDelayMinutes", "Duration", "Distance")],
  by = list(Airline),
  FUN = mean
)
## Group.1 DepDelayMinutes Duration Distance
## 1 American 12.102929 166.0034 992.1403
## 2 Delta 8.057687 147.7035 857.5109
## 3 Others 12.388894 134.8432 743.1459
## 4 Southwest 10.191210 127.1974 745.0165
## Airline
## American Delta Others Southwest
## 152027 157957 569196 169395
## Airline
## American Delta Others Southwest
## 14.50 15.06 54.28 16.15
# Joint percentage table of Airline by DepDay: prop.table() without a margin
# divides every cell by the grand total, so each cell is a share of all flights.
# NOTE(review): Airline and DepDay are used as bare names; presumably the
# columns of airDelay.df were exposed earlier (e.g. via attach()) - confirm.
freqtab <- table(Airline, DepDay)
proptab <- prop.table(freqtab)
# Add the margins to the unrounded percentages and round last. Rounding the
# cells first makes the "Sum" row/column a sum of rounded values, so the
# rounding errors accumulate (grand total 99.99 instead of 100).
Pertab <- round(addmargins(proptab * 100), 2)
# NOTE: the printout that follows was captured from the earlier
# round-then-sum version, so its Sum entries can differ by 0.01.
Pertab
## DepDay
## Airline Weekday Weekend Sum
## American 10.85 3.65 14.50
## Delta 11.47 3.59 15.06
## Others 40.56 13.72 54.28
## Southwest 12.26 3.89 16.15
## Sum 75.14 24.85 99.99
# Joint percentage table of Airline by DepDelay: prop.table() without a margin
# divides every cell by the grand total, so each cell is a share of all flights.
# NOTE(review): Airline and DepDelay are used as bare names; presumably the
# columns of airDelay.df were exposed earlier (e.g. via attach()) - confirm.
freqtab <- table(Airline, DepDelay)
proptab <- prop.table(freqtab)
# Add the margins to the unrounded percentages and round last. Rounding the
# cells first makes the "Sum" row/column a sum of rounded values, so the
# rounding errors accumulate (grand total 100.01 instead of 100).
Pertab <- round(addmargins(proptab * 100), 2)
# NOTE: the printout that follows was captured from the earlier
# round-then-sum version, so its Sum entries can differ by 0.01.
Pertab
## DepDelay
## Airline 0 1 Sum
## American 12.07 2.43 14.50
## Delta 13.18 1.89 15.07
## Others 45.33 8.95 54.28
## Southwest 13.07 3.09 16.16
## Sum 83.65 16.36 100.01
# Three-way joint percentage table (Airline x DepDelay x DepDay): prop.table()
# without a margin divides every cell by the grand total of all flights.
# NOTE(review): Airline, DepDelay and DepDay are used as bare names; presumably
# the columns of airDelay.df were exposed earlier (e.g. via attach()) - confirm.
freqtab <- table(Airline, DepDelay, DepDay)
proptab <- prop.table(freqtab)
# Add the margins (along all three dimensions) to the unrounded percentages
# and round last. Rounding the cells first makes every "Sum" entry a sum of
# rounded values, so the rounding errors accumulate (grand total 99.99
# instead of 100).
Pertab <- round(addmargins(proptab * 100), 2)
# NOTE: the printout that follows was captured from the earlier
# round-then-sum version, so its Sum entries can differ by 0.01.
Pertab
## , , DepDay = Weekday
##
## DepDelay
## Airline 0 1 Sum
## American 9.01 1.84 10.85
## Delta 9.92 1.55 11.47
## Others 33.62 6.94 40.56
## Southwest 9.89 2.37 12.26
## Sum 62.44 12.70 75.14
##
## , , DepDay = Weekend
##
## DepDelay
## Airline 0 1 Sum
## American 3.06 0.59 3.65
## Delta 3.26 0.33 3.59
## Others 11.71 2.01 13.72
## Southwest 3.18 0.71 3.89
## Sum 21.21 3.64 24.85
##
## , , DepDay = Sum
##
## DepDelay
## Airline 0 1 Sum
## American 12.07 2.43 14.50
## Delta 13.18 1.88 15.06
## Others 45.33 8.95 54.28
## Southwest 13.07 3.08 16.15
## Sum 83.65 16.34 99.99
# Pie chart of the share of flights in each DepDelay category.
# 1) Contingency table of the variable to be charted
tab1 <- table(DepDelay)
# 2) Turn the counts into proportions, then into percentages (1 decimal place)
tab2 <- prop.table(tab1)
percent <- round(tab2 * 100, 1)
# 3a) One label per slice: the percentage followed by a "%" sign
pielabels <- paste0(percent, "%")
# 3b) Draw the pie; slice sizes come from the proportions in tab2
pie(
  tab2,
  labels = pielabels,
  col = c("lightblue", "red"),
  main = "% Delayed Flights",
  cex = 1.1
)
# 3c) Legend mapping each fill colour to its DepDelay level
legend(
  "topright",
  legend = c("0", "1"),
  fill = c("lightblue", "red"),
  cex = 0.8
)
# 1) Create the Contingency Table for which you want a Bar Plot
# Bar plot of the percentage of flights belonging to each Airline level.
tab1 <- table(Airline)
# 2) Proportions of the total, expressed as percentages (1 decimal place)
tab2 <- prop.table(tab1)
percent <- round(tab2 * 100, 1)
# barplot() returns the bar midpoints, which are reused below to place labels.
bp <- barplot(
  percent,
  main = "% flights by Airline",
  xlab = "Airline",
  ylab = "Percent (%)",
  ylim = c(0, 60),
  col = "yellow",
  beside = TRUE
)
# 3) (Optional) Write each percentage at the base of its bar
#    (pos = 3 places the text just above y = 0).
text(x = bp, y = 0, labels = percent, pos = 3)
# 1)
# Grouped bar plot of DepDelay (rows) within each Airline (columns).
# prop.table() without a margin divides by the grand total, so every bar is a
# share of all flights rather than a within-airline rate.
tab1 <- table(DepDelay, Airline)
tab2 <- prop.table(tab1)
tab3 <- round(tab2 * 100, 2)
# 2) Grouped bar-plot
# The legend argument is spelled out as `legend.text`; the previous
# `legend =` only worked through partial argument matching.
bp <- barplot(tab3,
              xlab = "Airlines", ylab = "Percent (%)",
              main = "% of Delayed flights by Airlines",
              col = c("lightblue", "orange"),
              beside = TRUE,
              ylim = c(0, 50),
              legend.text = rownames(tab3))
# 3) (Optional) Display percentage on the bars; bp holds the bar midpoints in
#    the same row-by-column layout as tab3, so labels line up with their bars.
text(bp, 0, tab3, pos = 3)
# 1)
# Grouped bar plot of DepDelay (rows) within each DepDay level (columns).
# prop.table() without a margin divides by the grand total, so every bar is a
# share of all flights.
tab1 <- table(DepDelay, DepDay)
tab2 <- prop.table(tab1)
tab3 <- round(tab2 * 100, 2)
# 2) Grouped bar plot; the returned bar midpoints are kept for labelling
bp <- barplot(
  tab3,
  main = " % of Delayed flights by DepDay (Weekend/Weekday)",
  xlab = "DepDay",
  ylab = "Percent (%)",
  ylim = c(0, 70),
  col = c("lightblue", "orange"),
  beside = TRUE,
  legend.text = rownames(tab3)
)
# 3) (Optional) Write each percentage at the base of its bar
text(x = bp, y = 0, labels = tab3, pos = 3)