Q1
library(rapportools)
## Loading required package: reshape
##
## Attaching package: 'rapportools'
## The following objects are masked from 'package:stats':
##
## IQR, median, sd, var
## The following objects are masked from 'package:base':
##
## max, mean, min, range, sum
library(outliers)
# Load the 2008 flights CSV into a data frame and print its structure.
# stringsAsFactors is pinned to TRUE so the text columns (UniqueCarrier,
# TailNum, Origin, Dest, CancellationCode) load as factors on every R
# version; since R 4.0.0 the default is FALSE, which would change the
# str() and summary() output for those columns.
flights <- read.csv(
  "/home/peopleanalytics/Documents/2008.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
str(flights)
## 'data.frame': 7009728 obs. of 29 variables:
## $ Year : int 2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 3 3 3 3 3 3 3 3 3 3 ...
## $ DayOfWeek : int 4 4 4 4 4 4 4 4 4 4 ...
## $ DepTime : int 2003 754 628 926 1829 1940 1937 1039 617 1620 ...
## $ CRSDepTime : int 1955 735 620 930 1755 1915 1830 1040 615 1620 ...
## $ ArrTime : int 2211 1002 804 1054 1959 2121 2037 1132 652 1639 ...
## $ CRSArrTime : int 2225 1000 750 1100 1925 2110 1940 1150 650 1655 ...
## $ UniqueCarrier : Factor w/ 20 levels "9E","AA","AQ",..: 18 18 18 18 18 18 18 18 18 18 ...
## $ FlightNum : int 335 3231 448 1746 3920 378 509 535 11 810 ...
## $ TailNum : Factor w/ 5374 levels "","80009E","80019E",..: 3769 4129 1961 3059 2142 3852 4062 1961 3616 3324 ...
## $ ActualElapsedTime: int 128 128 96 88 90 101 240 233 95 79 ...
## $ CRSElapsedTime : int 150 145 90 90 90 115 250 250 95 95 ...
## $ AirTime : int 116 113 76 78 77 87 230 219 70 70 ...
## $ ArrDelay : int -14 2 14 -6 34 11 57 -18 2 -16 ...
## $ DepDelay : int 8 19 8 -4 34 25 67 -1 2 0 ...
## $ Origin : Factor w/ 303 levels "ABE","ABI","ABQ",..: 136 136 141 141 141 141 141 141 141 141 ...
## $ Dest : Factor w/ 304 levels "ABE","ABI","ABQ",..: 287 287 49 49 49 151 157 157 177 177 ...
## $ Distance : int 810 810 515 515 515 688 1591 1591 451 451 ...
## $ TaxiIn : int 4 5 3 3 3 4 3 7 6 3 ...
## $ TaxiOut : int 8 10 17 7 10 10 7 7 19 6 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : Factor w/ 5 levels "","A","B","C",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CarrierDelay : int NA NA NA NA 2 NA 10 NA NA NA ...
## $ WeatherDelay : int NA NA NA NA 0 NA 0 NA NA NA ...
## $ NASDelay : int NA NA NA NA 0 NA 0 NA NA NA ...
## $ SecurityDelay : int NA NA NA NA 0 NA 0 NA NA NA ...
## $ LateAircraftDelay: int NA NA NA NA 32 NA 47 NA NA NA ...
# Print the per-column summary of the flights data frame.
summary(object = flights)
## Year Month DayofMonth DayOfWeek
## Min. :2008 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2008 1st Qu.: 3.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2008 Median : 6.000 Median :16.00 Median :4.000
## Mean :2008 Mean : 6.375 Mean :15.73 Mean :3.924
## 3rd Qu.:2008 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2008 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime CRSDepTime ArrTime CRSArrTime
## Min. : 1 Min. : 0 Min. : 1 Min. : 0
## 1st Qu.: 928 1st Qu.: 925 1st Qu.:1107 1st Qu.:1115
## Median :1325 Median :1320 Median :1512 Median :1517
## Mean :1334 Mean :1326 Mean :1481 Mean :1495
## 3rd Qu.:1728 3rd Qu.:1715 3rd Qu.:1909 3rd Qu.:1907
## Max. :2400 Max. :2359 Max. :2400 Max. :2400
## NA's :136246 NA's :151649
## UniqueCarrier FlightNum TailNum ActualElapsedTime
## WN :1201754 Min. : 1 : 83365 Min. : 12.0
## AA : 604885 1st Qu.: 622 N476HA : 4701 1st Qu.: 77.0
## OO : 567159 Median :1571 N477HA : 4548 Median : 110.0
## MQ : 490693 Mean :2224 N484HA : 4505 Mean : 127.3
## US : 453589 3rd Qu.:3518 N475HA : 4499 3rd Qu.: 157.0
## DL : 451931 Max. :9743 N480HA : 4416 Max. :1379.0
## (Other):3239717 (Other):6903694 NA's :154699
## CRSElapsedTime AirTime ArrDelay DepDelay
## Min. :-141.0 Min. : 0 Min. :-519.00 Min. :-534.00
## 1st Qu.: 80.0 1st Qu.: 55 1st Qu.: -10.00 1st Qu.: -4.00
## Median : 110.0 Median : 86 Median : -2.00 Median : -1.00
## Mean : 128.9 Mean : 104 Mean : 8.17 Mean : 9.97
## 3rd Qu.: 159.0 3rd Qu.: 132 3rd Qu.: 12.00 3rd Qu.: 8.00
## Max. :1435.0 Max. :1350 Max. :2461.00 Max. :2467.00
## NA's :844 NA's :154699 NA's :154699 NA's :136246
## Origin Dest Distance TaxiIn
## ATL : 414513 ATL : 414521 Min. : 11.0 Min. : 0.00
## ORD : 350380 ORD : 350452 1st Qu.: 325.0 1st Qu.: 4.00
## DFW : 281281 DFW : 281401 Median : 581.0 Median : 6.00
## DEN : 241443 DEN : 241470 Mean : 726.4 Mean : 6.86
## LAX : 215608 LAX : 215685 3rd Qu.: 954.0 3rd Qu.: 8.00
## PHX : 199408 PHX : 199416 Max. :4962.0 Max. :308.00
## (Other):5307095 (Other):5306783 NA's :151649
## TaxiOut Cancelled CancellationCode Diverted
## Min. : 0.00 Min. :0.00000 :6872294 Min. :0.000000
## 1st Qu.: 10.00 1st Qu.:0.00000 A: 54330 1st Qu.:0.000000
## Median : 14.00 Median :0.00000 B: 54904 Median :0.000000
## Mean : 16.45 Mean :0.01961 C: 28188 Mean :0.002463
## 3rd Qu.: 19.00 3rd Qu.:0.00000 D: 12 3rd Qu.:0.000000
## Max. :429.00 Max. :1.00000 Max. :1.000000
## NA's :137058
## CarrierDelay WeatherDelay NASDelay SecurityDelay
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median : 6 Median : 0
## Mean : 16 Mean : 3 Mean : 17 Mean : 0
## 3rd Qu.: 16 3rd Qu.: 0 3rd Qu.: 21 3rd Qu.: 0
## Max. :2436 Max. :1352 Max. :1357 Max. :392
## NA's :5484993 NA's :5484993 NA's :5484993 NA's :5484993
## LateAircraftDelay
## Min. : 0
## 1st Qu.: 0
## Median : 0
## Mean : 21
## 3rd Qu.: 26
## Max. :1316
## NA's :5484993
Q2
# Bar chart of the number of flights per carrier.
# barplot() draws vertical bars by default, so the carriers sit on the
# x axis and the flight counts on the y axis; label the axes accordingly.
counts <- table(flights$UniqueCarrier)
barplot(
  counts,
  main = "Carrier Distribution",
  xlab = "Carrier",
  ylab = "Number of flights"
)

# Bar chart of the number of flights recorded under each cancellation code.
# barplot() draws vertical bars by default, so the codes sit on the x axis
# and the counts on the y axis. The factor includes an empty-string level,
# so the counts are flights per code rather than cancellations only.
count <- table(flights$CancellationCode)
barplot(
  count,
  main = "Cancellation Code",
  xlab = "Cancellation code",
  ylab = "Number of flights"
)

Q3
# Drop every row whose carrier is "AQ" or whose cancellation code is "D".
flights <- subset(
  flights,
  UniqueCarrier != "AQ" & CancellationCode != "D"
)
Q4
# Horizontal box plot of the ActualElapsedTime column.
boxplot(flights[["ActualElapsedTime"]], horizontal = TRUE)

Q5
# Keep only the rows whose ActualElapsedTime is not among the values that
# boxplot.stats() reports as outliers. %in% never yields NA, so the negated
# match can be used directly as the row index.
flights <- flights[
  !(flights$ActualElapsedTime %in%
    boxplot.stats(flights$ActualElapsedTime)$out),
]
Q6
# Keep flights whose TaxiIn is strictly between 0 and 120;
# subset() also drops the rows where the condition is NA.
flights <- subset(flights, TaxiIn > 0 & TaxiIn < 120)
Q7
# Keep flights whose TaxiOut is strictly between 0 and 50;
# subset() also drops the rows where the condition is NA.
flights <- subset(flights, TaxiOut > 0 & TaxiOut < 50)
Q8
# Replace the ArrDelay entries equal to the value returned by outlier()
# with NA; every other entry is kept as it is.
flights$ArrDelay <- with(
  flights,
  ifelse(ArrDelay == outlier(ArrDelay), NA, ArrDelay)
)
Q9
# Print the Distance values that rp.outlier() flags as outliers.
rp.outlier(flights[["Distance"]])
## [1] 3972 3784 4243 3784 3711 4243 3386 4502 3711 3784
Q10
# Flag CRSElapsedTime values using chi-squared scores at probability 0.98,
# then tabulate how many values were and were not flagged.
outliers <- scores(
  x = flights$CRSElapsedTime,
  type = "chisq",
  prob = 0.98
)
table(outliers)
## outliers
## FALSE TRUE
## 6200053 210711