Q1

library(rapportools)
## Loading required package: reshape
## 
## Attaching package: 'rapportools'
## The following objects are masked from 'package:stats':
## 
##     IQR, median, sd, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, range, sum
library(outliers)
# Q1: load the 2008 flight records and inspect the column structure.
# stringsAsFactors is set explicitly because its default changed to FALSE in
# R 4.0.0; the factor-based str()/summary() output and the per-level counts
# used in the later steps rely on text columns being read as factors.
# header = TRUE and sep = "," are already the read.csv() defaults.
flights <- read.csv(
  "/home/peopleanalytics/Documents/2008.csv",
  stringsAsFactors = TRUE
)
str(flights)
## 'data.frame':    7009728 obs. of  29 variables:
##  $ Year             : int  2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ DayOfWeek        : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ DepTime          : int  2003 754 628 926 1829 1940 1937 1039 617 1620 ...
##  $ CRSDepTime       : int  1955 735 620 930 1755 1915 1830 1040 615 1620 ...
##  $ ArrTime          : int  2211 1002 804 1054 1959 2121 2037 1132 652 1639 ...
##  $ CRSArrTime       : int  2225 1000 750 1100 1925 2110 1940 1150 650 1655 ...
##  $ UniqueCarrier    : Factor w/ 20 levels "9E","AA","AQ",..: 18 18 18 18 18 18 18 18 18 18 ...
##  $ FlightNum        : int  335 3231 448 1746 3920 378 509 535 11 810 ...
##  $ TailNum          : Factor w/ 5374 levels "","80009E","80019E",..: 3769 4129 1961 3059 2142 3852 4062 1961 3616 3324 ...
##  $ ActualElapsedTime: int  128 128 96 88 90 101 240 233 95 79 ...
##  $ CRSElapsedTime   : int  150 145 90 90 90 115 250 250 95 95 ...
##  $ AirTime          : int  116 113 76 78 77 87 230 219 70 70 ...
##  $ ArrDelay         : int  -14 2 14 -6 34 11 57 -18 2 -16 ...
##  $ DepDelay         : int  8 19 8 -4 34 25 67 -1 2 0 ...
##  $ Origin           : Factor w/ 303 levels "ABE","ABI","ABQ",..: 136 136 141 141 141 141 141 141 141 141 ...
##  $ Dest             : Factor w/ 304 levels "ABE","ABI","ABQ",..: 287 287 49 49 49 151 157 157 177 177 ...
##  $ Distance         : int  810 810 515 515 515 688 1591 1591 451 451 ...
##  $ TaxiIn           : int  4 5 3 3 3 4 3 7 6 3 ...
##  $ TaxiOut          : int  8 10 17 7 10 10 7 7 19 6 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : Factor w/ 5 levels "","A","B","C",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CarrierDelay     : int  NA NA NA NA 2 NA 10 NA NA NA ...
##  $ WeatherDelay     : int  NA NA NA NA 0 NA 0 NA NA NA ...
##  $ NASDelay         : int  NA NA NA NA 0 NA 0 NA NA NA ...
##  $ SecurityDelay    : int  NA NA NA NA 0 NA 0 NA NA NA ...
##  $ LateAircraftDelay: int  NA NA NA NA 32 NA 47 NA NA NA ...
# Per-column overview: quantiles and NA counts for numeric columns,
# level frequencies for factor columns.
summary(object = flights)
##       Year          Month          DayofMonth      DayOfWeek    
##  Min.   :2008   Min.   : 1.000   Min.   : 1.00   Min.   :1.000  
##  1st Qu.:2008   1st Qu.: 3.000   1st Qu.: 8.00   1st Qu.:2.000  
##  Median :2008   Median : 6.000   Median :16.00   Median :4.000  
##  Mean   :2008   Mean   : 6.375   Mean   :15.73   Mean   :3.924  
##  3rd Qu.:2008   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000  
##  Max.   :2008   Max.   :12.000   Max.   :31.00   Max.   :7.000  
##                                                                 
##     DepTime         CRSDepTime      ArrTime         CRSArrTime  
##  Min.   :   1     Min.   :   0   Min.   :   1     Min.   :   0  
##  1st Qu.: 928     1st Qu.: 925   1st Qu.:1107     1st Qu.:1115  
##  Median :1325     Median :1320   Median :1512     Median :1517  
##  Mean   :1334     Mean   :1326   Mean   :1481     Mean   :1495  
##  3rd Qu.:1728     3rd Qu.:1715   3rd Qu.:1909     3rd Qu.:1907  
##  Max.   :2400     Max.   :2359   Max.   :2400     Max.   :2400  
##  NA's   :136246                  NA's   :151649                 
##  UniqueCarrier       FlightNum       TailNum        ActualElapsedTime
##  WN     :1201754   Min.   :   1          :  83365   Min.   :  12.0   
##  AA     : 604885   1st Qu.: 622   N476HA :   4701   1st Qu.:  77.0   
##  OO     : 567159   Median :1571   N477HA :   4548   Median : 110.0   
##  MQ     : 490693   Mean   :2224   N484HA :   4505   Mean   : 127.3   
##  US     : 453589   3rd Qu.:3518   N475HA :   4499   3rd Qu.: 157.0   
##  DL     : 451931   Max.   :9743   N480HA :   4416   Max.   :1379.0   
##  (Other):3239717                  (Other):6903694   NA's   :154699   
##  CRSElapsedTime      AirTime          ArrDelay          DepDelay      
##  Min.   :-141.0   Min.   :   0     Min.   :-519.00   Min.   :-534.00  
##  1st Qu.:  80.0   1st Qu.:  55     1st Qu.: -10.00   1st Qu.:  -4.00  
##  Median : 110.0   Median :  86     Median :  -2.00   Median :  -1.00  
##  Mean   : 128.9   Mean   : 104     Mean   :   8.17   Mean   :   9.97  
##  3rd Qu.: 159.0   3rd Qu.: 132     3rd Qu.:  12.00   3rd Qu.:   8.00  
##  Max.   :1435.0   Max.   :1350     Max.   :2461.00   Max.   :2467.00  
##  NA's   :844      NA's   :154699   NA's   :154699    NA's   :136246   
##      Origin             Dest            Distance          TaxiIn      
##  ATL    : 414513   ATL    : 414521   Min.   :  11.0   Min.   :  0.00  
##  ORD    : 350380   ORD    : 350452   1st Qu.: 325.0   1st Qu.:  4.00  
##  DFW    : 281281   DFW    : 281401   Median : 581.0   Median :  6.00  
##  DEN    : 241443   DEN    : 241470   Mean   : 726.4   Mean   :  6.86  
##  LAX    : 215608   LAX    : 215685   3rd Qu.: 954.0   3rd Qu.:  8.00  
##  PHX    : 199408   PHX    : 199416   Max.   :4962.0   Max.   :308.00  
##  (Other):5307095   (Other):5306783                    NA's   :151649  
##     TaxiOut         Cancelled       CancellationCode    Diverted       
##  Min.   :  0.00   Min.   :0.00000    :6872294        Min.   :0.000000  
##  1st Qu.: 10.00   1st Qu.:0.00000   A:  54330        1st Qu.:0.000000  
##  Median : 14.00   Median :0.00000   B:  54904        Median :0.000000  
##  Mean   : 16.45   Mean   :0.01961   C:  28188        Mean   :0.002463  
##  3rd Qu.: 19.00   3rd Qu.:0.00000   D:     12        3rd Qu.:0.000000  
##  Max.   :429.00   Max.   :1.00000                    Max.   :1.000000  
##  NA's   :137058                                                        
##   CarrierDelay      WeatherDelay        NASDelay       SecurityDelay    
##  Min.   :   0      Min.   :   0      Min.   :   0      Min.   :  0      
##  1st Qu.:   0      1st Qu.:   0      1st Qu.:   0      1st Qu.:  0      
##  Median :   0      Median :   0      Median :   6      Median :  0      
##  Mean   :  16      Mean   :   3      Mean   :  17      Mean   :  0      
##  3rd Qu.:  16      3rd Qu.:   0      3rd Qu.:  21      3rd Qu.:  0      
##  Max.   :2436      Max.   :1352      Max.   :1357      Max.   :392      
##  NA's   :5484993   NA's   :5484993   NA's   :5484993   NA's   :5484993  
##  LateAircraftDelay
##  Min.   :   0     
##  1st Qu.:   0     
##  Median :   0     
##  Mean   :  21     
##  3rd Qu.:  26     
##  Max.   :1316     
##  NA's   :5484993

Q2

# Q2: bar charts of how many flights fall into each category.
# The categories are drawn along the x axis and the bar height is the number
# of flights, so the count label belongs on the y axis (ylab), not on xlab.
counts <- table(flights$UniqueCarrier)
barplot(
  counts,
  main = "Carrier Distribution",
  xlab = "Carrier",
  ylab = "Number of flights"
)

# The table counts every level of CancellationCode, including the blank
# level, so the y axis is a flight count rather than a cancellation count.
count <- table(flights$CancellationCode)
barplot(
  count,
  main = "Cancellation Code",
  xlab = "Cancellation code",
  ylab = "Number of flights"
)

Q3

# Q3: keep only rows whose carrier is not "AQ" and whose cancellation code
# is not "D". A single combined condition keeps exactly the rows that pass
# both tests. subset() does not drop the now-unused factor levels.
flights <- subset(
  flights,
  UniqueCarrier != "AQ" & CancellationCode != "D"
)

Q4

# Q4: horizontal box-and-whisker plot of the actual elapsed time.
boxplot(flights[["ActualElapsedTime"]], horizontal = TRUE)

Q5

# Q5: drop every row whose ActualElapsedTime is one of the values that
# boxplot.stats() reports in $out (points beyond the whiskers).
# is.element() is the function form of %in% and never returns NA, so the
# logical vector can index the rows directly.
flights <- flights[
  !is.element(
    flights$ActualElapsedTime,
    boxplot.stats(flights$ActualElapsedTime)$out
  ),
]

Q6

# Q6: keep rows with 0 < TaxiIn < 120 (both bounds exclusive).
# which() discards positions where the comparison is NA, so rows with a
# missing TaxiIn are removed as well.
flights <- flights[which(flights$TaxiIn > 0 & flights$TaxiIn < 120), ]

Q7

# Q7: keep rows with 0 < TaxiOut < 50 (both bounds exclusive).
# which() discards positions where the comparison is NA, so rows with a
# missing TaxiOut are removed as well.
flights <- flights[which(flights$TaxiOut > 0 & flights$TaxiOut < 50), ]

Q8

# Q8: blank out the most extreme arrival delay instead of dropping rows.
# outlier() is called on the whole column and its result is compared against
# every entry; each entry equal to it becomes NA, all others are unchanged.
# NOTE(review): per the outliers package docs, outlier() returns the value
# furthest from the sample mean - confirm for the installed version.
flights$ArrDelay <- with(
  flights,
  ifelse(ArrDelay == outlier(ArrDelay), NA, ArrDelay)
)

Q9

# Q9: print the Distance values that rapportools::rp.outlier() flags.
# Nothing is assigned, so the data frame itself is left unchanged.
rp.outlier(flights[["Distance"]])
##  [1] 3972 3784 4243 3784 3711 4243 3386 4502 3711 3784

Q10

# Q10: flag scheduled elapsed times with outliers::scores(), using
# chi-squared scores and prob = 0.98, then tabulate the flags.
# NOTE(review): with prob set, scores() is expected to return a logical
# vector (TRUE = beyond the given probability) - confirm against the
# package documentation.
outliers <- scores(
  x = flights$CRSElapsedTime,
  type = "chisq",
  prob = 0.98
)
table(outliers)
## outliers
##   FALSE    TRUE 
## 6200053  210711