Each user must set a working directory for her/his own computer.

# Point the session at the folder that holds OnTime.csv; this path is
# machine-specific and must be edited by each user.
setwd("~/Dropbox/MyRWork")
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 defaults to FALSE;
# the str() output shown below (Factor columns) relies on factor conversion.
mydata <- read.csv("OnTime.csv", stringsAsFactors = TRUE)

Common commands to explore a data frame include:

dim(mydata)   # number of rows and number of columns
## [1] 49101    50
names(mydata)   # column (variable) names
##  [1] "Year"                 "Quarter"              "Month"               
##  [4] "DayofMonth"           "DayOfWeek"            "FlightDate"          
##  [7] "UniqueCarrier"        "TailNum"              "FlightNum"           
## [10] "Origin"               "OriginCityName"       "OriginState"         
## [13] "OriginStateFips"      "OriginStateName"      "Dest"                
## [16] "DestCityName"         "DestState"            "DestStateFips"       
## [19] "DestStateName"        "CRSDepTime"           "DepTime"             
## [22] "DepDelay"             "DepDelayMinutes"      "DepDel15"            
## [25] "DepartureDelayGroups" "DepTimeBlk"           "TaxiOut"             
## [28] "WheelsOff"            "WheelsOn"             "TaxiIn"              
## [31] "CRSArrTime"           "ArrTime"              "ArrDelay"            
## [34] "ArrDelayMinutes"      "ArrDel15"             "ArrivalDelayGroups"  
## [37] "ArrTimeBlk"           "Cancelled"            "CancellationCode"    
## [40] "Diverted"             "CRSElapsedTime"       "Distan"              
## [43] "AirTime"              "Distance"             "DistanceGroup"       
## [46] "CarrierDelay"         "WeatherDelay"         "NASDelay"            
## [49] "SecurityDelay"        "LateAircraftDelay"
str(mydata)   # type and first few values of every column
## 'data.frame':    49101 obs. of  50 variables:
##  $ Year                : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ Quarter             : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Month               : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ DayofMonth          : int  23 23 21 21 21 21 21 21 21 21 ...
##  $ DayOfWeek           : int  4 4 2 2 2 2 2 2 2 2 ...
##  $ FlightDate          : Factor w/ 31 levels "10/1/2014","10/10/2014",..: 16 16 14 14 14 14 14 14 14 14 ...
##  $ UniqueCarrier       : Factor w/ 14 levels "AA","AS","B6",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ TailNum             : Factor w/ 4167 levels "","N001AA","N002AA",..: 1920 1920 4041 3971 3817 2029 3866 2033 1956 3256 ...
##  $ FlightNum           : int  1964 1964 1899 1906 1908 1915 1920 1921 1926 1927 ...
##  $ Origin              : Factor w/ 308 levels "ABE","ABI","ABQ",..: 20 158 37 93 218 20 258 93 20 112 ...
##  $ OriginCityName      : Factor w/ 304 levels "Aberdeen, SD",..: 18 140 200 82 59 18 260 82 18 102 ...
##  $ OriginState         : Factor w/ 53 levels "AK","AL","AR",..: 10 9 43 22 14 10 10 22 10 9 ...
##  $ OriginStateFips     : int  13 12 47 26 17 13 13 26 13 12 ...
##  $ OriginStateName     : Factor w/ 53 levels "Alabama","Alaska",..: 10 9 43 22 13 10 10 22 10 9 ...
##  $ Dest                : Factor w/ 307 levels "ABE","ABI","ABQ",..: 157 20 20 37 93 227 20 227 183 20 ...
##  $ DestCityName        : Factor w/ 303 levels "Aberdeen, SD",..: 139 18 18 199 82 224 18 224 147 18 ...
##  $ DestState           : Factor w/ 53 levels "AK","AL","AR",..: 9 10 10 43 22 4 10 4 24 10 ...
##  $ DestStateFips       : int  12 13 13 47 26 4 13 4 29 13 ...
##  $ DestStateName       : Factor w/ 53 levels "Alabama","Alaska",..: 9 10 10 43 22 3 10 3 25 10 ...
##  $ CRSDepTime          : int  1628 1830 1605 847 1400 2045 900 825 1752 745 ...
##  $ DepTime             : int  1628 1824 1604 842 1403 2043 855 825 1814 744 ...
##  $ DepDelay            : int  0 -6 -1 -5 3 -2 -5 0 22 -1 ...
##  $ DepDelayMinutes     : int  0 0 0 0 3 0 0 0 22 0 ...
##  $ DepDel15            : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ DepartureDelayGroups: int  0 -1 -1 -1 0 -1 -1 0 1 -1 ...
##  $ DepTimeBlk          : Factor w/ 19 levels "0001-0559","0600-0659",..: 12 14 12 4 10 16 5 4 13 3 ...
##  $ TaxiOut             : int  18 16 16 18 15 26 17 18 15 14 ...
##  $ WheelsOff           : int  1646 1840 1620 900 1418 2109 912 843 1829 758 ...
##  $ WheelsOn            : int  1728 1921 1757 901 1616 2135 951 917 1905 923 ...
##  $ TaxiIn              : int  4 13 5 7 5 5 14 2 6 6 ...
##  $ CRSArrTime          : int  1740 1949 1817 923 1621 2155 1010 940 1904 936 ...
##  $ ArrTime             : int  1732 1934 1802 908 1621 2140 1005 919 1911 929 ...
##  $ ArrDelay            : int  -8 -15 -15 -15 0 -15 -5 -21 7 -7 ...
##  $ ArrDelayMinutes     : int  0 0 0 0 0 0 0 0 7 0 ...
##  $ ArrDel15            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ArrivalDelayGroups  : int  -1 -1 -1 -1 0 -1 -1 -2 0 -1 ...
##  $ ArrTimeBlk          : Factor w/ 19 levels "0001-0559","0600-0659",..: 13 15 14 5 12 17 6 5 15 5 ...
##  $ Cancelled           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode    : Factor w/ 4 levels "","A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Diverted            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRSElapsedTime      : int  72 79 72 96 81 250 70 255 132 111 ...
##  $ Distan              : int  64 70 58 86 78 237 70 234 117 105 ...
##  $ AirTime             : int  42 41 37 61 58 206 39 214 96 85 ...
##  $ Distance            : int  270 270 214 456 235 1587 214 1671 692 581 ...
##  $ DistanceGroup       : int  2 2 1 2 1 7 1 7 3 3 ...
##  $ CarrierDelay        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WeatherDelay        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ NASDelay            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ SecurityDelay       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ LateAircraftDelay   : int  NA NA NA NA NA NA NA NA NA NA ...
head(mydata)   # print the first six rows
##   Year Quarter Month DayofMonth DayOfWeek FlightDate UniqueCarrier TailNum
## 1 2014       4    10         23         4 10/23/2014            DL  N530US
## 2 2014       4    10         23         4 10/23/2014            DL  N530US
## 3 2014       4    10         21         2 10/21/2014            DL  N959DN
## 4 2014       4    10         21         2 10/21/2014            DL  N948DN
## 5 2014       4    10         21         2 10/21/2014            DL  N926DL
## 6 2014       4    10         21         2 10/21/2014            DL  N555NW
##   FlightNum Origin   OriginCityName OriginState OriginStateFips
## 1      1964    ATL      Atlanta, GA          GA              13
## 2      1964    JAX Jacksonville, FL          FL              12
## 3      1899    BNA    Nashville, TN          TN              47
## 4      1906    DTW      Detroit, MI          MI              26
## 5      1908    ORD      Chicago, IL          IL              17
## 6      1915    ATL      Atlanta, GA          GA              13
##   OriginStateName Dest     DestCityName DestState DestStateFips
## 1         Georgia  JAX Jacksonville, FL        FL            12
## 2         Florida  ATL      Atlanta, GA        GA            13
## 3       Tennessee  ATL      Atlanta, GA        GA            13
## 4        Michigan  BNA    Nashville, TN        TN            47
## 5        Illinois  DTW      Detroit, MI        MI            26
## 6         Georgia  PHX      Phoenix, AZ        AZ             4
##   DestStateName CRSDepTime DepTime DepDelay DepDelayMinutes DepDel15
## 1       Florida       1628    1628        0               0        0
## 2       Georgia       1830    1824       -6               0        0
## 3       Georgia       1605    1604       -1               0        0
## 4     Tennessee        847     842       -5               0        0
## 5      Michigan       1400    1403        3               3        0
## 6       Arizona       2045    2043       -2               0        0
##   DepartureDelayGroups DepTimeBlk TaxiOut WheelsOff WheelsOn TaxiIn
## 1                    0  1600-1659      18      1646     1728      4
## 2                   -1  1800-1859      16      1840     1921     13
## 3                   -1  1600-1659      16      1620     1757      5
## 4                   -1  0800-0859      18       900      901      7
## 5                    0  1400-1459      15      1418     1616      5
## 6                   -1  2000-2059      26      2109     2135      5
##   CRSArrTime ArrTime ArrDelay ArrDelayMinutes ArrDel15 ArrivalDelayGroups
## 1       1740    1732       -8               0        0                 -1
## 2       1949    1934      -15               0        0                 -1
## 3       1817    1802      -15               0        0                 -1
## 4        923     908      -15               0        0                 -1
## 5       1621    1621        0               0        0                  0
## 6       2155    2140      -15               0        0                 -1
##   ArrTimeBlk Cancelled CancellationCode Diverted CRSElapsedTime Distan
## 1  1700-1759         0                         0             72     64
## 2  1900-1959         0                         0             79     70
## 3  1800-1859         0                         0             72     58
## 4  0900-0959         0                         0             96     86
## 5  1600-1659         0                         0             81     78
## 6  2100-2159         0                         0            250    237
##   AirTime Distance DistanceGroup CarrierDelay WeatherDelay NASDelay
## 1      42      270             2           NA           NA       NA
## 2      41      270             2           NA           NA       NA
## 3      37      214             1           NA           NA       NA
## 4      61      456             2           NA           NA       NA
## 5      58      235             1           NA           NA       NA
## 6     206     1587             7           NA           NA       NA
##   SecurityDelay LateAircraftDelay
## 1            NA                NA
## 2            NA                NA
## 3            NA                NA
## 4            NA                NA
## 5            NA                NA
## 6            NA                NA
summary(mydata[["ArrDelay"]])   # min, quartiles, mean, max and NA count for one column
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -65.00  -11.00   -3.00    4.87    8.00 1308.00     606

Sometimes we want to use just a portion of a data table. Here we create a new object containing 4 of the 50 variables (as specified) for rows where ArrDelay > 15 minutes. This is similar to SELECT … WHERE in SQL.

# Name the wanted columns up front so the subset() call stays short.
varlist <- c("DepDelay", "ArrDelay", "AirTime", "Distance")
# Keep rows whose arrival delay exceeds 15, restricted to the columns above.
myvars <- subset(x = mydata, subset = ArrDelay > 15, select = varlist)
str(object = myvars)
## 'data.frame':    8759 obs. of  4 variables:
##  $ DepDelay: int  -4 11 73 39 10 85 68 139 25 47 ...
##  $ ArrDelay: int  28 28 64 42 18 67 67 134 68 31 ...
##  $ AirTime : int  100 206 130 125 272 332 103 67 79 76 ...
##  $ Distance: int  746 1399 1010 980 1990 2586 746 440 488 488 ...

This creates a more compact data table with fewer than 9000 rows. Suppose we want to build a regression model to estimate arrival delays. As standard practice in model-building with “big data”, we first split the large data frame into two partitions, a training set and a test set:

# Split a data frame at random into Training and Testing samples.

set.seed(1234)   # seed the random number generator so the split is repeatable
# "ind" is a vector of randomly drawn 1s and 2s, one value per row of the
# myvars data frame. Each value is drawn as 1 with probability 0.6 and as 2
# with probability 0.4.
ind <- sample(x = 2, size = nrow(myvars), replace = TRUE, prob = c(0.6, 0.4))
table(ind)
## ind
##    1    2 
## 5274 3485
# Rows of myvars whose ind value is 1 make up the training sample ...
train <- myvars[ind == 1, , drop = FALSE]
# ... and rows whose ind value is 2 make up the test sample.
test <- myvars[ind == 2, , drop = FALSE]

dim(train)
## [1] 5274    4
dim(test)
## [1] 3485    4

Before creating models, let’s look at the correlations among the numeric variables. To help visualize the correlations, we’ll use the package called “corrplot”. Be sure to install the package before loading it with library().

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.1.3
# Training columns, reordered so that ArrDelay comes first.
tr <- train[, c("ArrDelay", "DepDelay", "AirTime", "Distance")]

# Pairwise Pearson correlations between the columns of tr.
cm <- cor(x = tr, method = "pearson")
cm
##             ArrDelay    DepDelay     AirTime    Distance
## ArrDelay  1.00000000  0.95447643 -0.01176882 -0.01592223
## DepDelay  0.95447643  1.00000000 -0.05361880 -0.02675997
## AirTime  -0.01176882 -0.05361880  1.00000000  0.98126974
## Distance -0.01592223 -0.02675997  0.98126974  1.00000000
# Plot the correlation matrix: ellipse glyphs, lower triangle only.
corrplot(corr = cm, method = "ellipse", type = "lower")

Now we’ll fit 2 linear regression models on the training data to estimate arrival delay. We’ll choose the better model to re-run with the test data:

# Model 1: arrival delay regressed on departure delay only.
lm1 <- lm(formula = ArrDelay ~ DepDelay, data = tr)
summary(object = lm1)
## 
## Call:
## lm(formula = ArrDelay ~ DepDelay, data = tr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.263 -11.307  -3.147   8.465 148.188 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.287218   0.308209   33.38   <2e-16 ***
## DepDelay     0.917482   0.003949  232.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.24 on 5272 degrees of freedom
## Multiple R-squared:  0.911,  Adjusted R-squared:  0.911 
## F-statistic: 5.398e+04 on 1 and 5272 DF,  p-value: < 2.2e-16
# Model 2: arrival delay regressed on departure delay and air time.
lm2 <- lm(formula = ArrDelay ~ DepDelay + AirTime, data = tr)
summary(object = lm2)
## 
## Call:
## lm(formula = ArrDelay ~ DepDelay + AirTime, data = tr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.764 -10.731  -2.599   8.044 143.304 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.623830   0.486052  13.628   <2e-16 ***
## DepDelay    0.919519   0.003920 234.559   <2e-16 ***
## AirTime     0.031431   0.003243   9.691   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.09 on 5271 degrees of freedom
## Multiple R-squared:  0.9126, Adjusted R-squared:  0.9125 
## F-statistic: 2.751e+04 on 2 and 5271 DF,  p-value: < 2.2e-16

The second model is the better fit. Now re-estimate it with the test data set and see if the quality of the model persists.

# Same two-predictor specification, re-fitted on the test sample.
lm2t <- lm(formula = ArrDelay ~ DepDelay + AirTime, data = test)
summary(object = lm2t)
## 
## Call:
## lm(formula = ArrDelay ~ DepDelay + AirTime, data = test)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.112 -10.969  -3.005   8.049 124.828 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.336614   0.630047  11.645  < 2e-16 ***
## DepDelay    0.905136   0.005653 160.103  < 2e-16 ***
## AirTime     0.031856   0.004091   7.786 9.04e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.46 on 3482 degrees of freedom
## Multiple R-squared:  0.8805, Adjusted R-squared:  0.8804 
## F-statistic: 1.283e+04 on 2 and 3482 DF,  p-value: < 2.2e-16