Each user must set a working directory for her/his own computer.
# Point R at the folder that holds the data file; this path is machine-specific.
setwd("~/Dropbox/MyRWork")
# Read the on-time flight records into a data frame. stringsAsFactors = TRUE
# pins the pre-4.0 default so text columns are read as factors on every R
# version, matching the str() output shown below.
mydata <- read.csv("OnTime.csv", stringsAsFactors = TRUE)
Common commands to explore a data frame include:
# number of rows and number of columns in the data frame
dim(mydata)
## [1] 49101 50
# column names, in column order
names(mydata)
## [1] "Year" "Quarter" "Month"
## [4] "DayofMonth" "DayOfWeek" "FlightDate"
## [7] "UniqueCarrier" "TailNum" "FlightNum"
## [10] "Origin" "OriginCityName" "OriginState"
## [13] "OriginStateFips" "OriginStateName" "Dest"
## [16] "DestCityName" "DestState" "DestStateFips"
## [19] "DestStateName" "CRSDepTime" "DepTime"
## [22] "DepDelay" "DepDelayMinutes" "DepDel15"
## [25] "DepartureDelayGroups" "DepTimeBlk" "TaxiOut"
## [28] "WheelsOff" "WheelsOn" "TaxiIn"
## [31] "CRSArrTime" "ArrTime" "ArrDelay"
## [34] "ArrDelayMinutes" "ArrDel15" "ArrivalDelayGroups"
## [37] "ArrTimeBlk" "Cancelled" "CancellationCode"
## [40] "Diverted" "CRSElapsedTime" "Distan"
## [43] "AirTime" "Distance" "DistanceGroup"
## [46] "CarrierDelay" "WeatherDelay" "NASDelay"
## [49] "SecurityDelay" "LateAircraftDelay"
# compact structure listing: each column's type and its first few values
str(mydata)
## 'data.frame': 49101 obs. of 50 variables:
## $ Year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ Quarter : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Month : int 10 10 10 10 10 10 10 10 10 10 ...
## $ DayofMonth : int 23 23 21 21 21 21 21 21 21 21 ...
## $ DayOfWeek : int 4 4 2 2 2 2 2 2 2 2 ...
## $ FlightDate : Factor w/ 31 levels "10/1/2014","10/10/2014",..: 16 16 14 14 14 14 14 14 14 14 ...
## $ UniqueCarrier : Factor w/ 14 levels "AA","AS","B6",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ TailNum : Factor w/ 4167 levels "","N001AA","N002AA",..: 1920 1920 4041 3971 3817 2029 3866 2033 1956 3256 ...
## $ FlightNum : int 1964 1964 1899 1906 1908 1915 1920 1921 1926 1927 ...
## $ Origin : Factor w/ 308 levels "ABE","ABI","ABQ",..: 20 158 37 93 218 20 258 93 20 112 ...
## $ OriginCityName : Factor w/ 304 levels "Aberdeen, SD",..: 18 140 200 82 59 18 260 82 18 102 ...
## $ OriginState : Factor w/ 53 levels "AK","AL","AR",..: 10 9 43 22 14 10 10 22 10 9 ...
## $ OriginStateFips : int 13 12 47 26 17 13 13 26 13 12 ...
## $ OriginStateName : Factor w/ 53 levels "Alabama","Alaska",..: 10 9 43 22 13 10 10 22 10 9 ...
## $ Dest : Factor w/ 307 levels "ABE","ABI","ABQ",..: 157 20 20 37 93 227 20 227 183 20 ...
## $ DestCityName : Factor w/ 303 levels "Aberdeen, SD",..: 139 18 18 199 82 224 18 224 147 18 ...
## $ DestState : Factor w/ 53 levels "AK","AL","AR",..: 9 10 10 43 22 4 10 4 24 10 ...
## $ DestStateFips : int 12 13 13 47 26 4 13 4 29 13 ...
## $ DestStateName : Factor w/ 53 levels "Alabama","Alaska",..: 9 10 10 43 22 3 10 3 25 10 ...
## $ CRSDepTime : int 1628 1830 1605 847 1400 2045 900 825 1752 745 ...
## $ DepTime : int 1628 1824 1604 842 1403 2043 855 825 1814 744 ...
## $ DepDelay : int 0 -6 -1 -5 3 -2 -5 0 22 -1 ...
## $ DepDelayMinutes : int 0 0 0 0 3 0 0 0 22 0 ...
## $ DepDel15 : int 0 0 0 0 0 0 0 0 1 0 ...
## $ DepartureDelayGroups: int 0 -1 -1 -1 0 -1 -1 0 1 -1 ...
## $ DepTimeBlk : Factor w/ 19 levels "0001-0559","0600-0659",..: 12 14 12 4 10 16 5 4 13 3 ...
## $ TaxiOut : int 18 16 16 18 15 26 17 18 15 14 ...
## $ WheelsOff : int 1646 1840 1620 900 1418 2109 912 843 1829 758 ...
## $ WheelsOn : int 1728 1921 1757 901 1616 2135 951 917 1905 923 ...
## $ TaxiIn : int 4 13 5 7 5 5 14 2 6 6 ...
## $ CRSArrTime : int 1740 1949 1817 923 1621 2155 1010 940 1904 936 ...
## $ ArrTime : int 1732 1934 1802 908 1621 2140 1005 919 1911 929 ...
## $ ArrDelay : int -8 -15 -15 -15 0 -15 -5 -21 7 -7 ...
## $ ArrDelayMinutes : int 0 0 0 0 0 0 0 0 7 0 ...
## $ ArrDel15 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ArrivalDelayGroups : int -1 -1 -1 -1 0 -1 -1 -2 0 -1 ...
## $ ArrTimeBlk : Factor w/ 19 levels "0001-0559","0600-0659",..: 13 15 14 5 12 17 6 5 15 5 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : Factor w/ 4 levels "","A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CRSElapsedTime : int 72 79 72 96 81 250 70 255 132 111 ...
## $ Distan : int 64 70 58 86 78 237 70 234 117 105 ...
## $ AirTime : int 42 41 37 61 58 206 39 214 96 85 ...
## $ Distance : int 270 270 214 456 235 1587 214 1671 692 581 ...
## $ DistanceGroup : int 2 2 1 2 1 7 1 7 3 3 ...
## $ CarrierDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ WeatherDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ NASDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ SecurityDelay : int NA NA NA NA NA NA NA NA NA NA ...
## $ LateAircraftDelay : int NA NA NA NA NA NA NA NA NA NA ...
# print the first rows of the data frame (six by default)
head(mydata)
## Year Quarter Month DayofMonth DayOfWeek FlightDate UniqueCarrier TailNum
## 1 2014 4 10 23 4 10/23/2014 DL N530US
## 2 2014 4 10 23 4 10/23/2014 DL N530US
## 3 2014 4 10 21 2 10/21/2014 DL N959DN
## 4 2014 4 10 21 2 10/21/2014 DL N948DN
## 5 2014 4 10 21 2 10/21/2014 DL N926DL
## 6 2014 4 10 21 2 10/21/2014 DL N555NW
## FlightNum Origin OriginCityName OriginState OriginStateFips
## 1 1964 ATL Atlanta, GA GA 13
## 2 1964 JAX Jacksonville, FL FL 12
## 3 1899 BNA Nashville, TN TN 47
## 4 1906 DTW Detroit, MI MI 26
## 5 1908 ORD Chicago, IL IL 17
## 6 1915 ATL Atlanta, GA GA 13
## OriginStateName Dest DestCityName DestState DestStateFips
## 1 Georgia JAX Jacksonville, FL FL 12
## 2 Florida ATL Atlanta, GA GA 13
## 3 Tennessee ATL Atlanta, GA GA 13
## 4 Michigan BNA Nashville, TN TN 47
## 5 Illinois DTW Detroit, MI MI 26
## 6 Georgia PHX Phoenix, AZ AZ 4
## DestStateName CRSDepTime DepTime DepDelay DepDelayMinutes DepDel15
## 1 Florida 1628 1628 0 0 0
## 2 Georgia 1830 1824 -6 0 0
## 3 Georgia 1605 1604 -1 0 0
## 4 Tennessee 847 842 -5 0 0
## 5 Michigan 1400 1403 3 3 0
## 6 Arizona 2045 2043 -2 0 0
## DepartureDelayGroups DepTimeBlk TaxiOut WheelsOff WheelsOn TaxiIn
## 1 0 1600-1659 18 1646 1728 4
## 2 -1 1800-1859 16 1840 1921 13
## 3 -1 1600-1659 16 1620 1757 5
## 4 -1 0800-0859 18 900 901 7
## 5 0 1400-1459 15 1418 1616 5
## 6 -1 2000-2059 26 2109 2135 5
## CRSArrTime ArrTime ArrDelay ArrDelayMinutes ArrDel15 ArrivalDelayGroups
## 1 1740 1732 -8 0 0 -1
## 2 1949 1934 -15 0 0 -1
## 3 1817 1802 -15 0 0 -1
## 4 923 908 -15 0 0 -1
## 5 1621 1621 0 0 0 0
## 6 2155 2140 -15 0 0 -1
## ArrTimeBlk Cancelled CancellationCode Diverted CRSElapsedTime Distan
## 1 1700-1759 0 0 72 64
## 2 1900-1959 0 0 79 70
## 3 1800-1859 0 0 72 58
## 4 0900-0959 0 0 96 86
## 5 1600-1659 0 0 81 78
## 6 2100-2159 0 0 250 237
## AirTime Distance DistanceGroup CarrierDelay WeatherDelay NASDelay
## 1 42 270 2 NA NA NA
## 2 41 270 2 NA NA NA
## 3 37 214 1 NA NA NA
## 4 61 456 2 NA NA NA
## 5 58 235 1 NA NA NA
## 6 206 1587 7 NA NA NA
## SecurityDelay LateAircraftDelay
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
summary(mydata$ArrDelay) # min, quartiles, mean, max, and NA count for one variable
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -65.00 -11.00 -3.00 4.87 8.00 1308.00 606
Sometimes we want to use just a portion of a data table. Here we create a new object containing 4 of the 50 variables (as specified) for rows where ArrDelay > 15 minutes. This is similar to Select… Where in SQL.
# Name the columns to keep up front so the subset() call stays short.
varlist <- c("DepDelay", "ArrDelay", "AirTime", "Distance")
# Keep only the rows whose arrival delay exceeds 15 minutes, restricted to the
# columns listed in varlist (subset() also drops rows where ArrDelay is NA).
myvars <- subset(
  mydata,
  subset = ArrDelay > 15,
  select = varlist
)
str(myvars)
## 'data.frame': 8759 obs. of 4 variables:
## $ DepDelay: int -4 11 73 39 10 85 68 139 25 47 ...
## $ ArrDelay: int 28 28 64 42 18 67 67 134 68 31 ...
## $ AirTime : int 100 206 130 125 272 332 103 67 79 76 ...
## $ Distance: int 746 1399 1010 980 1990 2586 746 440 488 488 ...
This creates a more compact data table with fewer than 9000 rows. Suppose we want to build a regression model to estimate arrival delays. As standard practice in model-building with “big data”, we first split the large data frame into two partitions, a training sample and a testing sample:
# Partition the data frame into training and testing samples.
set.seed(1234)  # fix the random number generator so the split is reproducible
# "ind" holds one randomly drawn label (1 or 2) for each row of myvars. Each
# label is drawn independently with probability 0.6 for 1 and 0.4 for 2, so
# roughly 60% of the rows are tagged 1 and roughly 40% are tagged 2.
ind <- sample(
  x = 2,
  size = nrow(myvars),
  replace = TRUE,
  prob = c(0.6, 0.4)
)
# count how many rows received each label
table(ind)
## ind
## 1 2
## 5274 3485
# Rows tagged 1 in ind form the training sample "train"; rows tagged 2 form
# the testing sample "test". Both keep every column of myvars.
train <- myvars[ind == 1L, ]
test <- myvars[ind == 2L, ]
# row and column counts of the training sample
dim(train)
## [1] 5274 4
# row and column counts of the testing sample
dim(test)
## [1] 3485 4
Before creating models, let’s look at the correlations among the numeric variables. To help visualize the correlations, we’ll use the package called “corrplot”. Be sure to install the package before using library to call the package.
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.1.3
# Pull the four numeric columns in the order they should appear in the matrix.
tr <- train[, c("ArrDelay", "DepDelay", "AirTime", "Distance")]
# Pairwise Pearson correlations between those columns.
cm <- cor(tr, method = "pearson")
cm
## ArrDelay DepDelay AirTime Distance
## ArrDelay 1.00000000 0.95447643 -0.01176882 -0.01592223
## DepDelay 0.95447643 1.00000000 -0.05361880 -0.02675997
## AirTime -0.01176882 -0.05361880 1.00000000 0.98126974
## Distance -0.01592223 -0.02675997 0.98126974 1.00000000
# Draw the lower triangle of the correlation matrix using ellipse glyphs.
corrplot(cm, method = "ellipse", type = "lower")
Now we’ll estimate 2 linear regression models of arrival delay using the training data. We’ll choose the better model to re-run with the test data:
# Model 1: simple linear regression of arrival delay on departure delay,
# fitted to the training columns held in tr.
lm1 <- lm(ArrDelay ~ DepDelay, data = tr)
# coefficient table, residual quantiles, R-squared, and F-statistic
summary(lm1)
##
## Call:
## lm(formula = ArrDelay ~ DepDelay, data = tr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.263 -11.307 -3.147 8.465 148.188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.287218 0.308209 33.38 <2e-16 ***
## DepDelay 0.917482 0.003949 232.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.24 on 5272 degrees of freedom
## Multiple R-squared: 0.911, Adjusted R-squared: 0.911
## F-statistic: 5.398e+04 on 1 and 5272 DF, p-value: < 2.2e-16
# Model 2: adds air time as a second predictor alongside departure delay,
# fitted to the same training columns held in tr.
lm2 <- lm(ArrDelay ~ DepDelay + AirTime, data = tr)
# coefficient table, residual quantiles, R-squared, and F-statistic
summary(lm2)
##
## Call:
## lm(formula = ArrDelay ~ DepDelay + AirTime, data = tr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.764 -10.731 -2.599 8.044 143.304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.623830 0.486052 13.628 <2e-16 ***
## DepDelay 0.919519 0.003920 234.559 <2e-16 ***
## AirTime 0.031431 0.003243 9.691 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.09 on 5271 degrees of freedom
## Multiple R-squared: 0.9126, Adjusted R-squared: 0.9125
## F-statistic: 2.751e+04 on 2 and 5271 DF, p-value: < 2.2e-16
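The second model is the slightly better fit: its adjusted R-squared is higher (0.9125 vs. 0.911) and its residual standard error is lower (17.09 vs. 17.24). Now re-estimate it with the test data set and see if the quality of the model persists.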
# Refit the Model 2 specification on the held-out test rows so its
# coefficients and R-squared can be compared with the training fit.
# NOTE(review): this estimates a new model on the test data rather than
# scoring lm2 on it; predict(lm2, newdata = test) would measure
# out-of-sample error of the trained model directly.
lm2t <- lm(ArrDelay ~ DepDelay + AirTime, data = test)
# coefficient table, residual quantiles, R-squared, and F-statistic
summary(lm2t)
##
## Call:
## lm(formula = ArrDelay ~ DepDelay + AirTime, data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.112 -10.969 -3.005 8.049 124.828
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.336614 0.630047 11.645 < 2e-16 ***
## DepDelay 0.905136 0.005653 160.103 < 2e-16 ***
## AirTime 0.031856 0.004091 7.786 9.04e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.46 on 3482 degrees of freedom
## Multiple R-squared: 0.8805, Adjusted R-squared: 0.8804
## F-statistic: 1.283e+04 on 2 and 3482 DF, p-value: < 2.2e-16