Flying Etiquette

The following data of flying etiquette is from a survey done by 538

I will try to predict the likelihood someone will recline in his or her seat on an airplane

library("knitr")
library("stringr")
flying <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/flying-etiquette-survey/flying-etiquette.csv", stringsAsFactors = FALSE)
flying <- flying[c(3,4, 23,24, 25,26)]
colnames(flying) <- c("Recline", "Height", "Gender", "Age", "Income", "Education")

#Convert Height in feet and inches into inches
for (rownum in 2:length(flying$Height)){
  a <- unlist(str_extract_all(flying[rownum,2], "[[:digit:]]"))
  a <- as.numeric(a)
  if (length(a)==2) {
    flying[rownum,2] = a[1]*12 + a[2]
  }
  else {
    flying[rownum,2] = a[1]*12 + a[2]*10 + a[3]
  } 
}  
flying$Height <- as.numeric(flying$Height)

#Create Dummy Variable for Gender - Male = 1; Female = 0
dummyMF <- NULL
dummyMF[flying$Gender == 'Male'] = 1
dummyMF[flying$Gender == 'Female'] = 0

#Create Dummy Variable for Recline - Always = 4; Usually = 3; About half the time = 2; Once in a while = 1; Never = 0
dummyRecl <- NULL
dummyRecl[flying$Recline == 'Always'] = 4
dummyRecl[flying$Recline == 'Usually'] = 3
dummyRecl[flying$Recline == 'About half the time'] = 2
dummyRecl[flying$Recline == 'Once in a while'] = 1
dummyRecl[flying$Recline == 'Never'] = 0

#Taking the average age in an interval to be the age in the interval
dummyAge <- NULL
dummyAge[flying$Age == '18-29'] = 23.5
dummyAge[flying$Age == '30-44'] = 37
dummyAge[flying$Age == '45-60'] = 52.5
dummyAge[flying$Age == '> 60'] = 70

#Taking the average income in an interval to be the income value for the interval
dummyInc <- NULL
dummyInc[flying$Income == '150000'] = 150000
dummyInc[flying$Income == '$100,000 - $149,999'] = 125000
dummyInc[flying$Income == '$50,000 - $99,999'] = 75000
dummyInc[flying$Income == '$25,000 - $49,999'] = 375000
dummyInc[flying$Income == '$0 - $24,999'] = 15000

#Create Dummy Variable for Education - Graduate Degree = 4; Bachelor Degree =3; Some college or Associate degree = 2; High school degree = 1; Less than high school degree =0
dummyEd <- NULL
dummyEd[flying$Education == 'Graduate degree'] = 4
dummyEd[flying$Education == 'Bachelor degree'] = 3
dummyEd[flying$Education == 'Some college or Associate degree'] = 2
dummyEd[flying$Education == 'High school degree'] = 1
dummyEd[flying$Education == 'Less than high school degree'] = 0

flying <- cbind(flying, dummyMF, dummyRecl, dummyAge, dummyInc, dummyEd)
head(flying)
##               Recline Height Gender   Age              Income
## 1                         NA                                 
## 2 About half the time     75   Male 30-44                    
## 3             Usually     68   Male 30-44 $100,000 - $149,999
## 4              Always     71   Male 30-44        $0 - $24,999
## 5 About half the time     67   Male 30-44   $50,000 - $99,999
## 6             Usually     69   Male 30-44   $25,000 - $49,999
##         Education dummyMF dummyRecl dummyAge dummyInc dummyEd
## 1                      NA        NA       NA       NA      NA
## 2 Graduate degree       1         2       37       NA       4
## 3 Bachelor degree       1         3       37   125000       3
## 4 Bachelor degree       1         4       37    15000       3
## 5 Bachelor degree       1         2       37    75000       3
## 6 Graduate degree       1         3       37   375000       4

Relationship between variables

flyingdf <- flying[c(2, 7,8, 9,10,11)]
pairs(flyingdf, gap=.5)

Backward Elimination

flyinglm <- lm(dummyRecl ~ dummyMF + Height + dummyAge + dummyEd + dummyInc, data=flyingdf)
summary(flyinglm)
## 
## Call:
## lm(formula = dummyRecl ~ dummyMF + Height + dummyAge + dummyEd + 
##     dummyInc, data = flyingdf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10028 -1.01133 -0.03005  1.14457  2.31016 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.851e+00  1.256e+00   1.474   0.1411  
## dummyMF      3.093e-02  1.446e-01   0.214   0.8306  
## Height      -4.874e-03  1.901e-02  -0.256   0.7977  
## dummyAge     6.717e-03  3.270e-03   2.054   0.0404 *
## dummyEd     -8.871e-04  5.771e-02  -0.015   0.9877  
## dummyInc     2.405e-07  4.706e-07   0.511   0.6094  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.387 on 680 degrees of freedom
##   (354 observations deleted due to missingness)
## Multiple R-squared:  0.006932,   Adjusted R-squared:  -0.0003705 
## F-statistic: 0.9493 on 5 and 680 DF,  p-value: 0.4484
#Income is the least significant variable so I will remove it
flyinglm <- update(flyinglm, .~. -dummyMF, data = flyingdf)
summary(flyinglm)
## 
## Call:
## lm(formula = dummyRecl ~ Height + dummyAge + dummyEd + dummyInc, 
##     data = flyingdf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10299 -1.01825 -0.03015  1.14057  2.31359 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.680e+00  9.666e-01   1.738   0.0827 .
## Height      -2.107e-03  1.392e-02  -0.151   0.8797  
## dummyAge     6.721e-03  3.268e-03   2.057   0.0401 *
## dummyEd     -9.785e-04  5.767e-02  -0.017   0.9865  
## dummyInc     2.395e-07  4.702e-07   0.509   0.6107  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.386 on 681 degrees of freedom
##   (354 observations deleted due to missingness)
## Multiple R-squared:  0.006865,   Adjusted R-squared:  0.001031 
## F-statistic: 1.177 on 4 and 681 DF,  p-value: 0.3197
#Gender is the least significant variable so I will remove it
flyinglm <- update(flyinglm, .~. -Height, data = flyingdf)
summary(flyinglm)
## 
## Call:
## lm(formula = dummyRecl ~ dummyAge + dummyEd + dummyInc, data = flyingdf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.08133 -1.01844 -0.02266  1.14928  2.29029 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.563e+00  2.214e-01   7.062    4e-12 ***
## dummyAge     6.466e-03  3.256e-03   1.986   0.0475 *  
## dummyEd     -2.812e-03  5.702e-02  -0.049   0.9607    
## dummyInc     1.815e-07  4.658e-07   0.390   0.6969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.385 on 692 degrees of freedom
##   (344 observations deleted due to missingness)
## Multiple R-squared:  0.006098,   Adjusted R-squared:  0.001789 
## F-statistic: 1.415 on 3 and 692 DF,  p-value: 0.2371
#Height is the least significant variable so I will remove it
flyinglm <- update(flyinglm, .~. -dummyInc, data = flyingdf)
summary(flyinglm)
## 
## Call:
## lm(formula = dummyRecl ~ dummyAge + dummyEd, data = flyingdf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.93963 -0.93195  0.06293  1.21549  2.28663 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.613152   0.181122   8.906   <2e-16 ***
## dummyAge     0.004700   0.002918   1.611    0.108    
## dummyEd     -0.002560   0.050924  -0.050    0.960    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.377 on 834 degrees of freedom
##   (203 observations deleted due to missingness)
## Multiple R-squared:  0.003248,   Adjusted R-squared:  0.0008581 
## F-statistic: 1.359 on 2 and 834 DF,  p-value: 0.2575
#Education is the least significant variable so I will remove it
flyinglm <- update(flyinglm, .~. -dummyEd, data = flyingdf)
summary(flyinglm)
## 
## Call:
## lm(formula = dummyRecl ~ dummyAge, data = flyingdf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.93628 -0.93628  0.06372  1.18501  2.28709 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.600022   0.140579  11.382   <2e-16 ***
## dummyAge    0.004804   0.002818   1.704   0.0887 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.375 on 841 degrees of freedom
##   (197 observations deleted due to missingness)
## Multiple R-squared:  0.003442,   Adjusted R-squared:  0.002257 
## F-statistic: 2.905 on 1 and 841 DF,  p-value: 0.08867

recline = 1.15 + 0.044*Age
This is a terrible model. I had thought there would be a relationship between height and the likelihood someone is going to recline on an airplane. That is not the case! The adjusted R squared value is .00436. However the p value does show a significant relationship between age and whether a person is likely to recline in his or her seat. Because I was trying to predict a categorical variable by making a dummy variable for it, the ability to predict using the linear regression model, was not appropriate.

Residual Analysis

plot(fitted(flyinglm),resid(flyinglm))

qqnorm(resid(flyinglm))
qqline(resid(flyinglm))