library("knitr")
library("stringr")
flying <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/flying-etiquette-survey/flying-etiquette.csv", stringsAsFactors = FALSE)
flying <- flying[c(3,4, 23,24, 25,26)]
colnames(flying) <- c("Recline", "Height", "Gender", "Age", "Income", "Education")
#Convert Height in feet and inches into inches
for (rownum in 2:length(flying$Height)){
a <- unlist(str_extract_all(flying[rownum,2], "[[:digit:]]"))
a <- as.numeric(a)
if (length(a)==2) {
flying[rownum,2] = a[1]*12 + a[2]
}
else {
flying[rownum,2] = a[1]*12 + a[2]*10 + a[3]
}
}
flying$Height <- as.numeric(flying$Height)
#Create Dummy Variable for Gender - Male = 1; Female = 0
dummyMF <- NULL
dummyMF[flying$Gender == 'Male'] = 1
dummyMF[flying$Gender == 'Female'] = 0
#Create Dummy Variable for Recline - Always = 4; Usually = 3; About half the time = 2; Once in a while = 1; Never = 0
dummyRecl <- NULL
dummyRecl[flying$Recline == 'Always'] = 4
dummyRecl[flying$Recline == 'Usually'] = 3
dummyRecl[flying$Recline == 'About half the time'] = 2
dummyRecl[flying$Recline == 'Once in a while'] = 1
dummyRecl[flying$Recline == 'Never'] = 0
#Taking the average age in an interval to be the age in the interval
dummyAge <- NULL
dummyAge[flying$Age == '18-29'] = 23.5
dummyAge[flying$Age == '30-44'] = 37
dummyAge[flying$Age == '45-60'] = 52.5
dummyAge[flying$Age == '> 60'] = 70
#Taking the average income in an interval to be the income value for the interval
dummyInc <- NULL
dummyInc[flying$Income == '150000'] = 150000
dummyInc[flying$Income == '$100,000 - $149,999'] = 125000
dummyInc[flying$Income == '$50,000 - $99,999'] = 75000
dummyInc[flying$Income == '$25,000 - $49,999'] = 375000
dummyInc[flying$Income == '$0 - $24,999'] = 15000
#Create Dummy Variable for Education - Graduate Degree = 4; Bachelor Degree =3; Some college or Associate degree = 2; High school degree = 1; Less than high school degree =0
dummyEd <- NULL
dummyEd[flying$Education == 'Graduate degree'] = 4
dummyEd[flying$Education == 'Bachelor degree'] = 3
dummyEd[flying$Education == 'Some college or Associate degree'] = 2
dummyEd[flying$Education == 'High school degree'] = 1
dummyEd[flying$Education == 'Less than high school degree'] = 0
flying <- cbind(flying, dummyMF, dummyRecl, dummyAge, dummyInc, dummyEd)
head(flying)
## Recline Height Gender Age Income
## 1 NA
## 2 About half the time 75 Male 30-44
## 3 Usually 68 Male 30-44 $100,000 - $149,999
## 4 Always 71 Male 30-44 $0 - $24,999
## 5 About half the time 67 Male 30-44 $50,000 - $99,999
## 6 Usually 69 Male 30-44 $25,000 - $49,999
## Education dummyMF dummyRecl dummyAge dummyInc dummyEd
## 1 NA NA NA NA NA
## 2 Graduate degree 1 2 37 NA 4
## 3 Bachelor degree 1 3 37 125000 3
## 4 Bachelor degree 1 4 37 15000 3
## 5 Bachelor degree 1 2 37 75000 3
## 6 Graduate degree 1 3 37 375000 4
flyingdf <- flying[c(2, 7,8, 9,10,11)]
pairs(flyingdf, gap=.5)
flyinglm <- lm(dummyRecl ~ dummyMF + Height + dummyAge + dummyEd + dummyInc, data=flyingdf)
summary(flyinglm)
##
## Call:
## lm(formula = dummyRecl ~ dummyMF + Height + dummyAge + dummyEd +
## dummyInc, data = flyingdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10028 -1.01133 -0.03005 1.14457 2.31016
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.851e+00 1.256e+00 1.474 0.1411
## dummyMF 3.093e-02 1.446e-01 0.214 0.8306
## Height -4.874e-03 1.901e-02 -0.256 0.7977
## dummyAge 6.717e-03 3.270e-03 2.054 0.0404 *
## dummyEd -8.871e-04 5.771e-02 -0.015 0.9877
## dummyInc 2.405e-07 4.706e-07 0.511 0.6094
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.387 on 680 degrees of freedom
## (354 observations deleted due to missingness)
## Multiple R-squared: 0.006932, Adjusted R-squared: -0.0003705
## F-statistic: 0.9493 on 5 and 680 DF, p-value: 0.4484
#Income is the least significant variable so I will remove it
flyinglm <- update(flyinglm, .~. -dummyMF, data = flyingdf)
summary(flyinglm)
##
## Call:
## lm(formula = dummyRecl ~ Height + dummyAge + dummyEd + dummyInc,
## data = flyingdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10299 -1.01825 -0.03015 1.14057 2.31359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.680e+00 9.666e-01 1.738 0.0827 .
## Height -2.107e-03 1.392e-02 -0.151 0.8797
## dummyAge 6.721e-03 3.268e-03 2.057 0.0401 *
## dummyEd -9.785e-04 5.767e-02 -0.017 0.9865
## dummyInc 2.395e-07 4.702e-07 0.509 0.6107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.386 on 681 degrees of freedom
## (354 observations deleted due to missingness)
## Multiple R-squared: 0.006865, Adjusted R-squared: 0.001031
## F-statistic: 1.177 on 4 and 681 DF, p-value: 0.3197
#Gender is the least significant variable so I will remove it
flyinglm <- update(flyinglm, .~. -Height, data = flyingdf)
summary(flyinglm)
##
## Call:
## lm(formula = dummyRecl ~ dummyAge + dummyEd + dummyInc, data = flyingdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.08133 -1.01844 -0.02266 1.14928 2.29029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.563e+00 2.214e-01 7.062 4e-12 ***
## dummyAge 6.466e-03 3.256e-03 1.986 0.0475 *
## dummyEd -2.812e-03 5.702e-02 -0.049 0.9607
## dummyInc 1.815e-07 4.658e-07 0.390 0.6969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.385 on 692 degrees of freedom
## (344 observations deleted due to missingness)
## Multiple R-squared: 0.006098, Adjusted R-squared: 0.001789
## F-statistic: 1.415 on 3 and 692 DF, p-value: 0.2371
#Height is the least significant variable so I will remove it
flyinglm <- update(flyinglm, .~. -dummyInc, data = flyingdf)
summary(flyinglm)
##
## Call:
## lm(formula = dummyRecl ~ dummyAge + dummyEd, data = flyingdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.93963 -0.93195 0.06293 1.21549 2.28663
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.613152 0.181122 8.906 <2e-16 ***
## dummyAge 0.004700 0.002918 1.611 0.108
## dummyEd -0.002560 0.050924 -0.050 0.960
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.377 on 834 degrees of freedom
## (203 observations deleted due to missingness)
## Multiple R-squared: 0.003248, Adjusted R-squared: 0.0008581
## F-statistic: 1.359 on 2 and 834 DF, p-value: 0.2575
#Education is the least significant variable so I will remove it
flyinglm <- update(flyinglm, .~. -dummyEd, data = flyingdf)
summary(flyinglm)
##
## Call:
## lm(formula = dummyRecl ~ dummyAge, data = flyingdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.93628 -0.93628 0.06372 1.18501 2.28709
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.600022 0.140579 11.382 <2e-16 ***
## dummyAge 0.004804 0.002818 1.704 0.0887 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.375 on 841 degrees of freedom
## (197 observations deleted due to missingness)
## Multiple R-squared: 0.003442, Adjusted R-squared: 0.002257
## F-statistic: 2.905 on 1 and 841 DF, p-value: 0.08867
recline = 1.15 + 0.044*Age
This is a terrible model. I had thought there would be a relationship between height and the likelihood someone is going to recline on an airplane. That is not the case! The adjusted R squared value is .00436. However the p value does show a significant relationship between age and whether a person is likely to recline in his or her seat. Because I was trying to predict a categorical variable by making a dummy variable for it, the ability to predict using the linear regression model, was not appropriate.
plot(fitted(flyinglm),resid(flyinglm))
qqnorm(resid(flyinglm))
qqline(resid(flyinglm))