> summary(airline)
Satisfaction Status Age Gender No.Flights
Min. :1.000 Blue :1304 Min. :15.00 Female:1035 Min. : 1.00
1st Qu.:3.000 Gold : 161 1st Qu.:34.00 Male : 862 1st Qu.:10.00
Median :4.000 Platinum: 57 Median :46.00 Median :18.00
Mean :3.396 Silver : 375 Mean :46.92 Mean :20.82
3rd Qu.:4.000 3rd Qu.:59.00 3rd Qu.:29.00
Max. :5.000 Max. :85.00 Max. :84.00
Type Shopping Eating Class
Business travel:1181 Min. : 0.00 Min. : 0.0 Business: 152
Mileage tickets: 153 1st Qu.: 0.00 1st Qu.: 30.0 Eco :1552
Personal Travel: 563 Median : 0.00 Median : 60.0 Eco Plus: 193
Mean : 26.22 Mean : 68.5
3rd Qu.: 30.00 3rd Qu.: 90.0
Max. :364.00 Max. :440.0
Dep.Delay Arr.Delay Cancel Time Distance
Min. : 0.00 Min. : 0.00 No :1860 Min. : 17.00 Min. : 67.0
1st Qu.: 0.00 1st Qu.: 0.00 Yes: 37 1st Qu.: 32.00 1st Qu.: 158.0
Median : 0.00 Median : 0.00 Median : 48.00 Median : 282.0
Mean : 11.67 Mean : 13.39 Mean : 56.25 Mean : 352.3
3rd Qu.: 9.00 3rd Qu.: 12.00 3rd Qu.: 64.25 3rd Qu.: 408.0
Max. :423.00 Max. :417.00 Max. :169.00 Max. :1250.0
NA's :34 NA's :37 NA's :37
> table(is.na(airline))
FALSE TRUE
26450 108
> # Mean-impute missing values, column by column, in the numeric columns listed
> # below; each NA is replaced by the mean of that column's observed values.
> na_cols <- c("Dep.Delay", "Arr.Delay", "Time", "No.Flights")
> airline[na_cols] <- lapply(airline[na_cols], function(v) {
+   replace(v, is.na(v), mean(v, na.rm = TRUE))
+ })
> table(is.na(airline))
FALSE
26558
> library(caTools)
Warning: package 'caTools' was built under R version 4.1.3
> # Split the rows 70/30 into training and test sets.
> # sample.split() expects a vector with one label per observation; passing the
> # whole data frame produced one flag per column instead of one per row, so the
> # response used by the models in this session (Satisfaction) is passed here.
> split <- sample.split(airline$Satisfaction, SplitRatio = 0.7)
> # `split` is a logical mask, so it is used directly. Comparing it with the
> # strings 'True'/'False' coerces TRUE/FALSE to "TRUE"/"FALSE", which never
> # match, and that is why both subsets came back with 0 rows.
> # NOTE: the `<0 rows>` listings printed below were produced by the earlier
> # string comparison; re-run the session to refresh them.
> training_set <- subset(airline, split)
> test_set <- subset(airline, !split)
> training_set
[1] Satisfaction Status Age Gender No.Flights
[6] Type Shopping Eating Class Dep.Delay
[11] Arr.Delay Cancel Time Distance
<0 rows> (or 0-length row.names)
> test_set
[1] Satisfaction Status Age Gender No.Flights
[6] Type Shopping Eating Class Dep.Delay
[11] Arr.Delay Cancel Time Distance
<0 rows> (or 0-length row.names)
> head(df1, n = 5)
Satisfaction Status Age Gender No.Flights Type Shopping Eating
1 5 Silver 32 Male 1 Business travel 10 46
2 2 Blue 66 Male 1 Personal Travel 255 335
3 3 Blue 28 Female 1 Business travel 30 70
4 3 Blue 50 Female 1 Personal Travel 15 60
5 4 Silver 46 Female 1 Business travel 120 150
Class Dep.Delay Arr.Delay Cancel Time Distance
1 Eco 66 61 No 43 227
2 Eco 55 59 No 44 227
3 Eco 15 21 No 27 140
4 Eco 10 2 No 97 772
5 Eco 2 16 No 113 987
> summary(df1)
Satisfaction Status Age Gender
Min. :1.000 Length:241 Min. :15.00 Length:241
1st Qu.:2.000 Class :character 1st Qu.:36.00 Class :character
Median :3.000 Mode :character Median :46.00 Mode :character
Mean :3.344 Mean :48.27
3rd Qu.:4.000 3rd Qu.:62.00
Max. :5.000 Max. :85.00
No.Flights Type Shopping Eating
Min. : 1.00 Length:241 Min. : 2.00 Min. : 8.00
1st Qu.:10.00 Class :character 1st Qu.: 20.00 1st Qu.: 40.00
Median :19.00 Mode :character Median : 45.00 Median : 65.00
Mean :19.44 Mean : 60.85 Mean : 79.44
3rd Qu.:26.00 3rd Qu.: 85.00 3rd Qu.:105.00
Max. :58.00 Max. :344.00 Max. :390.00
Class Dep.Delay Arr.Delay Cancel
Length:241 Min. : 1.00 Min. : 1.00 Length:241
Class :character 1st Qu.: 11.00 1st Qu.: 12.00 Class :character
Mode :character Median : 27.00 Median : 26.00 Mode :character
Mean : 37.11 Mean : 40.07
3rd Qu.: 47.00 3rd Qu.: 52.00
Max. :212.00 Max. :293.00
Time Distance
Min. : 18.00 Min. : 89.0
1st Qu.: 37.00 1st Qu.: 164.0
Median : 49.00 Median : 293.0
Mean : 58.84 Mean : 368.8
3rd Qu.: 72.00 3rd Qu.: 482.0
Max. :144.00 Max. :1250.0
> with(df1, Hist(Satisfaction, scale="frequency", breaks=3,
+ col="tomato"))
> normalityTest(~Satisfaction, test="shapiro.test", data=df1)
Shapiro-Wilk normality test
data: Satisfaction
W = 0.90087, p-value = 1.622e-11
> skewness(df1$Satisfaction)
[1] -0.1208948
> powerTransform(df1$Satisfaction)
Estimated transformation parameter
df1$Satisfaction
1.008903
> Sat <- (df1$Satisfaction)^1.2
> skewness(Sat)
[1] -0.007416673
> hist(Sat, breaks = 3, col = 'tomato')
> with(df1, Barplot(Type, xlab="Type", ylab="Frequency", label.bars=TRUE, col="plum"))
> with(df1, Barplot(Status, xlab="Status", ylab="Frequency",
+ label.bars=TRUE, col="plum"))
> with(df1, Barplot(Gender, xlab="Gender", ylab="Frequency",
+ label.bars=TRUE, col="plum"))
> with(df1, Barplot(Class, xlab="Class", ylab="Frequency",
+ label.bars=TRUE, col="plum"))
> Boxplot( ~ Distance, data=df1, id=list(method="y"), col="palegreen")
[1] "5" "17" "48" "72" "147" "161" "206"
> Boxplot( ~ Time, data=df1, id=list(method="y"), col="palegreen")
[1] "27" "35" "48" "49" "72" "91" "161" "178" "206"
> Boxplot( ~ Arr.Delay, data=df1, id=list(method="y"), col="palegreen")
[1] "199" "218" "23" "134" "22" "109" "130" "189" "229" "64"
> Boxplot( ~ Dep.Delay, data=df1, id=list(method="y"), col="palegreen")
[1] "134" "22" "199" "130" "218" "23" "109" "189" "229" "42"
> Boxplot( ~ Eating, data=df1, id=list(method="y"), col="palegreen")
[1] "2" "34" "43" "79" "133" "158" "196" "222" "230"
> Boxplot( ~ Shopping, data=df1, id=list(method="y"), col="palegreen")
[1] "13" "123" "7" "2" "119" "103" "127" "154" "85" "23"
> scatterplot(Time~Distance, regLine=FALSE, smooth=FALSE, boxplots=FALSE,
+ data=df1)
> scatterplot(Arr.Delay~Distance, regLine=FALSE, smooth=FALSE, boxplots=FALSE,
+ data=df1)
> scatterplot(Eating~Age, regLine=FALSE, smooth=FALSE, boxplots=FALSE,
+ data=df1)
> scatterplot(Shopping~Age, regLine=FALSE, smooth=FALSE, boxplots=FALSE,
+ data=df1)
> # Pearson correlation matrix of the numeric columns, computed on rows with
> # no missing values in any of them.
> num_vars <- c("Age", "Arr.Delay", "Dep.Delay", "Distance", "Eating",
+               "No.Flights", "Satisfaction", "Shopping", "Time")
> cor(df1[, num_vars], use = "complete.obs")
Age Arr.Delay Dep.Delay Distance Eating
Age 1.000000000 0.02860769 0.01428243 0.10304631 0.128048283
Arr.Delay 0.028607686 1.00000000 0.94982706 -0.02218009 0.040647718
Dep.Delay 0.014282427 0.94982706 1.00000000 -0.08909600 0.043504654
Distance 0.103046310 -0.02218009 -0.08909600 1.00000000 0.001800230
Eating 0.128048283 0.04064772 0.04350465 0.00180023 1.000000000
No.Flights 0.262497538 -0.03899427 -0.03284574 -0.10728595 -0.056043596
Satisfaction -0.388650992 -0.11575043 -0.09016724 -0.08272827 0.087443017
Shopping 0.007733124 0.08167980 0.09208845 0.01222653 0.100650875
Time 0.134453426 0.06785729 -0.03419199 0.96282207 0.004430232
No.Flights Satisfaction Shopping Time
Age 0.26249754 -0.388650992 0.007733124 0.134453426
Arr.Delay -0.03899427 -0.115750433 0.081679803 0.067857290
Dep.Delay -0.03284574 -0.090167239 0.092088452 -0.034191989
Distance -0.10728595 -0.082728268 0.012226529 0.962822068
Eating -0.05604360 0.087443017 0.100650875 0.004430232
No.Flights 1.00000000 -0.230368647 -0.161056860 -0.108591501
Satisfaction -0.23036865 1.000000000 -0.001115951 -0.117144434
Shopping -0.16105686 -0.001115951 1.000000000 0.013178219
Time -0.10859150 -0.117144434 0.013178219 1.000000000
> # 0/1 indicator columns for the categorical predictors Class, Gender, Status
> # and Type; a row gets 1 when its value equals the named level, 0 otherwise.
> df1$ifBusinessClass <- as.numeric(df1$Class == "Business")
> df1$ifEcoClass <- as.numeric(df1$Class == "Eco")
> df1$ifFemale <- as.numeric(df1$Gender == "Female")
> df1$ifSilver <- as.numeric(df1$Status == "Silver")
> df1$ifGold <- as.numeric(df1$Status == "Gold")
> df1$ifPlatinum <- as.numeric(df1$Status == "Platinum")
> df1$ifBusinessTravel <- as.numeric(df1$Type == "Business travel")
> df1$ifPersonalTravel <- as.numeric(df1$Type == "Personal Travel")
> RegModel.1 <-
+ lm(Satisfaction~ifBusinessClass+ifBusinessTravel+ifEcoClass+ifFemale+ifGold+ifPersonalTravel+ifPlatinum+ifSilver,
+ data=df1)
> summary(RegModel.1)
Call:
lm(formula = Satisfaction ~ ifBusinessClass + ifBusinessTravel +
ifEcoClass + ifFemale + ifGold + ifPersonalTravel + ifPlatinum +
ifSilver, data = df1)
Residuals:
Min 1Q Median 3Q Max
-2.37695 -0.37695 -0.02631 0.47610 2.97369
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.5183 0.2940 11.966 < 2e-16 ***
ifBusinessClass 0.1595 0.2279 0.700 0.48487
ifBusinessTravel 0.2341 0.2444 0.958 0.33923
ifEcoClass -0.2285 0.1594 -1.433 0.15315
ifFemale -0.1469 0.1029 -1.428 0.15462
ifGold 0.5057 0.1679 3.012 0.00288 **
ifPersonalTravel -1.1165 0.2537 -4.401 1.64e-05 ***
ifPlatinum 1.1198 0.2443 4.584 7.47e-06 ***
ifSilver 0.6734 0.1260 5.343 2.18e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7747 on 232 degrees of freedom
Multiple R-squared: 0.5035, Adjusted R-squared: 0.4864
F-statistic: 29.41 on 8 and 232 DF, p-value: < 2.2e-16
> confint(RegModel.1)
2.5 % 97.5 %
(Intercept) 2.9389833 4.09754747
ifBusinessClass -0.2895953 0.60850060
ifBusinessTravel -0.2475090 0.71568691
ifEcoClass -0.5425296 0.08561130
ifFemale -0.3496712 0.05578752
ifGold 0.1748751 0.83647243
ifPersonalTravel -1.6163612 -0.61673859
ifPlatinum 0.6384887 1.60118727
ifSilver 0.4251043 0.92178696
> RegModel.3 <-
+ lm(Satisfaction~Age+Arr.Delay+Dep.Delay+Distance+Eating+ifBusinessClass+ifBusinessTravel+ifEcoClass+ifFemale+ifGold+ifPersonalTravel+ifPlatinum+ifSilver+No.Flights+Shopping+Time,
+ data=df1)
> summary(RegModel.3)
Call:
lm(formula = Satisfaction ~ Age + Arr.Delay + Dep.Delay + Distance +
Eating + ifBusinessClass + ifBusinessTravel + ifEcoClass +
ifFemale + ifGold + ifPersonalTravel + ifPlatinum + ifSilver +
No.Flights + Shopping + Time, data = df1)
Residuals:
Min 1Q Median 3Q Max
-2.2788 -0.4389 0.0099 0.4647 2.4056
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.129e+00 3.497e-01 11.806 < 2e-16 ***
Age -1.104e-02 3.358e-03 -3.289 0.001169 **
Arr.Delay -6.424e-03 4.169e-03 -1.541 0.124802
Dep.Delay 5.722e-03 4.701e-03 1.217 0.224788
Distance 9.223e-07 8.112e-04 0.001 0.999094
Eating 1.949e-03 9.004e-04 2.164 0.031491 *
ifBusinessClass 2.134e-01 2.214e-01 0.964 0.336066
ifBusinessTravel 1.835e-01 2.417e-01 0.759 0.448720
ifEcoClass -2.388e-01 1.539e-01 -1.551 0.122236
ifFemale -1.212e-01 1.038e-01 -1.168 0.244164
ifGold 5.956e-01 1.659e-01 3.589 0.000407 ***
ifPersonalTravel -1.005e+00 2.576e-01 -3.899 0.000127 ***
ifPlatinum 1.008e+00 2.400e-01 4.198 3.89e-05 ***
ifSilver 6.456e-01 1.246e-01 5.179 4.96e-07 ***
No.Flights -2.427e-03 4.533e-03 -0.535 0.592837
Shopping 3.355e-05 8.584e-04 0.039 0.968864
Time -2.574e-03 6.787e-03 -0.379 0.704901
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7456 on 224 degrees of freedom
Multiple R-squared: 0.5559, Adjusted R-squared: 0.5242
F-statistic: 17.52 on 16 and 224 DF, p-value: < 2.2e-16
> RegModel.4 <-
+ lm(Satisfaction~Age+Arr.Delay+Distance+Eating+ifGold+ifPersonalTravel+Shopping+Time,
+ data=df1)
> summary(RegModel.4)
Call:
lm(formula = Satisfaction ~ Age + Arr.Delay + Distance + Eating +
ifGold + ifPersonalTravel + Shopping + Time, data = df1)
Residuals:
Min 1Q Median 3Q Max
-2.61440 -0.51768 0.05178 0.46480 2.32185
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.283e+00 2.071e-01 20.686 < 2e-16 ***
Age -1.358e-02 3.525e-03 -3.854 0.00015 ***
Arr.Delay -1.482e-03 1.254e-03 -1.181 0.23870
Distance 6.001e-04 8.111e-04 0.740 0.46013
Eating 3.072e-03 9.467e-04 3.245 0.00135 **
ifGold 3.819e-01 1.738e-01 2.197 0.02899 *
ifPersonalTravel -1.302e+00 1.242e-01 -10.487 < 2e-16 ***
Shopping 7.112e-05 9.024e-04 0.079 0.93725
Time -5.940e-03 6.609e-03 -0.899 0.36968
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.8108 on 232 degrees of freedom
Multiple R-squared: 0.456, Adjusted R-squared: 0.4373
F-statistic: 24.31 on 8 and 232 DF, p-value: < 2.2e-16
> best <- ols_step_best_subset(RegModel.4)
> best
Best Subsets Regression
----------------------------------------------------------------------------------
Model Index Predictors
----------------------------------------------------------------------------------
1 ifPersonalTravel
2 Age ifPersonalTravel
3 Age Eating ifPersonalTravel
4 Age Eating ifGold ifPersonalTravel
5 Age Arr.Delay Eating ifGold ifPersonalTravel
6 Age Arr.Delay Eating ifGold ifPersonalTravel Time
7 Age Arr.Delay Distance Eating ifGold ifPersonalTravel Time
8 Age Arr.Delay Distance Eating ifGold ifPersonalTravel Shopping Time
----------------------------------------------------------------------------------
Subsets Regression Summary
----------------------------------------------------------------------------------------------------------------------------------
Adj. Pred
Model R-Square R-Square R-Square C(p) AIC SBIC SBC MSEP FPE HSP APC
----------------------------------------------------------------------------------------------------------------------------------
1 0.3842 0.3816 0.374 25.6549 609.6019 -74.6698 620.0563 174.1341 0.7285 0.0030 0.6261
2 0.4135 0.4086 0.398 15.1250 599.8217 -84.3282 613.7609 166.5267 0.6996 0.0029 0.6012
3 0.4389 0.4318 0.418 6.3245 591.1840 -92.6885 608.6080 160.0113 0.6749 0.0028 0.5801
4 0.4480 0.4387 0.422 4.4167 589.2163 -94.4761 610.1250 158.0682 0.6695 0.0028 0.5754
5 0.4535 0.4419 0.424 4.0746 588.8067 -94.7160 613.2002 157.1645 0.6683 0.0028 0.5744
6 0.4547 0.4408 0.422 5.5574 590.2712 -93.1515 618.1496 157.4887 0.6724 0.0028 0.5779
7 0.4560 0.4397 0.418 7.0062 591.6994 -91.6118 623.0626 157.7927 0.6764 0.0028 0.5813
8 0.4560 0.4373 0.413 9.0000 593.6929 -89.5402 628.5409 158.4716 0.6820 0.0028 0.5862
----------------------------------------------------------------------------------------------------------------------------------
AIC: Akaike Information Criteria
SBIC: Sawa's Bayesian Information Criteria
SBC: Schwarz Bayesian Criteria
MSEP: Estimated error of prediction, assuming multivariate normality
FPE: Final Prediction Error
HSP: Hocking's Sp
APC: Amemiya Prediction Criteria
> plot(best)
> RegModel.5 <-
+ lm(Satisfaction~Dep.Delay+ifBusinessClass+ifBusinessTravel+ifEcoClass+ifFemale+ifPlatinum+ifSilver+No.Flights,
+ data=df1)
> summary(RegModel.5)
Call:
lm(formula = Satisfaction ~ Dep.Delay + ifBusinessClass + ifBusinessTravel +
ifEcoClass + ifFemale + ifPlatinum + ifSilver + No.Flights,
data = df1)
Residuals:
Min 1Q Median 3Q Max
-2.5086 -0.4495 -0.0388 0.4823 2.7510
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.778997 0.234208 11.87 < 2e-16 ***
Dep.Delay -0.000897 0.001393 -0.64 0.52018
ifBusinessClass 0.233884 0.239246 0.98 0.32930
ifBusinessTravel 1.152673 0.116600 9.89 < 2e-16 ***
ifEcoClass -0.145256 0.166937 -0.87 0.38513
ifFemale -0.176991 0.108978 -1.62 0.10571
ifPlatinum 1.003483 0.257096 3.90 0.00012 ***
ifSilver 0.629273 0.129992 4.84 2.4e-06 ***
No.Flights -0.007755 0.004678 -1.66 0.09876 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.813 on 232 degrees of freedom
Multiple R-squared: 0.454, Adjusted R-squared: 0.435
F-statistic: 24.1 on 8 and 232 DF, p-value: <2e-16
> best2 <- ols_step_best_subset(RegModel.5)
> best2
Best Subsets Regression
------------------------------------------------------------------------------------------------------------
Model Index Predictors
------------------------------------------------------------------------------------------------------------
1 ifBusinessTravel
2 ifBusinessTravel ifSilver
3 ifBusinessTravel ifPlatinum ifSilver
4 ifBusinessClass ifBusinessTravel ifPlatinum ifSilver
5 ifBusinessClass ifBusinessTravel ifPlatinum ifSilver No.Flights
6 ifBusinessClass ifBusinessTravel ifFemale ifPlatinum ifSilver No.Flights
7 ifBusinessClass ifBusinessTravel ifEcoClass ifFemale ifPlatinum ifSilver No.Flights
8 Dep.Delay ifBusinessClass ifBusinessTravel ifEcoClass ifFemale ifPlatinum ifSilver No.Flights
------------------------------------------------------------------------------------------------------------
Subsets Regression Summary
----------------------------------------------------------------------------------------------------------------------------------
Adj. Pred
Model R-Square R-Square R-Square C(p) AIC SBIC SBC MSEP FPE HSP APC
----------------------------------------------------------------------------------------------------------------------------------
1 0.3440 0.3413 0.333 41.4859 624.8263 -59.6767 635.2807 185.4894 0.7761 0.0032 0.6670
2 0.3886 0.3835 0.373 24.5451 609.8510 -74.5163 623.7902 173.6030 0.7293 0.0030 0.6268
3 0.4307 0.4235 0.41 8.6888 594.6727 -89.2785 612.0966 162.3444 0.6848 0.0029 0.5885
4 0.4400 0.4305 0.414 6.7386 592.7012 -91.0904 613.6100 160.3706 0.6792 0.0028 0.5837
5 0.4451 0.4333 0.415 6.5823 592.5053 -91.1482 616.8989 159.5951 0.6787 0.0028 0.5833
6 0.4508 0.4367 0.415 6.1459 591.9999 -91.4591 619.8783 158.6224 0.6772 0.0028 0.5820
7 0.4525 0.4361 0.414 7.4148 593.2430 -90.0971 624.6062 158.8066 0.6807 0.0028 0.5851
8 0.4535 0.4347 0.411 9.0000 594.8125 -88.4206 629.6605 159.2094 0.6852 0.0029 0.5889
----------------------------------------------------------------------------------------------------------------------------------
AIC: Akaike Information Criteria
SBIC: Sawa's Bayesian Information Criteria
SBC: Schwarz Bayesian Criteria
MSEP: Estimated error of prediction, assuming multivariate normality
FPE: Final Prediction Error
HSP: Hocking's Sp
APC: Amemiya Prediction Criteria
> plot(best2)
University of Windsor, dragicef@uwindsor.ca↩︎