Write a fully executed R-Markdown program and submit a pdf file solving and answering questions listed below under Problems at the end of chapter 6. For clarity, make sure to give an appropriate title to each section.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
BH.df <- read.csv("DMBA-R-datasets/BostonHousing.csv")
Airfares.df <- read.csv("DMBA-R-datasets/Airfares.csv")
car.df <- read.csv("DMBA-R-datasets/ToyotaCorolla.csv")
Predicting Boston Housing Prices. The file BostonHousing.csv contains
information collected by the US Bureau of the Census concerning housing
in the area of Boston, Massachusetts. The dataset includes information
on 506 census housing tracts in the Boston area. The goal is to predict
the median house price in new tracts based on information such as crime
rate, pollution, and number of rooms. The dataset contains 13
predictors, and the response is the median house price (MEDV). Table 6.9
describes each of the predictors and the response. ## a. Why should the
data be partitioned into training and validation sets? What will the
training set be used for? What will the validation set be used
for?
We seperate the data into training set and validation set so that we can
verify the quality of model by comparing it with the real data. The
training set is used to fit the model. The validation set is used to
checking the variation between the predicted value and the real value.
We used R^2, R^2_adjust, P value, etc. to check how the model
perfromed.
colnames(BH.df)
## [1] "CRIM" "ZN" "INDUS" "CHAS" "NOX" "RM"
## [7] "AGE" "DIS" "RAD" "TAX" "PTRATIO" "LSTAT"
## [13] "MEDV" "CAT..MEDV"
# select variables for regression
selected.var <- c(1, 4,6,13)
# partition data
set.seed(1)
train.index<- sample(c(1:506),303)
train.df <- BH.df[train.index, selected.var]
valid.df <- BH.df[-train.index, selected.var]
MEDV = -30.3185 -0.2458 * CRIM + 5.8368 * CHAS + 8.4846 * RM
colnames(BH.df)
## [1] "CRIM" "ZN" "INDUS" "CHAS" "NOX" "RM"
## [7] "AGE" "DIS" "RAD" "TAX" "PTRATIO" "LSTAT"
## [13] "MEDV" "CAT..MEDV"
# select variables for regression
selected.var <- c(1, 4,6,13)
# partition data
set.seed(1)
train.index<- sample(c(1:506),303)
train.df <- BH.df[train.index, selected.var]
valid.df <- BH.df[-train.index, selected.var]
# use lm() to run a linear regression of Price on all 11 predictors in the
# training set.
# use . after ~ to include all the remaining columns in train.df as predictors.
#head(train.df)
BH.lm <- lm(MEDV ~.,data = train.df)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(BH.lm)
##
## Call:
## lm(formula = MEDV ~ ., data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0609 -2.9621 -0.4755 2.4257 28.7247
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -30.3185 3.2974 -9.195 < 0.0000000000000002 ***
## CRIM -0.2458 0.0416 -5.910 0.00000000932 ***
## CHAS 5.8368 1.2757 4.575 0.00000697986 ***
## RM 8.4846 0.5161 16.439 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.974 on 299 degrees of freedom
## Multiple R-squared: 0.5875, Adjusted R-squared: 0.5834
## F-statistic: 142 on 3 and 299 DF, p-value: < 0.00000000000000022
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# use predict() to make predictions on a new set.
BH.lm.pred <- predict(BH.lm, valid.df)
options(scipen=999, digits = 1)
some.residuals <- valid.df$MEDV[1:20] - BH.lm.pred[1:20]
data.frame("Predicted" = BH.lm.pred[1:20], "Actual" = valid.df$MEDV[1:20],
"Residual" = some.residuals)
## Predicted Actual Residual
## 3 31 35 4.1
## 4 29 33 4.4
## 5 30 36 5.9
## 6 24 29 4.5
## 7 21 23 2.2
## 8 22 27 5.1
## 9 17 16 -0.9
## 10 21 19 -1.7
## 11 24 15 -8.7
## 12 21 19 -1.7
## 17 20 23 3.3
## 18 20 18 -2.8
## 21 17 14 -3.0
## 23 21 15 -6.3
## 26 17 14 -3.1
## 30 26 21 -5.1
## 32 21 14 -6.4
## 34 18 13 -4.7
## 38 19 21 1.7
## 46 18 19 1.5
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
accuracy(BH.lm.pred, valid.df$MEDV)
## ME RMSE MAE MPE MAPE
## Test set 0.0327 6.51 4.36 -6.99 23.1
library(ggplot2)
library(reshape) # to generate input for the plot
##
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
cor.mat <- round(cor(BH.df),2) # rounded correlation matrix
melted.cor.mat <- melt(cor.mat)
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
ggplot(melted.cor.mat, aes(x = X1, y = X2, fill = value)) +
geom_tile() +
geom_text(aes(x = X1, y = X2, label = value))
colnames(BH.df)
## [1] "CRIM" "ZN" "INDUS" "CHAS" "NOX" "RM"
## [7] "AGE" "DIS" "RAD" "TAX" "PTRATIO" "LSTAT"
## [13] "MEDV" "CAT..MEDV"
# select variables for regression
selected.var <- c(1,2,3,4,6,7,8,9,11,12,13)
# partition data
set.seed(1)
train.index<- sample(c(1:506),303)
train.df2 <- BH.df[train.index, selected.var]
valid.df2 <- BH.df[-train.index, selected.var]
BH.lm2 <- lm(MEDV ~.,data = train.df2)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(BH.lm2)
##
## Call:
## lm(formula = MEDV ~ ., data = train.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.286 -2.844 -0.517 1.791 22.911
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.2556 5.5070 4.22 0.0000322514625443 ***
## CRIM -0.0926 0.0456 -2.03 0.04298 *
## ZN 0.0239 0.0186 1.29 0.19880
## INDUS -0.1533 0.0680 -2.26 0.02480 *
## CHAS 4.1955 1.0785 3.89 0.00012 ***
## RM 4.3924 0.5611 7.83 0.0000000000000919 ***
## AGE -0.0145 0.0172 -0.84 0.40016
## DIS -1.1397 0.2603 -4.38 0.0000166777058208 ***
## RAD 0.0392 0.0507 0.77 0.43954
## PTRATIO -0.7820 0.1700 -4.60 0.0000063212262466 ***
## LSTAT -0.5840 0.0699 -8.35 0.0000000000000027 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.9 on 292 degrees of freedom
## Multiple R-squared: 0.729, Adjusted R-squared: 0.72
## F-statistic: 78.6 on 10 and 292 DF, p-value: <0.0000000000000002
# use step() to run stepwise regression.
# set directions = to either "backward", "forward", or "both".
BH.lm.step <- step(BH.lm2, direction = "backward")
## Start: AIC=974
## MEDV ~ CRIM + ZN + INDUS + CHAS + RM + AGE + DIS + RAD + PTRATIO +
## LSTAT
##
## Df Sum of Sq RSS AIC
## - RAD 1 14 7023 972
## - AGE 1 17 7026 973
## - ZN 1 40 7049 974
## <none> 7009 974
## - CRIM 1 99 7108 976
## - INDUS 1 122 7131 977
## - CHAS 1 363 7372 987
## - DIS 1 460 7469 991
## - PTRATIO 1 508 7517 993
## - RM 1 1471 8480 1030
## - LSTAT 1 1674 8683 1037
##
## Step: AIC=972
## MEDV ~ CRIM + ZN + INDUS + CHAS + RM + AGE + DIS + PTRATIO +
## LSTAT
##
## Df Sum of Sq RSS AIC
## - AGE 1 18 7042 971
## - ZN 1 45 7069 972
## <none> 7023 972
## - CRIM 1 86 7110 974
## - INDUS 1 108 7132 975
## - CHAS 1 368 7391 986
## - DIS 1 488 7511 991
## - PTRATIO 1 509 7532 992
## - RM 1 1535 8558 1030
## - LSTAT 1 1663 8686 1035
##
## Step: AIC=971
## MEDV ~ CRIM + ZN + INDUS + CHAS + RM + DIS + PTRATIO + LSTAT
##
## Df Sum of Sq RSS AIC
## <none> 7042 971
## - ZN 1 51 7093 971
## - CRIM 1 83 7124 973
## - INDUS 1 118 7160 974
## - CHAS 1 359 7401 984
## - DIS 1 503 7544 990
## - PTRATIO 1 520 7562 991
## - RM 1 1524 8566 1029
## - LSTAT 1 2129 9170 1049
summary(BH.lm.step) # Which variables did it drop?
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + INDUS + CHAS + RM + DIS + PTRATIO +
## LSTAT, data = train.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.354 -2.895 -0.548 1.792 23.116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.9638 5.3525 4.10 0.000052764596771 ***
## CRIM -0.0739 0.0398 -1.86 0.06445 .
## ZN 0.0269 0.0184 1.46 0.14435
## INDUS -0.1444 0.0650 -2.22 0.02718 *
## CHAS 4.1621 1.0749 3.87 0.00013 ***
## RM 4.3610 0.5467 7.98 0.000000000000034 ***
## DIS -1.0675 0.2330 -4.58 0.000006844041127 ***
## PTRATIO -0.7455 0.1600 -4.66 0.000004809295010 ***
## LSTAT -0.6052 0.0642 -9.43 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.89 on 294 degrees of freedom
## Multiple R-squared: 0.728, Adjusted R-squared: 0.72
## F-statistic: 98.3 on 8 and 294 DF, p-value: <0.0000000000000002
BH.lm.step.pred <- predict(BH.lm.step, valid.df2)
accuracy(BH.lm.step.pred, valid.df2$MEDV)
## ME RMSE MAE MPE MAPE
## Test set -0.222 5.14 3.55 -4.05 17.7
Best Model selected as MEDV ~ CRIM + ZN + INDUS + CHAS + RM + DIS + PTRATIO + LSTAT. The ME is =0.222. AIC=971.
Predicting Airfare on New Routes. The following problem takes place in the United States in the late 1990s, when many major US cities were facing issues with airport congestion, partly as a result of the 1978 deregulation of airlines. Both fares and routes were freed from regulation, and low-fare carriers such as Southwest (SW) began competing on existing routes and starting nonstop service on routes that previously lacked it. Building completely new airports is generally not feasible, but sometimes decommissioned military bases or smaller municipal airports can be reconfigured as regional or larger commercial airports. There are numerous players and interests involved in the issue (airlines, city, state and federal authorities, civic groups, the military, airport operators), and an aviation consulting firm is seeking advisory contracts with these players. The firm needs predictive models to support its consulting service. One thing the firm might want to be able to predict is fares, in the event a new airport is brought into service. The firm starts with the file Airfares.csv, which contains real data that were collected between Q3-1996 and Q2-1997. The variables in these data are listed in Table 6.11, and are believed to be important in predicting FARE. Some airport-to-airport data are available, but most data are at the city-to-city level. One question that will be of interest in the analysis is the effect that the presence or absence of Southwest has on FARE.
dim(Airfares.df) # find the dimension of data frame
## [1] 638 18
summary(Airfares.df)
## S_CODE S_CITY E_CODE E_CITY
## Length:638 Length:638 Length:638 Length:638
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## COUPON NEW VACATION SW
## Min. :1.00 Min. :0.00 Length:638 Length:638
## 1st Qu.:1.04 1st Qu.:3.00 Class :character Class :character
## Median :1.15 Median :3.00 Mode :character Mode :character
## Mean :1.20 Mean :2.75
## 3rd Qu.:1.30 3rd Qu.:3.00
## Max. :1.94 Max. :3.00
## HI S_INCOME E_INCOME S_POP
## Min. : 1230 Min. :14600 Min. :14600 Min. : 29838
## 1st Qu.: 3090 1st Qu.:24706 1st Qu.:23903 1st Qu.:1862106
## Median : 4208 Median :28637 Median :26409 Median :3532657
## Mean : 4442 Mean :27760 Mean :27664 Mean :4557004
## 3rd Qu.: 5481 3rd Qu.:29694 3rd Qu.:31981 3rd Qu.:7830332
## Max. :10000 Max. :38813 Max. :38813 Max. :9056076
## E_POP SLOT GATE DISTANCE
## Min. : 111745 Length:638 Length:638 Min. : 114
## 1st Qu.:1228816 Class :character Class :character 1st Qu.: 455
## Median :2195215 Mode :character Mode :character Median : 850
## Mean :3194503 Mean : 976
## 3rd Qu.:4549784 3rd Qu.:1306
## Max. :9056076 Max. :2764
## PAX FARE
## Min. : 1504 Min. : 42
## 1st Qu.: 5328 1st Qu.:106
## Median : 7792 Median :145
## Mean :12782 Mean :161
## 3rd Qu.:14090 3rd Qu.:209
## Max. :73892 Max. :402
Looks like Fare has the highest correlation with distance.
num_Airfares.df <- Airfares.df %>%
select_if(is.numeric)
Airfares_cor.mat <- round(cor(num_Airfares.df),2) # rounded correlation matrix
Airfares_melted.cor.mat <- melt(Airfares_cor.mat)
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
ggplot(Airfares_melted.cor.mat, aes(x = X1, y = X2, fill = value)) +
geom_tile() +
geom_text(aes(x = X1, y = X2, label = value))
### Looks like Distance has a clearer postive trend with FARE.
ggplot(num_Airfares.df) + geom_point(aes(x = DISTANCE, y = FARE), colour = "navy", alpha = 0.7)
ggplot(num_Airfares.df) + geom_point(aes(x = COUPON, y = FARE), colour = "navy", alpha = 0.7)
ggplot(num_Airfares.df) + geom_point(aes(x = HI, y = FARE), colour = "navy", alpha = 0.7)
char_Airfares.df <- Airfares.df %>%
select(where(is.character), FARE,-S_CODE,-S_CITY,-E_CODE,-E_CITY)
char_Airfares.df %>%
group_by(VACATION) %>%
summarise(avg_Fare=mean(FARE))
## # A tibble: 2 × 2
## VACATION avg_Fare
## <chr> <dbl>
## 1 No 174.
## 2 Yes 126.
char_Airfares.df %>%
group_by(SW) %>%
summarise(avg_Fare=mean(FARE))
## # A tibble: 2 × 2
## SW avg_Fare
## <chr> <dbl>
## 1 No 188.
## 2 Yes 98.4
char_Airfares.df %>%
group_by(SLOT) %>%
summarise(avg_Fare=mean(FARE))
## # A tibble: 2 × 2
## SLOT avg_Fare
## <chr> <dbl>
## 1 Controlled 186.
## 2 Free 151.
char_Airfares.df %>%
group_by(GATE) %>%
summarise(avg_Fare=mean(FARE))
## # A tibble: 2 × 2
## GATE avg_Fare
## <chr> <dbl>
## 1 Constrained 193.
## 2 Free 153.
char_Airfares.df <- char_Airfares.df %>%
mutate(VACATION = if_else(VACATION=="Yes",1,0) ,
SW=ifelse(SW=="Yes", 1 , 0),
SLOT=ifelse(SLOT == "Controlled", 1 , 0),
GATE=ifelse(GATE == "Constrained", 1 , 0))
par(mfcol = c(1, 4))
boxplot( char_Airfares.df$FARE~char_Airfares.df$VACATION, xlab = "VACATION", ylab = "FARE")
boxplot( char_Airfares.df$FARE ~char_Airfares.df$SW , ylab = "FARE", xlab = "SW")
boxplot(char_Airfares.df$FARE ~char_Airfares.df$SLOT , ylab = "FARE", xlab = "SLOT")
boxplot( char_Airfares.df$FARE ~char_Airfares.df$GATE, ylab = "FARE", xlab = "GATE")
### Looks like SW has a clear difference of the average fear when it’s
yes and no. I would say this might be a good predictor for FARE.
All_dummies <- as.data.frame(model.matrix(~ 0 + VACATION + SW+ SLOT+GATE, data=Airfares.df))
Airfares.df2 <- cbind(Airfares.df[,c(-1,-2,-3,-4,-7,-8,-14,-15)], All_dummies[,-2])
set.seed(1) # set seed for reproducing the partition
Airfares_train.rows <- sample(rownames(Airfares.df2), dim(Airfares.df2)[1]*0.6)
# collect all the columns with training row ID into training set:
Airfares_train.data <- Airfares.df2[Airfares_train.rows, ]
# assign row IDs that are not already in the training set, into validation
Airfares_valid.rows <- setdiff(rownames(Airfares.df2), Airfares_train.rows)
Airfares_valid.data <- Airfares.df2[Airfares_valid.rows, ]
Airfares.lm <- lm(FARE ~.,data = Airfares_train.data)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(Airfares.lm )
##
## Call:
## lm(formula = FARE ~ ., data = Airfares_train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.59 -21.70 -1.19 20.33 97.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -46.967121908 33.434185869 -1.40 0.1609
## COUPON -13.454800714 14.976577551 -0.90 0.3696
## NEW -0.877296990 2.564424903 -0.34 0.7325
## HI 0.009445555 0.001255103 7.53 0.00000000000040707 ***
## S_INCOME 0.001686337 0.000635488 2.65 0.0083 **
## E_INCOME 0.002201549 0.000469532 4.69 0.00000387757600551 ***
## S_POP 0.000004094 0.000000853 4.80 0.00000229932297704 ***
## E_POP 0.000004289 0.000000992 4.32 0.00001987567026022 ***
## DISTANCE 0.078243255 0.004447075 17.59 < 0.0000000000000002 ***
## PAX -0.001182191 0.000189977 -6.22 0.00000000132990559 ***
## VACATIONNo 34.668200077 4.944725211 7.01 0.00000000001136935 ***
## SWYes -41.746544271 4.883751233 -8.55 0.00000000000000034 ***
## SLOTFree -13.518802529 4.920237876 -2.75 0.0063 **
## GATEFree -22.886724774 5.015235952 -4.56 0.00000686787794072 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35 on 368 degrees of freedom
## Multiple R-squared: 0.811, Adjusted R-squared: 0.805
## F-statistic: 122 on 13 and 368 DF, p-value: <0.0000000000000002
library(forecast)
# use predict() to make predictions on a new set.
Airfares.lm.pred <- predict(Airfares.lm, Airfares_valid.data)
options(scipen=999, digits = 1)
some.residuals <- Airfares_valid.data$FARE[1:20] - Airfares.lm.pred[1:20]
data.frame("Predicted" = Airfares.lm.pred[1:20], "Actual" = Airfares_valid.data$FARE[1:20],
"Residual" = some.residuals)
## Predicted Actual Residual
## 4 127 85 -41.7
## 6 83 57 -26.5
## 7 242 228 -14.3
## 8 122 117 -5.9
## 9 179 173 -6.6
## 10 127 115 -12.3
## 11 170 158 -11.7
## 12 274 229 -45.3
## 13 118 79 -38.9
## 17 176 117 -58.6
## 23 126 100 -26.7
## 24 109 107 -2.4
## 26 104 114 10.0
## 30 142 134 -7.5
## 38 181 234 52.9
## 46 144 136 -8.0
## 47 201 231 30.0
## 52 144 197 53.2
## 54 69 69 0.4
## 55 134 92 -41.7
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
accuracy(Airfares.lm.pred, Airfares_valid.data$FARE)
## ME RMSE MAE MPE MAPE
## Test set -3.64 37.5 29.2 -5.91 24.1
## stepwise regression
# use step() to run stepwise regression.
# set directions = to either "backward", "forward", or "both".
Airfares.lm.step <- step(Airfares.lm , direction = "backward")
## Start: AIC=2730
## FARE ~ COUPON + NEW + HI + S_INCOME + E_INCOME + S_POP + E_POP +
## DISTANCE + PAX + VACATIONNo + SWYes + SLOTFree + GATEFree
##
## Df Sum of Sq RSS AIC
## - NEW 1 143 450613 2728
## - COUPON 1 988 451457 2729
## <none> 450469 2730
## - S_INCOME 1 8620 459089 2735
## - SLOTFree 1 9241 459710 2735
## - E_POP 1 22872 473341 2747
## - GATEFree 1 25492 475961 2749
## - E_INCOME 1 26912 477381 2750
## - S_POP 1 28216 478686 2751
## - PAX 1 47401 497871 2766
## - VACATIONNo 1 60172 510642 2776
## - HI 1 69329 519798 2782
## - SWYes 1 89444 539913 2797
## - DISTANCE 1 378933 829402 2961
##
## Step: AIC=2728
## FARE ~ COUPON + HI + S_INCOME + E_INCOME + S_POP + E_POP + DISTANCE +
## PAX + VACATIONNo + SWYes + SLOTFree + GATEFree
##
## Df Sum of Sq RSS AIC
## - COUPON 1 968 451581 2727
## <none> 450613 2728
## - S_INCOME 1 8650 459263 2733
## - SLOTFree 1 9209 459822 2734
## - E_POP 1 22868 473481 2745
## - GATEFree 1 25656 476269 2747
## - E_INCOME 1 26787 477400 2748
## - S_POP 1 28074 478687 2749
## - PAX 1 47417 498029 2764
## - VACATIONNo 1 60284 510897 2774
## - HI 1 69405 520018 2781
## - SWYes 1 89496 540109 2795
## - DISTANCE 1 380274 830886 2960
##
## Step: AIC=2727
## FARE ~ HI + S_INCOME + E_INCOME + S_POP + E_POP + DISTANCE +
## PAX + VACATIONNo + SWYes + SLOTFree + GATEFree
##
## Df Sum of Sq RSS AIC
## <none> 451581 2727
## - SLOTFree 1 8583 460164 2732
## - S_INCOME 1 9414 460995 2733
## - E_POP 1 22269 473850 2743
## - GATEFree 1 25041 476622 2745
## - E_INCOME 1 27986 479567 2748
## - S_POP 1 29234 480815 2749
## - PAX 1 49297 500878 2764
## - VACATIONNo 1 59437 511018 2772
## - HI 1 80123 531704 2787
## - SWYes 1 88661 540242 2793
## - DISTANCE 1 718092 1169673 3088
summary(Airfares.lm.step) # Which variables did it drop?
##
## Call:
## lm(formula = FARE ~ HI + S_INCOME + E_INCOME + S_POP + E_POP +
## DISTANCE + PAX + VACATIONNo + SWYes + SLOTFree + GATEFree,
## data = Airfares_train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.43 -21.74 -0.57 20.27 96.14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -67.831697658 25.151390800 -2.70 0.0073 **
## HI 0.009708760 0.001198267 8.10 0.0000000000000079 ***
## S_INCOME 0.001751416 0.000630636 2.78 0.0058 **
## E_INCOME 0.002232869 0.000466297 4.79 0.0000024344503155 ***
## S_POP 0.000004139 0.000000846 4.89 0.0000014768569666 ***
## E_POP 0.000004219 0.000000988 4.27 0.0000247161658388 ***
## DISTANCE 0.075309573 0.003104754 24.26 < 0.0000000000000002 ***
## PAX -0.001119766 0.000176192 -6.36 0.0000000006118543 ***
## VACATIONNo 34.335068366 4.920133781 6.98 0.0000000000138579 ***
## SWYes -41.473372868 4.865964401 -8.52 0.0000000000000004 ***
## SLOTFree -12.910587203 4.868584414 -2.65 0.0084 **
## GATEFree -22.601683731 4.989799565 -4.53 0.0000079845334289 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.9 on 370 degrees of freedom
## Multiple R-squared: 0.811, Adjusted R-squared: 0.805
## F-statistic: 144 on 11 and 370 DF, p-value: <0.0000000000000002
Airfares.lm.step.pred <- predict(Airfares.lm.step, Airfares_valid.data )
accuracy(Airfares.lm.step.pred, Airfares_valid.data$FARE)
## ME RMSE MAE MPE MAPE
## Test set -3.6 37.4 29.1 -5.89 24.1
library(leaps)
Airfares_search <- regsubsets(FARE ~ ., data = Airfares_train.data, nbest = 1, nvmax = dim(Airfares_train.data)[2],
method = "exhaustive")
sum <- summary(Airfares_search)
# show models
sum$which
## (Intercept) COUPON NEW HI S_INCOME E_INCOME S_POP E_POP DISTANCE PAX
## 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## 2 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## 3 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## 4 TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 5 TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 6 TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 7 TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
## 8 TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## 9 TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## 10 TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 11 TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 12 TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## VACATIONNo SWYes SLOTFree GATEFree
## 1 FALSE FALSE FALSE FALSE
## 2 FALSE TRUE FALSE FALSE
## 3 TRUE TRUE FALSE FALSE
## 4 TRUE TRUE FALSE FALSE
## 5 TRUE TRUE FALSE TRUE
## 6 TRUE TRUE TRUE TRUE
## 7 TRUE TRUE TRUE TRUE
## 8 TRUE TRUE FALSE FALSE
## 9 TRUE TRUE FALSE TRUE
## 10 TRUE TRUE FALSE TRUE
## 11 TRUE TRUE TRUE TRUE
## 12 TRUE TRUE TRUE TRUE
## 13 TRUE TRUE TRUE TRUE
# show metrics
sum$rsq
## [1] 0.451 0.617 0.712 0.751 0.764 0.780 0.784 0.793 0.801 0.807 0.811 0.811
## [13] 0.811
sum$adjr2
## [1] 0.450 0.615 0.710 0.749 0.761 0.776 0.780 0.789 0.797 0.802 0.805 0.805
## [13] 0.805
sum$cp
## [1] 692.9 370.1 187.8 113.0 90.6 61.7 55.6 39.3 25.4 15.9 10.9 12.1
## [13] 14.0
Compare the predictive accuracy of both models (ii) and (iii) using measures such as RMSE and average error and lift charts. ### Since we are looking at the same model, the RMSE should be the close. ### ME RMSE MAE MPE MAPE ###Test set -3.6 37.4 29.1 -5.89 24.1
Using model (iii), predict the average fare on a route with the following characteristics: COUPON = 1.202, NEW = 3, VACATION = No, SW = No, HI = 4442.141, S_INCOME = $28,760, E_INCOME = $27,664, S_POP = 4,557,004, E_POP = 3,195,503, SLOT = Free, GATE = Free, PAX = 12,782, DISTANCE = 1976 miles.
Airfares.lm.step.pred2 <- predict(Airfares.lm.step, data.frame(COUPON = 1.202, NEW = 3,VACATIONNo=1, SWYes=0,HI = 4442.141, S_INCOME = 28760, E_INCOME = 27664, S_POP = 4557004, E_POP = 3195503,SLOTFree=1, GATEFree=1,PAX = 12782, DISTANCE = 1976) )
Airfares.lm.step.pred2
## 1
## 253
Airfares.lm.step.pred3 <- predict(Airfares.lm.step, data.frame(COUPON = 1.202, NEW = 3,VACATIONNo=1, SWYes=1,HI = 4442.141, S_INCOME = 28760, E_INCOME = 27664, S_POP = 4557004, E_POP = 3195503,SLOTFree=1, GATEFree=1,PAX = 12782, DISTANCE = 1976) )
Airfares.lm.step.pred3
## 1
## 212
In reality, which of the factors will not be available for predicting the average fare from a new airport (i.e., before flights start operating on those routes)? Which ones can be estimated? How? ### VACATION, GATE, PAX, NEW, HI can not be collected before the route start to be use. ### VACATION and GATE can be estimated with the histroy data and comparison.
Select a model that includes only factors that are available before flights begin to operate on the new route. Use an exhaustive search to find such a model. ### The best model is the 6th one. ### FARE ~ VACATIONNo+S_INCOME+E_INCOME+SLOTFree+GATEFree+DISTANCE
Airfares_search2 <- regsubsets(FARE ~ VACATIONNo+S_INCOME+E_INCOME+S_POP+E_POP+SLOTFree+GATEFree+DISTANCE,
data = Airfares_train.data, nbest = 1, nvmax = dim(Airfares_train.data)[2],
method = "exhaustive")
sum2 <- summary(Airfares_search2)
# show models
sum2$which
## (Intercept) VACATIONNo S_INCOME E_INCOME S_POP E_POP SLOTFree GATEFree
## 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
## 4 TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE
## 5 TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
## 6 TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## 7 TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## 8 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## DISTANCE
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## 7 TRUE
## 8 TRUE
# show metrics
sum2$rsq
## [1] 0.451 0.562 0.617 0.650 0.675 0.686 0.687 0.687
sum2$adjr2
## [1] 0.450 0.560 0.614 0.647 0.670 0.681 0.681 0.681
sum2$cp
## [1] 276.72 145.79 82.73 45.01 18.16 6.99 7.44 9.00
new<- data.frame(COUPON = 1.202, NEW = 3,VACATIONNo=1, SWYes=1,HI = 4442.141, S_INCOME = 28760, E_INCOME = 27664, S_POP = 4557004, E_POP = 3195503,SLOTFree=1, GATEFree=1,PAX = 12782, DISTANCE = 1976)
Airfares.lm4 <- lm(FARE ~ VACATIONNo+S_INCOME+E_INCOME+SLOTFree+GATEFree+DISTANCE,
data = Airfares_train.data)
summary(Airfares.lm4)
##
## Call:
## lm(formula = FARE ~ VACATIONNo + S_INCOME + E_INCOME + SLOTFree +
## GATEFree + DISTANCE, data = Airfares_train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -111.05 -31.41 -2.79 26.65 134.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -58.577904 27.996795 -2.09 0.03708 *
## VACATIONNo 48.697327 5.678366 8.58 0.00000000000000026 ***
## S_INCOME 0.002845 0.000667 4.26 0.00002543387720283 ***
## E_INCOME 0.002908 0.000519 5.61 0.00000004001265233 ***
## SLOTFree -20.058345 5.526168 -3.63 0.00032 ***
## GATEFree -45.831424 5.957661 -7.69 0.00000000000012827 ***
## DISTANCE 0.078717 0.003495 22.52 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.7 on 375 degrees of freedom
## Multiple R-squared: 0.686, Adjusted R-squared: 0.681
## F-statistic: 136 on 6 and 375 DF, p-value: <0.0000000000000002
Airfares.lm.step.pred4 <- predict(Airfares.lm4, newdata = Airfares_valid.data)
accuracy(Airfares.lm.step.pred4, Airfares_valid.data$FARE)
## ME RMSE MAE MPE MAPE
## Test set -3.06 42.3 33.1 -9.72 27.1
pred4.new <- predict(Airfares.lm4, newdata = new)
pred4.new
## 1
## 242
ME RMSE MAE MPE MAPE
Test set -3.06 42.3 33.1 -9.72 27.1
##Problem 4: Predicting prices of used cars (a, b) 6.4 Predicting Prices of Used Cars. The file ToyotaCorolla.csv contains data on used cars (Toyota Corolla) on sale during late summer of 2004 in the Netherlands. It has 1436 records containing details on 38 attributes, including Price, Age, Kilometers, HP, and other specifications. The goal is to predict the price of a used Toyota Corolla based on its specifications. (The example in Section 6.3 is a subset of this dataset.) Split the data into training (50%), validation (30%), and test (20%) datasets. Run a multiple linear regression with the outcome variable Price and predictor variables Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar. a. What appear to be the three or four most important car specifications for predicting the car’s price?
car_train.rows <- sample(rownames(car.df), dim(car.df)[1]*0.5)
# sample 30% of the row IDs into the validation set, drawing only from records
# not already in the training set
# use setdiff() to find records not already in the training set
car_valid.rows <- sample(setdiff(rownames(car.df), car_train.rows),
dim(car.df)[1]*0.3)
# assign the remaining 20% row IDs serve as test
car_test.rows <- setdiff(rownames(car.df), union(car_train.rows, car_valid.rows))
# create the 3 data frames by collecting all columns from the appropriate rows
car_train.data <- car.df[car_train.rows, ]
car_valid.data <- car.df[car_valid.rows, ]
car_test.data <- car.df[car_test.rows, ]
car.lm <- lm(Price ~ Age_08_04+ KM+ Fuel_Type+ HP+ Automatic+ Doors+ Quarterly_Tax+ Mfr_Guarantee+ Guarantee_Period+ Airco+ Automatic_airco+ CD_Player+ Powered_Windows+ Sport_Model+ Tow_Bar,
data = car_train.data)
summary(car.lm)
##
## Call:
## lm(formula = Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic +
## Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Period +
## Airco + Automatic_airco + CD_Player + Powered_Windows + Sport_Model +
## Tow_Bar, data = car_train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3283 -817 -62 722 6858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9269.09401 755.20013 12.27 < 0.0000000000000002 ***
## Age_08_04 -110.80262 3.94135 -28.11 < 0.0000000000000002 ***
## KM -0.01791 0.00186 -9.64 < 0.0000000000000002 ***
## Fuel_TypeDiesel 2489.63227 423.39089 5.88 0.000000006340390 ***
## Fuel_TypePetrol 2281.18095 462.09420 4.94 0.000000994323208 ***
## HP 34.68438 4.08563 8.49 < 0.0000000000000002 ***
## Automatic 382.92666 221.31368 1.73 0.0840 .
## Doors 120.99446 50.74883 2.38 0.0174 *
## Quarterly_Tax 17.39058 2.26654 7.67 0.000000000000057 ***
## Mfr_Guarantee 425.46952 98.87754 4.30 0.000019246356122 ***
## Guarantee_Period 56.89675 17.34367 3.28 0.0011 **
## Airco 7.49196 122.12605 0.06 0.9511
## Automatic_airco 3278.37632 217.47597 15.07 < 0.0000000000000002 ***
## CD_Player 316.09835 128.26070 2.46 0.0140 *
## Powered_Windows 549.26604 115.54277 4.75 0.000002423588794 ***
## Sport_Model 345.05427 109.42417 3.15 0.0017 **
## Tow_Bar -81.06935 107.96767 -0.75 0.4530
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1240 on 701 degrees of freedom
## Multiple R-squared: 0.893, Adjusted R-squared: 0.89
## F-statistic: 365 on 16 and 701 DF, p-value: <0.0000000000000002
# use predict() to make predictions on a new set.
car.lm.pred <- predict(car.lm, car_valid.data)
options(scipen=999, digits = 1)
some.residuals <- car_valid.data$Price[1:20] - car.lm.pred[1:20]
data.frame("Predicted" = car.lm.pred[1:20], "Actual" = car_valid.data$Price[1:20],
"Residual" = some.residuals)
## Predicted Actual Residual
## 25 18624 16250 -2374
## 1015 10858 10450 -408
## 82 15691 17250 1559
## 1002 9324 8950 -374
## 982 10148 9750 -398
## 729 8973 9500 527
## 816 9152 9450 298
## 171 18225 18245 20
## 106 15873 16950 1077
## 506 10980 11500 520
## 1193 6123 6750 627
## 1221 9082 7900 -1182
## 513 12313 13950 1637
## 997 10424 9950 -474
## 836 9635 9750 115
## 173 17745 19500 1755
## 150 20506 20950 444
## 1316 8101 8950 849
## 1362 8294 6495 -1799
## 496 10845 11250 405
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
accuracy(car.lm.pred, car_valid.data$Price)
## ME RMSE MAE MPE MAPE
## Test set 72.5 1254 925 -0.384 9.2