library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.2
library(caret) # for cross-validation
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
library(vip) # variable importance
## Warning: package 'vip' was built under R version 4.2.2
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(h2o)
## Warning: package 'h2o' was built under R version 4.2.2
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
Ideally, we want estimates of β0 and β1 that give us the “best fitting” line. But what is meant by “best fitting”? The most common approach is to use the method of least squares (LS) estimation.
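Concretely, least squares chooses the estimates of β0 and β1 that minimize the residual sum of squares:

$$\min_{\beta_0,\,\beta_1}\;\sum_{i=1}^{n}\left(y_i - \beta_0 - \beta_1 x_i\right)^2$$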
# Import data from AmesHousing library and split training & testing data sets
library(AmesHousing)
## Warning: package 'AmesHousing' was built under R version 4.2.2
library(rsample)
## Warning: package 'rsample' was built under R version 4.2.2
ames<-AmesHousing::make_ames()
#h2o.init()
#ames.h2o<-as.h2o(ames)
set.seed(123)
split<-initial_split(ames, prop = 0.7, strata = 'Sale_Price')
ames_train<-training(split)
ames_test<-testing(split)
Simple linear regression model
model1<-lm(Sale_Price ~ Gr_Liv_Area, data = ames_train)
summary(model1)
##
## Call:
## lm(formula = Sale_Price ~ Gr_Liv_Area, data = ames_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -474682 -30794 -1678 23353 328183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15938.173 3851.853 4.138 3.65e-05 ***
## Gr_Liv_Area 109.667 2.421 45.303 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56790 on 2047 degrees of freedom
## Multiple R-squared: 0.5007, Adjusted R-squared: 0.5004
## F-statistic: 2052 on 1 and 2047 DF, p-value: < 2.2e-16
The estimated coefficients from our model are β0 = 15938.17 and β1 = 109.67.
Interpretation: the mean selling price increases by $109.67 for each additional square foot of above-ground living space.
A residual, y - ŷ, is the difference between an observed and a predicted value of y. The mean squared error (MSE) is the average of the squared residuals, and the residual standard error reported by sigma() is essentially the model's RMSE.
#RMSE, residual standard error
sigma(model1)
## [1] 56787.94
#MSE
sigma(model1)^2
## [1] 3224869786
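As a quick sketch, the residual standard error reported by sigma() can be reproduced by hand from the residuals, using the usual degrees-of-freedom adjustment (n minus the number of estimated coefficients).
# Sketch: residual standard error (RMSE with a df adjustment) computed by hand
rss <- sum(residuals(model1)^2)   # residual sum of squares
n <- nobs(model1)                 # number of training observations
p <- length(coef(model1))         # number of estimated coefficients (intercept + slope)
sqrt(rss / (n - p))               # matches sigma(model1)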
The t-statistics for such a test are nothing more than the estimated coefficients divided by their corresponding estimated standard errors (i.e., in the output from summary(), t value = Estimate / Std. Error). The reported t-statistics measure the number of standard deviations each coefficient is away from 0. Thus, large t-statistics (greater than two in absolute value, say) roughly indicate statistical significance at the α = 0.05 level.
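A minimal sketch of that relationship, using the coefficient table returned by summary():
# Sketch: t value = Estimate / Std. Error, reproduced from the summary() output
coefs <- summary(model1)$coefficients
coefs[, "Estimate"] / coefs[, "Std. Error"]   # equals coefs[, "t value"]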
95% confidence intervals:
confint(model1, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 8384.213 23492.1336
## Gr_Liv_Area 104.920 114.4149
Interpretation: we are 95% confident that the mean selling price increases between $104.92 and $114.41 for each additional square foot of above-ground living space.
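As a sketch, the same interval can be built by hand as estimate ± t(0.975, df) × standard error:
# Sketch: 95% confidence interval for the Gr_Liv_Area coefficient by hand
est <- coef(summary(model1))["Gr_Liv_Area", ]
est["Estimate"] + c(-1, 1) * qt(0.975, df.residual(model1)) * est["Std. Error"]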
In practice, we usually have more variables (predictors). With the Ames housing data, we may wish to understand whether both Gr_Liv_Area and Year_Built are related to Sale_Price.
Y = β0 + β1X1 + β2X2 + ϵ
model2<-lm(Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train)
model2
##
## Call:
## lm(formula = Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train)
##
## Coefficients:
## (Intercept) Gr_Liv_Area Year_Built
## -2.103e+06 9.383e+01 1.087e+03
Alternatively, we can use the update() function to update model1.
# keep everything from model1 on both sides of the formula and add Year_Built to the right-hand side
options(scipen=999)
model2<-update(model1, . ~ . + Year_Built )
model2
##
## Call:
## lm(formula = Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train)
##
## Coefficients:
## (Intercept) Gr_Liv_Area Year_Built
## -2102904.57 93.83 1086.85
The estimated β1 is 93.83, β2 is 1086.85, and the estimated intercept is -2102904.57.
Interpretation: every one-square-foot increase in Gr_Liv_Area (above-ground square footage) is associated with an additional $93.83 in mean selling price, holding the other predictor constant. Likewise, each additional year of newness is associated with an increase of approximately $1,086.85 in selling price, holding the other predictor constant.
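A small sketch of the "holding the other predictor constant" interpretation, using two hypothetical homes that differ by one square foot (the values 1500 sq ft and year 2000 are arbitrary):
# Sketch: predicted prices for two hypothetical homes differing only by 1 sq ft
new_homes <- data.frame(
Gr_Liv_Area = c(1500, 1501),  # one extra square foot
Year_Built = c(2000, 2000)    # year held constant
)
diff(predict(model2, newdata = new_homes))   # approximately 93.83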
An interaction occurs when the effect of one predictor on the response depends on the values of other predictors.
In R formula notation, X1 * X2 expands to X1 + X2 + X1:X2.
Here, the effect of above-ground square footage may depend on the year built; newer houses, for instance, tend to be smaller because family sizes have shrunk.
lm(Sale_Price ~Gr_Liv_Area * Year_Built, data = ames_train)
##
## Call:
## lm(formula = Sale_Price ~ Gr_Liv_Area * Year_Built, data = ames_train)
##
## Coefficients:
## (Intercept) Gr_Liv_Area Year_Built
## -810498.3134 -728.5084 430.8755
## Gr_Liv_Area:Year_Built
## 0.4168
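A short sketch of how to read the interaction: the slope for Gr_Liv_Area is no longer a single number but changes with Year_Built.
# Sketch: implied Gr_Liv_Area slope at two different build years
interaction_fit <- lm(Sale_Price ~ Gr_Liv_Area * Year_Built, data = ames_train)
b <- coef(interaction_fit)
b["Gr_Liv_Area"] + b["Gr_Liv_Area:Year_Built"] * c(1950, 2000)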
# Include all possible main effects
model3<-lm(Sale_Price ~., data = ames_train)
# print estimated coefficient in a tidy data frame
broom::tidy(model3)
## # A tibble: 300 × 5
## term estim…¹ std.e…² stati…³ p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.73e7 1.10e7 -2.47 0.0134
## 2 MS_SubClassOne_Story_1945_and_Older 3.90e3 3.58e3 1.09 0.275
## 3 MS_SubClassOne_Story_with_Finished_Attic_All… -5.39e3 1.22e4 -0.443 0.658
## 4 MS_SubClassOne_and_Half_Story_Unfinished_All… -4.41e2 1.39e4 -0.0316 0.975
## 5 MS_SubClassOne_and_Half_Story_Finished_All_A… 1.04e3 7.25e3 0.143 0.886
## 6 MS_SubClassTwo_Story_1946_and_Newer -6.67e3 5.51e3 -1.21 0.226
## 7 MS_SubClassTwo_Story_1945_and_Older 1.57e3 6.07e3 0.259 0.795
## 8 MS_SubClassTwo_and_Half_Story_All_Ages 3.41e3 1.01e4 0.336 0.737
## 9 MS_SubClassSplit_or_Multilevel -6.67e3 1.17e4 -0.571 0.568
## 10 MS_SubClassSplit_Foyer 1.49e3 7.51e3 0.199 0.843
## # … with 290 more rows, and abbreviated variable names ¹estimate, ²std.error,
## # ³statistic
Among the three models (model1, model2, model3), which is the best? Here we use the RMSE metric together with cross-validation to decide.
We can use the caret::train() function to train a linear model using cross-validation.
# train model using 10-fold cross validation
set.seed(123)
cv_model1<- train(
form = Sale_Price ~ Gr_Liv_Area, # regression formula
data = ames_train, # training data set
method = 'lm', # linear regression model
trControl = trainControl(method = 'cv', number = 10) # 10 fold cross validation
)
cv_model1
## Linear Regression
##
## 2049 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1843, 1844, 1844, 1844, 1844, 1844, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 56644.76 0.510273 38851.99
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
The cross-validated RMSE is 56644.76 (the average RMSE across the 10 CV folds).
Interpretation of RMSE: when applied to unseen data, the predictions this model makes are, on average, about $56,644.76 off from the actual selling price.
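As a sketch, the CV estimate can be compared with the RMSE on the held-out test set (the exact number will differ from the CV average):
# Sketch: test-set RMSE for the simple model, for comparison with the CV estimate
pred <- predict(cv_model1, newdata = ames_test)
sqrt(mean((ames_test$Sale_Price - pred)^2))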
Let's continue with the remaining two models.
# cv for model 2
set.seed(123)
cv_model2 <- train(
form = Sale_Price ~ Gr_Liv_Area + Year_Built,
data = ames_train,
method = 'lm',
trControl = trainControl(method = 'cv', number = 10)
)
cv_model2
## Linear Regression
##
## 2049 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1843, 1844, 1844, 1844, 1844, 1844, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 46865.68 0.6631008 31695.48
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
The RMSE is $46,865.68: when applied to unseen data, model 2's predictions are off by about $46,865.68 on average, which is better than the first model.
# cv for model3
cv_model3 <- train(
form = Sale_Price ~.,
method = 'lm',
data = ames_train,
trControl = trainControl(method = 'cv', number = 10)
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
cv_model3
## Linear Regression
##
## 2049 samples
## 80 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1844, 1845, 1844, 1845, 1843, 1844, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 34315.55 0.8238844 16577.68
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
Extract the out-of-sample performance measures:
summary(resamples(list(
model1 = cv_model1,
model2 = cv_model2,
model3 = cv_model3
)))
##
## Call:
## summary.resamples(object = resamples(list(model1 = cv_model1, model2
## = cv_model2, model3 = cv_model3)))
##
## Models: model1, model2, model3
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model1 34076.73 37656.23 39785.18 38851.99 40200.92 42058.68 0
## model2 29227.14 30885.17 32003.59 31695.48 32710.41 33942.26 0
## model3 13016.48 15153.81 16616.51 16577.68 17858.46 20167.83 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model1 45604.65 55896.58 57000.74 56644.76 59544.08 66198.59 0
## model2 37174.26 42650.00 46869.84 46865.68 51155.14 55780.47 0
## model3 19016.47 22101.90 26422.35 34315.55 42506.56 69942.84 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model1 0.4230788 0.4621034 0.5090642 0.5102730 0.5681246 0.5996400 0
## model2 0.5829425 0.6075293 0.6865871 0.6631008 0.6976664 0.7254572 0
## model3 0.5254168 0.7460866 0.9075542 0.8238844 0.9264143 0.9353155 0
The best of the three models is model3, the full model.
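As a cross-check, the mean CV RMSE for each model can be pulled directly out of the resamples summary (a small sketch):
# Sketch: mean cross-validated RMSE per model, extracted programmatically
res <- resamples(list(model1 = cv_model1, model2 = cv_model2, model3 = cv_model3))
summary(res)$statistics$RMSE[, "Mean"]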
A non-linear relationship, such as the one between sale price and the year a home was built, can often be made linear or near-linear with a log transformation.
Let's plot the original and log-transformed relationships.
p1<-ggplot(ames_train, aes(Year_Built, Sale_Price)) +
geom_point(size = 1, alpha = 0.4) +
geom_smooth(se = FALSE) + # se = FALSE hides the confidence-interval ribbon
scale_y_continuous('Sale Price', labels = scales::dollar)+
xlab('Year Built')+
ggtitle(paste('Non-transformed variable with a\n',
'non-linear relationship.'))
p2<-ggplot(ames_train, aes(Year_Built, Sale_Price))+
geom_point(size = 1, alpha = 0.4)+
geom_smooth(method = 'lm', se = FALSE)+
scale_y_log10("Sale Price", labels = scales::dollar, breaks = seq(0,400000, by = 100000))+
xlab("Year built")+
ggtitle(paste("Transforming variables can provide a\n", "near-linear relationship."))
# align together
gridExtra::grid.arrange(p1,p2, nrow = 1)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
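The transformation can also be applied when fitting the model itself; a minimal sketch, modeling log(Sale_Price) on Year_Built only:
# Sketch: fit on the log scale; exp(coefficient) is the multiplicative change
# in expected price per additional year
model_log <- lm(log(Sale_Price) ~ Year_Built, data = ames_train)
exp(coef(model_log)["Year_Built"])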
Linear regression assumes that the variance among the error terms is constant (homoscedasticity). If the error variance is not constant, the p-values and confidence intervals for the coefficients will be invalid. Non-constant variance can often be resolved with variable transformations or by including additional predictors.
Let's compare model 1 and model 3 below. The cone shape in model 1's residuals is a sign of non-constant variance.
# augment() appends fitted values and residuals as new columns of the ames_train data frame
df1<-broom::augment(cv_model1$finalModel, data = ames_train)
p1<-ggplot(df1, aes(.fitted, .std.resid))+
geom_point(size = 1, alpha = 0.4)+
xlab('Predicted values')+
ylab('Residuals')+
ggtitle('Model 1', subtitle = 'Sale_Price ~ Gr_Liv_Area')
df2<-broom::augment(cv_model3$finalModel, data = ames_train)
p2<-ggplot(df2, aes(.fitted, .std.resid))+
geom_point(size = 1, alpha = 0.4)+
xlab("Predicted values")+
ylab("Residuals")+
ggtitle("Model3", subtitle = "Sale_Price ~.")
gridExtra::grid.arrange(p1,p2,nrow = 1)
## Warning: Removed 20 rows containing missing values (`geom_point()`).
Autocorrelation is a characteristic of data that shows the degree of similarity between values of the same variable over successive time intervals.
Linear regression assumes the errors are independent and uncorrelated. If there is correlation among the errors, the estimated standard errors of the coefficients will be biased, leading to prediction intervals that are narrower than they should be. (If the errors are autocorrelated, the residual plot shows a systematic up-and-down pattern.)
df1<-mutate(df1, id = row_number()) # add row number column
df2<-mutate(df2, id = row_number())
p1<-ggplot(df1, aes(id, .std.resid))+
geom_point(size = 1, alpha = 0.4)+
xlab('Row ID')+
ylab('Residuals')+
ggtitle("Model 1", subtitle = "Correlated residuals")
p2<-ggplot(df2, aes(id, .std.resid))+
geom_point(size = 1, alpha = 0.4)+
xlab("Row ID")+
ylab("Residuals")+
ggtitle("Model 3", subtitle = 'Uncorrelated residuals')
gridExtra::grid.arrange(p1,p2, nrow = 1)
## Warning: Removed 20 rows containing missing values (`geom_point()`).
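Beyond the visual check, a formal test can be run; a sketch using the Durbin-Watson test (this assumes the lmtest package is installed, which is not loaded above):
# Sketch (assumes the lmtest package is installed): Durbin-Watson test for
# autocorrelation in the residuals of model 1
lmtest::dwtest(model1)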
When the number of features exceeds the number of observations (p > n), the OLS estimates are not obtainable. To solve this, an analyst can remove variables one at a time until p < n; an alternative is to use regularized regression, as sketched below.
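A minimal sketch of that alternative, assuming the glmnet package is installed (it is not loaded above); a ridge fit remains estimable even when p > n:
# Sketch (assumes glmnet is installed): ridge regression on all predictors
X <- model.matrix(Sale_Price ~ ., data = ames_train)[, -1]  # dummy-encoded predictors
y <- ames_train$Sale_Price
ridge_fit <- glmnet::glmnet(X, y, alpha = 0)  # alpha = 0 gives the ridge penalty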
Collinearity refers to the situation in which two or more predictor variables are closely related to one another. Collinearity can cause predictor variables to appear statistically insignificant when in fact they are significant. This leads to inaccurate interpretation of the coefficients and makes it difficult to identify influential predictors.
For example, Garage_Area and Garage_Cars are two variables that have a correlation of 0.89, and both are strongly related to our response variable (Sale_Price). Looking at our full model, where both of these variables are included, we see that Garage_Area is found to be statistically significant but Garage_Cars is not:
summary(cv_model3)%>%
broom::tidy()%>%
filter(term %in%c("Garage_Area", "Garage_Cars"))
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Garage_Cars 3151. 1733. 1.82 0.0693
## 2 Garage_Area 11.9 5.69 2.10 0.0359
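A quick sketch to confirm the correlation between the two garage variables:
# Sketch: correlation between Garage_Cars and Garage_Area in the training data
cor(as.numeric(ames_train$Garage_Cars), ames_train$Garage_Area)  # roughly 0.89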
Principal component regression (PCR) is a regression technique based on principal component analysis: it uses the principal components, rather than the original features, as the predictor variables when estimating the coefficients of a standard linear regression model.
One major use of PCR lies in overcoming multicollinearity, which arises when two or more of the explanatory variables are close to being collinear. PCR handles such situations by excluding some of the low-variance principal components from the regression step. In addition, by regressing on only a subset of the principal components, PCR reduces dimensionality, substantially lowering the effective number of parameters characterizing the underlying model; this is particularly useful with high-dimensional covariates. Through appropriate selection of the components used for regression, PCR can also lead to efficient prediction of the outcome under the assumed model.
Performing PCR with caret is an easy extension from our previous model. We simply specify method = “pcr” within train() to perform PCA on all our numeric predictors prior to fitting the model.
# perform 10-fold cross-validation on a PCR model, tuning the number of principal components used as predictors from 1 to 100
set.seed(123)
cv_model_pcr<-train(
Sale_Price ~.,
data = ames_train,
method = 'pcr', # principal component regression
trControl = trainControl(method = 'cv', number = 10), # 10-fold cross validation
preProcess = c('zv','center','scale'),
tuneLength = 100 # 1 ~ 100 variables
)
# model with lowest RMSE
cv_model_pcr$bestTune
## ncomp
## 97 97
# results for model with lowest RMSE
cv_model_pcr$results %>%
dplyr::filter(ncomp == pull(cv_model_pcr$bestTune))
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 97 33929.87 0.8229825 21113.28 2635.455 0.02313147 749.9407
# plot cross-validation RMSE
ggplot(cv_model_pcr)
An alternative approach to reduce the impact of multicollinearity is partial least squares.
Partial least squares (PLS) can be viewed as a supervised dimension reduction procedure
set.seed(123)
cv_model_pls <-train(
Sale_Price ~.,
data = ames_train,
method = 'pls',
trControl = trainControl(method = 'cv', number = 10),
preProcess = c('zv','center','scale'),
tuneLength = 30
)
# model with the lowest RMSE
cv_model_pls$bestTune
## ncomp
## 3 3
# result for model with lowest RMSE
library(dplyr)
cv_model_pls$results %>%
filter(ncomp ==pull(cv_model_pls$bestTune))
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 3 31077.42 0.8493617 19137.51 7878.004 0.07079317 1784.467
# plot cross-validated RMSE
ggplot(cv_model_pls)
Once we've found the model that maximizes predictive accuracy, our next goal is to interpret the model structure. Variable importance seeks to identify the variables that are most influential in our model. For linear regression models, this is most often measured by the absolute value of the t-statistic for each model parameter.
# variable importance
vip(cv_model_pls, num_features = 20, method = 'model')
## Warning: package 'pls' was built under R version 4.2.2
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
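For an ordinary lm fit such as cv_model3, that |t-statistic| ranking can be reproduced by hand; a small sketch:
# Sketch: rank terms of the full linear model by absolute t-statistic
broom::tidy(cv_model3$finalModel) %>%
mutate(abs_t = abs(statistic)) %>%
arrange(desc(abs_t)) %>%
slice_head(n = 5)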
Partial dependence plots (PDPs) plot the change in the average predicted value (ŷ) as specified feature(s) vary over their marginal distribution. PDPs of linear models help illustrate how a fixed change in x relates to a fixed linear change in ŷ while taking into account the average effect of all the other features in the model.
library(pdp)
## Warning: package 'pdp' was built under R version 4.2.2
a<-partial(cv_model_pls, 'Gr_Liv_Area', grid.resolution = 20, plot = TRUE)
b<-partial(cv_model_pls, 'First_Flr_SF', grid.resolution = 20, plot = TRUE)
c<-partial(cv_model_pls, 'Garage_Area', grid.resolution = 20, plot = TRUE)
d<-partial(cv_model_pls, 'Total_Bsmt_SF', grid.resolution = 20, plot = TRUE)
gridExtra::grid.arrange(a,b,c,d, nrow = 2)