require(ISLR)
Carseats <- read.csv(file = 'C:/Users/user0522/Downloads/dataset-11424.csv')
I made a copy of Carseats.
Because library() does not work on R
markdown, instead, I use require().
dim(Carseats)
## [1] 400 11
Total rows are 400 and total columns are 11.
names(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
summary(Carseats)
## Sales CompPrice Income Advertising
## Min. : 0.000 Min. : 77 Min. : 21.00 Min. : 0.000
## 1st Qu.: 5.390 1st Qu.:115 1st Qu.: 42.75 1st Qu.: 0.000
## Median : 7.490 Median :125 Median : 69.00 Median : 5.000
## Mean : 7.496 Mean :125 Mean : 68.66 Mean : 6.635
## 3rd Qu.: 9.320 3rd Qu.:135 3rd Qu.: 91.00 3rd Qu.:12.000
## Max. :16.270 Max. :175 Max. :120.00 Max. :29.000
## Population Price ShelveLoc Age
## Min. : 10.0 Min. : 24.0 Length:400 Min. :25.00
## 1st Qu.:139.0 1st Qu.:100.0 Class :character 1st Qu.:39.75
## Median :272.0 Median :117.0 Mode :character Median :54.50
## Mean :264.8 Mean :115.8 Mean :53.32
## 3rd Qu.:398.5 3rd Qu.:131.0 3rd Qu.:66.00
## Max. :509.0 Max. :191.0 Max. :80.00
## Education Urban US
## Min. :10.0 Length:400 Length:400
## 1st Qu.:12.0 Class :character Class :character
## Median :14.0 Mode :character Mode :character
## Mean :13.9
## 3rd Qu.:16.0
## Max. :18.0
lm.model_1 <- lm(Carseats$Sales ~ Carseats$Price + Carseats$Advertising)
summary(lm.model_1)
##
## Call:
## lm(formula = Carseats$Sales ~ Carseats$Price + Carseats$Advertising)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9011 -1.5470 -0.0223 1.5361 6.3748
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.003427 0.606850 21.428 < 2e-16 ***
## Carseats$Price -0.054613 0.005078 -10.755 < 2e-16 ***
## Carseats$Advertising 0.123107 0.018079 6.809 3.64e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.399 on 397 degrees of freedom
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2782
## F-statistic: 77.91 on 2 and 397 DF, p-value: < 2.2e-16
lm.model_2 <- lm(Sales ~ Price + Advertising + Education, data = Carseats[1:200,])
summary(lm.model_2)
##
## Call:
## lm(formula = Sales ~ Price + Advertising + Education, data = Carseats[1:200,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1095 -1.4929 -0.1362 1.3478 6.3480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.882594 1.285484 10.800 < 2e-16 ***
## Price -0.057705 0.007311 -7.893 2.05e-13 ***
## Advertising 0.152776 0.028626 5.337 2.60e-07 ***
## Education -0.054431 0.065058 -0.837 0.404
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.447 on 196 degrees of freedom
## Multiple R-squared: 0.3201, Adjusted R-squared: 0.3097
## F-statistic: 30.76 on 3 and 196 DF, p-value: 2.432e-16
require(arm)
coefplot(lm.model_1)
coefplot(lm.model_2)
lm.model_3 <- lm(Carseats$Sales ~ Carseats$Income + Carseats$Age + Carseats$Education + Carseats$Price)
summary(lm.model_3)
##
## Call:
## lm(formula = Carseats$Sales ~ Carseats$Income + Carseats$Age +
## Carseats$Education + Carseats$Price)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.907 -1.611 -0.228 1.705 7.189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.242705 1.042763 15.577 < 2e-16 ***
## Carseats$Income 0.012319 0.004285 2.875 0.00426 **
## Carseats$Age -0.048570 0.007417 -6.548 1.81e-10 ***
## Carseats$Education -0.040663 0.045687 -0.890 0.37399
## Carseats$Price -0.055590 0.005083 -10.936 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.387 on 395 degrees of freedom
## Multiple R-squared: 0.2925, Adjusted R-squared: 0.2853
## F-statistic: 40.82 on 4 and 395 DF, p-value: < 2.2e-16
#store output that will be printed later.
outputList <- list()
# Mean Squared Error
mse <- mean((lm.model_3$residuals)^2)
outputList <- append(outputList, c("MSE", mse))
# root Mean Squared Error
rmse <- sqrt(mse)
outputList <- append(outputList, c("RMSE", rmse))
# RSS (residual sum of squares)
rss <- sum((lm.model_3$residuals)^2)
outputList <- append(outputList, c("RSS", rss))
# RSE (Residual Standard Error)
numerator <- sum((lm.model_3$residuals)^2)
df <- length(lm.model_3$residuals) - (length(lm.model_3$coefficients) - 1) - 1
rse <- sqrt(numerator / df)
outputList <- append(outputList, c("RSE", rse))
# TSS(total sum of square)
rSquared <- 0.2925
tss <- rss / (1 - rSquared)
# IF I put: tss <- sum((Carseats$Sales - mean(Carseats$Sales))^2)
# tss == 3182.275
outputList <- append(outputList, c("TSS", tss))
for (elements in outputList){
print(elements)
}
## [1] "MSE"
## [1] "5.62888456473612"
## [1] "RMSE"
## [1] "2.37252704193991"
## [1] "RSS"
## [1] "2251.55382589445"
## [1] "RSE"
## [1] "2.38749581530255"
## [1] "TSS"
## [1] "3182.40823447979"
anova(lm.model_3)
## Analysis of Variance Table
##
## Response: Carseats$Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Carseats$Income 1 73.48 73.48 12.8902 0.0003719 ***
## Carseats$Age 1 169.97 169.97 29.8184 8.405e-08 ***
## Carseats$Education 1 5.60 5.60 0.9823 0.3222376
## Carseats$Price 1 681.68 681.68 119.5896 < 2.2e-16 ***
## Residuals 395 2251.55 5.70
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
price accounts for the largest variance in the model.
Bias is the difference between the expected value of the estimator and the actual value. Variance here is the variance of estimated value. If high bias exists, it leads to high probability of wrong prediction. Instead, if high variance exists, it might make some good prediction but each prediction points count possibly varies a lot.
lm.model_4 <- lm(Carseats$Sales ~ Carseats$Income + Carseats$Age)
lm.model_5 <- lm(Carseats$Sales ~ Carseats$Income + Carseats$Age + Carseats$Education + Carseats$Price)
#define function to calculate bias
calc_bias <- function(predicted, actual) {
mean(sum((predicted - actual)^2))
}
#bias
bias_model4 <- calc_bias(lm.model_4$fitted.values, Carseats$Sales)
bias_model5 <- calc_bias(lm.model_5$fitted.values, Carseats$Sales)
# variance (the sum of variance-covariance matrix)
var_model4 <- sum(vcov(lm.model_4))
var_model5 <- sum(vcov(lm.model_5))
# dataframe
table <- data.frame("Model" = c("Model 4", "Model 5"), "Bias" = c(bias_model4, bias_model5), "Variance" = c(var_model4, var_model5))
#install.package("gridExtra")
require(gridExtra)
require(grid)
grid.table(table)
It literally shows that if more independent variables involve in a regression model the fitted values make better prediction while the variance of fitted values is higher. That is, there’s a trade-off.