A description of variable names and measures can be seen here.
# install.packages("ISLR")
library(ISLR)
## load the data
data(Carseats)
## Check out variable "names" in the data
names(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
## Check out the dimension of the data
dim(Carseats)
## [1] 400 11
## Look at summary statistics to see what's inside this dataframe and variable characteristics
summary(Carseats)
## Sales CompPrice Income Advertising
## Min. : 0.000 Min. : 77 Min. : 21.00 Min. : 0.000
## 1st Qu.: 5.390 1st Qu.:115 1st Qu.: 42.75 1st Qu.: 0.000
## Median : 7.490 Median :125 Median : 69.00 Median : 5.000
## Mean : 7.496 Mean :125 Mean : 68.66 Mean : 6.635
## 3rd Qu.: 9.320 3rd Qu.:135 3rd Qu.: 91.00 3rd Qu.:12.000
## Max. :16.270 Max. :175 Max. :120.00 Max. :29.000
## Population Price ShelveLoc Age Education
## Min. : 10.0 Min. : 24.0 Bad : 96 Min. :25.00 Min. :10.0
## 1st Qu.:139.0 1st Qu.:100.0 Good : 85 1st Qu.:39.75 1st Qu.:12.0
## Median :272.0 Median :117.0 Medium:219 Median :54.50 Median :14.0
## Mean :264.8 Mean :115.8 Mean :53.32 Mean :13.9
## 3rd Qu.:398.5 3rd Qu.:131.0 3rd Qu.:66.00 3rd Qu.:16.0
## Max. :509.0 Max. :191.0 Max. :80.00 Max. :18.0
## Urban US
## No :118 No :142
## Yes:282 Yes:258
##
##
##
##
## Accessing the DV (Sales) and check out its distribution: use $ to subset variable(s) from the dataframe
hist(Carseats$Sales)
## A univariate model: estimating Sales only as a function of Price. Note that there are three ways to do this:
lm.fit.a <- lm(Sales ~ Price, data = Carseats)
lm.fit.b <- lm(Carseats$Sales ~ Carseats$Price)
attach(Carseats)
lm.fit.c <- lm(Sales ~ Price)
detach(Carseats)
## All the above methods should obtain the same result, you can verify this with summary() function
summary(lm.fit.a)
##
## Call:
## lm(formula = Sales ~ Price, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5224 -1.8442 -0.1459 1.6503 7.5108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.641915 0.632812 21.558 <2e-16 ***
## Price -0.053073 0.005354 -9.912 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.532 on 398 degrees of freedom
## Multiple R-squared: 0.198, Adjusted R-squared: 0.196
## F-statistic: 98.25 on 1 and 398 DF, p-value: < 2.2e-16
## Use plot(x, y) function to plot the relationship between Sales and Price. Specify x and y variable, point size (cex), symbol color (col), pch (point shape), title and axis names for the main header, x- and y-axis
plot(Carseats$Price, Carseats$Sales, cex = 1.3, col = "red", pch = 19, main = "The relationship between Car Sales and Seat Price", xlab = "Seat Price", ylab = "Car Sales")
## Add the predicted line onto the graph
abline(lm.fit.a)
## Multivariate linear model
lm.fit.2 <- lm(Sales ~ Price + Advertising, data = Carseats)
summary(lm.fit.2)
##
## Call:
## lm(formula = Sales ~ Price + Advertising, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9011 -1.5470 -0.0223 1.5361 6.3748
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.003427 0.606850 21.428 < 2e-16 ***
## Price -0.054613 0.005078 -10.755 < 2e-16 ***
## Advertising 0.123107 0.018079 6.809 3.64e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.399 on 397 degrees of freedom
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2782
## F-statistic: 77.91 on 2 and 397 DF, p-value: < 2.2e-16
lm.fit.3 <- lm(Sales ~ Price + Advertising + Income, data = Carseats)
summary(lm.fit.3)
##
## Call:
## lm(formula = Sales ~ Price + Advertising + Income, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4568 -1.5420 -0.0753 1.4975 6.7877
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.172701 0.682736 17.829 < 2e-16 ***
## Price -0.053836 0.005051 -10.658 < 2e-16 ***
## Advertising 0.120237 0.017985 6.685 7.85e-11 ***
## Income 0.011066 0.004276 2.588 0.01 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.382 on 396 degrees of freedom
## Multiple R-squared: 0.2938, Adjusted R-squared: 0.2884
## F-statistic: 54.91 on 3 and 396 DF, p-value: < 2.2e-16
Calculating MSE, RMSE, RSS, RSE, R\(^2\), and ANOVA table
## MSE
library(ISLR)
data(Carseats)
lm.fit <- lm(Sales ~ Price, data = Carseats)
## Check out what's inside the lm.fit object
names(lm.fit) # Show a list of variable names
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
## Calculate relevant statistics
mse <- mean(residuals(lm.fit)^2)
mse
## [1] 6.380611
## RMSE
rmse <- sqrt(mse)
rmse
## [1] 2.525987
## RSS
rss <- sum(residuals(lm.fit)^2)
rss
## [1] 2552.244
## RSE (or RMS)
## You should verify the nrow() of the df of your model output. You can do this by enter "lm.fit$df.residual" in your command line and see if R throws you 398, because the df for our base model is calculated as 400 - Y - X (only 1 variable used) = 398
rse <- sqrt(sum(residuals(lm.fit)^2) / lm.fit$df.residual)
rse
## [1] 2.532326
## R-squared is in the output :-)
0.198
## [1] 0.198
## You can also retrieve the exact r^2 from model output using subset "$"
names(summary(lm.fit))
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
rsq <- summary(lm.fit)$r.squared
## TSS
tss <- rss / (1 - rsq)
tss
## [1] 3182.275
## ANOVA table
## One-way anova
anova(lm.fit)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 98.248 < 2.2e-16 ***
## Residuals 398 2552.24 6.41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Another way to do ANOVA is to use the built-in aov() function (aov = analysis of variance)
aov(Sales ~ Price, data = Carseats)
## Call:
## aov(formula = Sales ~ Price, data = Carseats)
##
## Terms:
## Price Residuals
## Sum of Squares 630.0304 2552.2443
## Deg. of Freedom 1 398
##
## Residual standard error: 2.532326
## Estimated effects may be unbalanced
## Two-way anova
two_way <- lm(Sales ~ Price + Advertising, data = Carseats)
anova(two_way)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 109.447 < 2.2e-16 ***
## Advertising 1 266.91 266.91 46.367 3.64e-11 ***
## Residuals 397 2285.33 5.76
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Download the data from our shared folder"
Advertising <- read.csv(url("https://www.dropbox.com/s/xxzkmpwexorvtm8/Advertising.csv?dl=1"))
## Import the data from your own drive
## Advertising <- read.csv("your drive://your path/your folder/Advertising.csv")
## Alternatively, you can first specify your working directory (wd), then load the data from there. That way, you won't need to specify path and folder
## setwd("your drive://your path/your folder")
## Advertising <- read.csv("Advertising.csv")
## Summary statistics and dimension
summary(Advertising)
## X TV radio newspaper
## Min. : 1.00 Min. : 0.70 Min. : 0.000 Min. : 0.30
## 1st Qu.: 50.75 1st Qu.: 74.38 1st Qu.: 9.975 1st Qu.: 12.75
## Median :100.50 Median :149.75 Median :22.900 Median : 25.75
## Mean :100.50 Mean :147.04 Mean :23.264 Mean : 30.55
## 3rd Qu.:150.25 3rd Qu.:218.82 3rd Qu.:36.525 3rd Qu.: 45.10
## Max. :200.00 Max. :296.40 Max. :49.600 Max. :114.00
## sales
## Min. : 1.60
## 1st Qu.:10.38
## Median :12.90
## Mean :14.02
## 3rd Qu.:17.40
## Max. :27.00
dim(Advertising)
## [1] 200 5
## Structure of the data
str(Advertising)
## 'data.frame': 200 obs. of 5 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ TV : num 230.1 44.5 17.2 151.5 180.8 ...
## $ radio : num 37.8 39.3 45.9 41.3 10.8 48.9 32.8 19.6 2.1 2.6 ...
## $ newspaper: num 69.2 45.1 69.3 58.5 58.4 75 23.5 11.6 1 21.2 ...
## $ sales : num 22.1 10.4 9.3 18.5 12.9 7.2 11.8 13.2 4.8 10.6 ...
## Check out variable names: X is just row index, so get rid of it!
names(Advertising)
## [1] "X" "TV" "radio" "newspaper" "sales"
## Subsetting data: remove the first column, remember in R data.frame, the data format reads like df[row, col]
new_data <- Advertising[, -1]
## Rename the data by its previous name
Advertising <- new_data
## Fitting models of different specifications
names(Advertising)
## [1] "TV" "radio" "newspaper" "sales"
attach(Advertising) # to save on typing
lm.fit1 <- lm(sales ~ TV)
lm.fit2 <- lm(sales ~ TV + newspaper)
lm.fit3 <- lm(sales ~ TV + newspaper + radio)
# writing your own function
# Bias is the average squared difference between predicted (y hat) and actual values (y), which is just the mean of sum((y^hat - y)^2)
calc_bias = function(predicted, actual) {
mean(sum((predicted - actual)^2))
}
# Apply this function to all three models
bias1 <- calc_bias(lm.fit1$fitted.values, sales)
bias2 <- calc_bias(lm.fit2$fitted.values, sales)
bias3 <- calc_bias(lm.fit3$fitted.values, sales)
# Calculate the RSS: simply sum over model ouput's variance-covariance matrix (vcov)
var1 <- sum(vcov(lm.fit1))
var2 <- sum(vcov(lm.fit2))
var3 <- sum(vcov(lm.fit3))
# root MSE is the square root of MSE
calc_rmse = function(actual, predicted) {
sqrt(mean((actual - predicted) ^ 2))
}
rmse1 <- calc_rmse(sales, lm.fit1$fitted.values)
rmse2 <- calc_rmse(sales, lm.fit2$fitted.values)
rmse3 <- calc_rmse(sales, lm.fit3$fitted.values)
# Generate a dataframe object called "table," list column (variable) names and values in each column. If the entry value is a text string, you should enclose it with " ".
table <- data.frame(Model = c("Model 1", "Model 2", "Model 3"), Bias = c(bias1, bias2, bias3), "Variance" = c(var1, var2, var3), RMSE = c(rmse1, rmse2, rmse3))
# Make it into a table for the ease of visual presentation
# install.packages("gridExtra")
library(gridExtra)
library(grid)
grid.table(table)
# Getting the right balance (K) using ridge regression
## Bias-variance tradeoff
# Load required package
# install.packages("lmridge")
library(lmridge)
# Using seg() to tell R to generate the scale for the X-axis, running from -0.5 to 0.3, with an increment of 0.002
mod <- lmridge(sales~., as.data.frame(Advertising), K = seq(-0.5, 0.3, 0.002))
# Horizontal and vertical lines show the minimum value of the (ridge) MSE at a given value of the biasing parameter K
par(mar=c(2,2,1,1)) # pop-up browser layout mar=c(bottom, left, top, right)
bias.plot(mod, abline = TRUE)
# Tell R the x-y coordinate to place the legend (x, y, )
legend(0.1, 12000, legend=c("Variance", "Bias^2", "MSE"),
col=c("black", "red", "green"), lty=c(1, 4, 2), cex=0.8)
Curve-fitting on Advertising data
Advertising <- read.csv("https://www.dropbox.com/s/xxzkmpwexorvtm8/Advertising.csv?dl=1")
attach(Advertising)
## The following objects are masked from Advertising (pos = 6):
##
## newspaper, radio, sales, TV
lm.fit1 <- lm(sales ~ TV)
lm.fit2 <- lm(sales ~ TV + newspaper)
lm.fit3 <- lm(sales ~ TV + newspaper + radio)
IntPoint2 <- data.frame(TV = 0,
newspaper = mean(Advertising$newspaper))
IntPoint3 <- data.frame(TV = 0,
newspaper = mean(Advertising$newspaper),
radio = mean(Advertising$radio))
Int2 = predict(lm.fit2, IntPoint2)
Int3 = predict(lm.fit3, IntPoint3)
plot(TV, sales, cex = 1.3, col = "red", pch = 19, main = "The relationship between Sales and TV", xlab = "TV", ylab = "Sales")
abline(lm.fit1, col="red")
abline(Int2, lm.fit2$coefficients[2], col="green")
abline(Int3, lm.fit3$coefficients[2], col="blue")