Statistical Learning: Week 3

Fitting a linear model

A description of variable names and measures can be seen here.

# install.packages("ISLR")
library(ISLR)
## load the data
data(Carseats)
## Check out variable "names" in the data
names(Carseats)

##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"

## Check out the dimension of the data
dim(Carseats)

## [1] 400  11

## Look at summary statistics to see what's inside this dataframe and variable characteristics 
summary(Carseats)

##      Sales          CompPrice       Income        Advertising    
##  Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
##  1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
##  Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
##  Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
##  3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
##  Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
##    Population        Price        ShelveLoc        Age          Education   
##  Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00   Min.   :10.0  
##  1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75   1st Qu.:12.0  
##  Median :272.0   Median :117.0   Medium:219   Median :54.50   Median :14.0  
##  Mean   :264.8   Mean   :115.8                Mean   :53.32   Mean   :13.9  
##  3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00   3rd Qu.:16.0  
##  Max.   :509.0   Max.   :191.0                Max.   :80.00   Max.   :18.0  
##  Urban       US     
##  No :118   No :142  
##  Yes:282   Yes:258  
##                     
##                     
##                     
##

## Accessing the DV (Sales) and check out its distribution: use $ to subset variable(s) from the dataframe
hist(Carseats$Sales)

## A univariate model: estimating Sales only as a function of Price. Note that there are three ways to do this:
lm.fit.a <- lm(Sales ~ Price, data = Carseats)
lm.fit.b <- lm(Carseats$Sales ~ Carseats$Price)

attach(Carseats)
lm.fit.c <- lm(Sales ~ Price)
detach(Carseats)

## All the above methods should obtain the same result, you can verify this with summary() function
summary(lm.fit.a)

## 
## Call:
## lm(formula = Sales ~ Price, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5224 -1.8442 -0.1459  1.6503  7.5108 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.641915   0.632812  21.558   <2e-16 ***
## Price       -0.053073   0.005354  -9.912   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.532 on 398 degrees of freedom
## Multiple R-squared:  0.198,  Adjusted R-squared:  0.196 
## F-statistic: 98.25 on 1 and 398 DF,  p-value: < 2.2e-16

## Use plot(x, y) function to plot the relationship between Sales and Price. Specify x and y variable, point size (cex), symbol color (col), pch (point shape), title and axis names for the main header, x- and y-axis

plot(Carseats$Price, Carseats$Sales, cex = 1.3, col = "red", pch = 19, main = "The relationship between Car Sales and Seat Price", xlab = "Seat Price", ylab = "Car Sales")

## Add the predicted line onto the graph
abline(lm.fit.a)

## Multivariate linear model
lm.fit.2 <- lm(Sales ~ Price + Advertising, data = Carseats)
summary(lm.fit.2)

## 
## Call:
## lm(formula = Sales ~ Price + Advertising, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9011 -1.5470 -0.0223  1.5361  6.3748 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.003427   0.606850  21.428  < 2e-16 ***
## Price       -0.054613   0.005078 -10.755  < 2e-16 ***
## Advertising  0.123107   0.018079   6.809 3.64e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.399 on 397 degrees of freedom
## Multiple R-squared:  0.2819, Adjusted R-squared:  0.2782 
## F-statistic: 77.91 on 2 and 397 DF,  p-value: < 2.2e-16

lm.fit.3 <- lm(Sales ~ Price + Advertising + Income, data = Carseats)
summary(lm.fit.3)

## 
## Call:
## lm(formula = Sales ~ Price + Advertising + Income, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4568 -1.5420 -0.0753  1.4975  6.7877 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.172701   0.682736  17.829  < 2e-16 ***
## Price       -0.053836   0.005051 -10.658  < 2e-16 ***
## Advertising  0.120237   0.017985   6.685 7.85e-11 ***
## Income       0.011066   0.004276   2.588     0.01 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.382 on 396 degrees of freedom
## Multiple R-squared:  0.2938, Adjusted R-squared:  0.2884 
## F-statistic: 54.91 on 3 and 396 DF,  p-value: < 2.2e-16

Various measures of variance and ANOVA

Calculating MSE, RMSE, RSS, RSE, R\(^2\), and ANOVA table

## MSE
library(ISLR)
data(Carseats)
lm.fit <- lm(Sales ~ Price, data = Carseats)
## Check out what's inside the lm.fit object
names(lm.fit)  # Show a list of variable names

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

## Calculate relevant statistics
mse <- mean(residuals(lm.fit)^2)
mse

## [1] 6.380611

## RMSE
rmse <- sqrt(mse)
rmse

## [1] 2.525987

## RSS
rss <- sum(residuals(lm.fit)^2)
rss

## [1] 2552.244

## RSE (or RMS)
## You should verify the nrow() of the df of your model output. You can do this by enter "lm.fit$df.residual" in your command line and see if R throws you 398, because the df for our base model is calculated as 400 - Y - X (only 1 variable used) = 398
rse <- sqrt(sum(residuals(lm.fit)^2) / lm.fit$df.residual)
rse

## [1] 2.532326

## R-squared is in the output  :-)
0.198

## [1] 0.198

## You can also retrieve the exact r^2 from model output using subset "$"
names(summary(lm.fit))

##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"

rsq <- summary(lm.fit)$r.squared

## TSS
tss <- rss / (1 - rsq)
tss

## [1] 3182.275

## ANOVA table
## One-way anova
anova(lm.fit)

## Analysis of Variance Table
## 
## Response: Sales
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## Price       1  630.03  630.03  98.248 < 2.2e-16 ***
## Residuals 398 2552.24    6.41                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Another way to do ANOVA is to use the built-in aov() function (aov = analysis of variance)
aov(Sales ~ Price, data = Carseats)

## Call:
##    aov(formula = Sales ~ Price, data = Carseats)
## 
## Terms:
##                     Price Residuals
## Sum of Squares   630.0304 2552.2443
## Deg. of Freedom         1       398
## 
## Residual standard error: 2.532326
## Estimated effects may be unbalanced

## Two-way anova
two_way <- lm(Sales ~ Price + Advertising, data = Carseats)
anova(two_way)

## Analysis of Variance Table
## 
## Response: Sales
##              Df  Sum Sq Mean Sq F value    Pr(>F)    
## Price         1  630.03  630.03 109.447 < 2.2e-16 ***
## Advertising   1  266.91  266.91  46.367  3.64e-11 ***
## Residuals   397 2285.33    5.76                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variance-bias tradeoff

## Download the data from our shared folder"
Advertising <- read.csv(url("https://www.dropbox.com/s/xxzkmpwexorvtm8/Advertising.csv?dl=1"))

## Import the data from your own drive
## Advertising <- read.csv("your drive://your path/your folder/Advertising.csv")

## Alternatively, you can first specify your working directory (wd), then load the data from there. That way, you won't need to specify path and folder
## setwd("your drive://your path/your folder")
## Advertising <- read.csv("Advertising.csv")

## Summary statistics and dimension
summary(Advertising)

##        X                TV             radio          newspaper     
##  Min.   :  1.00   Min.   :  0.70   Min.   : 0.000   Min.   :  0.30  
##  1st Qu.: 50.75   1st Qu.: 74.38   1st Qu.: 9.975   1st Qu.: 12.75  
##  Median :100.50   Median :149.75   Median :22.900   Median : 25.75  
##  Mean   :100.50   Mean   :147.04   Mean   :23.264   Mean   : 30.55  
##  3rd Qu.:150.25   3rd Qu.:218.82   3rd Qu.:36.525   3rd Qu.: 45.10  
##  Max.   :200.00   Max.   :296.40   Max.   :49.600   Max.   :114.00  
##      sales      
##  Min.   : 1.60  
##  1st Qu.:10.38  
##  Median :12.90  
##  Mean   :14.02  
##  3rd Qu.:17.40  
##  Max.   :27.00

dim(Advertising)

## [1] 200   5

## Structure of the data
str(Advertising)

## 'data.frame':    200 obs. of  5 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ TV       : num  230.1 44.5 17.2 151.5 180.8 ...
##  $ radio    : num  37.8 39.3 45.9 41.3 10.8 48.9 32.8 19.6 2.1 2.6 ...
##  $ newspaper: num  69.2 45.1 69.3 58.5 58.4 75 23.5 11.6 1 21.2 ...
##  $ sales    : num  22.1 10.4 9.3 18.5 12.9 7.2 11.8 13.2 4.8 10.6 ...

## Check out variable names: X is just row index, so get rid of it!
names(Advertising)

## [1] "X"         "TV"        "radio"     "newspaper" "sales"

## Subsetting data: remove the first column, remember in R data.frame, the data format reads like df[row, col]
new_data <- Advertising[, -1]

## Rename the data by its previous name
Advertising <- new_data


## Fitting models of different specifications

names(Advertising)

## [1] "TV"        "radio"     "newspaper" "sales"

attach(Advertising)  # to save on typing

lm.fit1 <- lm(sales ~ TV)
lm.fit2 <- lm(sales ~ TV + newspaper)
lm.fit3 <- lm(sales ~ TV + newspaper + radio)

# writing your own function
# Bias is the average squared difference between predicted (y hat) and actual values (y), which is just the mean of sum((y^hat - y)^2)
calc_bias = function(predicted, actual) {
   mean(sum((predicted - actual)^2))
}

# Apply this function to all three models
bias1 <- calc_bias(lm.fit1$fitted.values, sales)
bias2 <- calc_bias(lm.fit2$fitted.values, sales)
bias3 <- calc_bias(lm.fit3$fitted.values, sales)

# Calculate the RSS: simply sum over model ouput's variance-covariance matrix (vcov)
var1 <- sum(vcov(lm.fit1))
var2 <- sum(vcov(lm.fit2))
var3 <- sum(vcov(lm.fit3))

# root MSE is the square root of MSE
calc_rmse = function(actual, predicted) {
  sqrt(mean((actual - predicted) ^ 2))
}

rmse1 <- calc_rmse(sales, lm.fit1$fitted.values)
rmse2 <- calc_rmse(sales, lm.fit2$fitted.values)
rmse3 <- calc_rmse(sales, lm.fit3$fitted.values)

# Generate a dataframe object called "table," list column (variable) names and values in each column. If the entry value is a text string, you should enclose it with " ".
table <- data.frame(Model = c("Model 1", "Model 2", "Model 3"), Bias = c(bias1, bias2, bias3), "Variance" = c(var1, var2, var3), RMSE = c(rmse1, rmse2, rmse3))

# Make it into a table for the ease of visual presentation
# install.packages("gridExtra")
library(gridExtra)
library(grid)
grid.table(table)

# Getting the right balance (K) using ridge regression
## Bias-variance tradeoff


# Load required package
# install.packages("lmridge")
library(lmridge)

# Using seg() to tell R to generate the scale for the X-axis, running from -0.5 to 0.3, with an increment of 0.002
mod <- lmridge(sales~., as.data.frame(Advertising), K = seq(-0.5, 0.3, 0.002))

# Horizontal and vertical lines show the minimum value of the (ridge) MSE at a given value of the biasing parameter K

par(mar=c(2,2,1,1))  # pop-up browser layout mar=c(bottom, left, top, right)
bias.plot(mod, abline = TRUE)
# Tell R the x-y coordinate to place the legend (x, y, )
legend(0.1, 12000, legend=c("Variance", "Bias^2", "MSE"),
       col=c("black", "red", "green"), lty=c(1, 4, 2), cex=0.8)

Supplementary materials

Curve-fitting on Advertising data

Advertising <- read.csv("https://www.dropbox.com/s/xxzkmpwexorvtm8/Advertising.csv?dl=1")
attach(Advertising)

## The following objects are masked from Advertising (pos = 6):
## 
##     newspaper, radio, sales, TV

lm.fit1 <- lm(sales ~ TV)
lm.fit2 <- lm(sales ~ TV + newspaper)
lm.fit3 <- lm(sales ~ TV + newspaper + radio)

IntPoint2 <- data.frame(TV = 0, 
    newspaper = mean(Advertising$newspaper))
IntPoint3 <- data.frame(TV = 0, 
    newspaper = mean(Advertising$newspaper), 
    radio = mean(Advertising$radio))

Int2 = predict(lm.fit2, IntPoint2)
Int3 = predict(lm.fit3, IntPoint3)

plot(TV, sales, cex = 1.3, col = "red", pch = 19, main = "The relationship between Sales and TV", xlab = "TV", ylab = "Sales")

abline(lm.fit1, col="red")
abline(Int2, lm.fit2$coefficients[2], col="green")
abline(Int3, lm.fit3$coefficients[2], col="blue")

Statistical Learning: Week 3

H. K. Tseng

03/04/2025

Fitting a linear model

Various measures of variance and ANOVA

Variance-bias tradeoff

Supplementary materials