library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
rm(list = ls())
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
theme_set(theme_bw())
• Compute a linear regression solution for the BAC dataset explicitly using the equations, rather than calling lm():
– Compute the coefficients β₀ and β₁ (the closed-form expressions are given after this list)
– Compute the fitted values Ŷᵢ
– Compute the residuals eᵢ
– Compute the R² statistic as a function of the correlation (cor()) between x (Beers) and Y (BAC)
• Display residual plots:
– Residuals eᵢ versus predictor values xᵢ
– Residuals eᵢ versus fitted values Ŷᵢ
• Discuss the results:
– When comparing the model with the one obtained by calling lm(), is there any major difference?
– When comparing the residual plots in the previous item, are the conclusions any different?
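For reference, these are the closed-form least-squares estimates that the code below implements:

$$\hat\beta_1 = \frac{n\sum x_i y_i - \sum x_i \sum y_i}{n\sum x_i^2 - (\sum x_i)^2}, \qquad \hat\beta_0 = \frac{\sum x_i^2 \sum y_i - \sum x_i \sum x_i y_i}{n\sum x_i^2 - (\sum x_i)^2} = \bar y - \hat\beta_1 \bar x,$$

and, in simple linear regression, $R^2 = \mathrm{cor}(x, y)^2$.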
bac <- read.csv('bac.csv')
x <- bac$Beers
y <- bac$BAC
# coefficients via the closed-form least-squares formulas
beta0 <- (sum(x^2) * sum(y) - (sum(x) * sum(x*y))) / (length(x) * sum(x^2) - sum(x)^2)
beta1 <- (length(x) * sum(x*y) - sum(x)*sum(y)) / (length(x) * sum(x^2) - sum(x)^2)
cat('Coefficients: \n beta0 = ', beta0, ' \n beta1 = ', beta1)
## Coefficients:
## beta0 = -0.0127006
## beta1 = 0.01796376
# fitted values
y_pred <- beta0 + beta1 * x
cat('Predicted BAC: ', y_pred)
## Predicted BAC: 0.07711821 0.02322692 0.1489733 0.1310095 0.04119068 0.1130457 0.04119068 0.07711821 0.04119068 0.07711821 0.05915444 0.09508197 0.07711821 0.1130457 0.005263158 0.05915444
# residuals
e_residuals <- y - y_pred
cat('Residuals: ', e_residuals)
## Residuals: 0.02288179 0.00677308 0.04102675 -0.01100949 -0.001190682 -0.01804573 0.02880932 -0.01711821 -0.02119068 -0.02711821 0.01084556 0.004918033 0.007881795 -0.02304573 0.004736842 -0.009154443
summary(e_residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.027118 -0.017350 0.001773 0.000000 0.008623 0.041027
# R square
rss <- sum(e_residuals^2)
tss <- sum((y-mean(y))^2)
R_square <- 1 - (rss/tss)
cat('R square: ', R_square)
## R square: 0.7998407
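The exercise asks for R² as a function of the correlation; since in simple linear regression R² is exactly the squared correlation, a one-line check (which should reproduce the value above) is:

# R^2 as the squared correlation between x and y
cor(x, y)^2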
# Residuals plots
plot(x=x, y=e_residuals, main='Residuals vs Predictors',
     xlab='Predictor (Beers)', ylab='Residuals', pch=c('❤'))
plot(x=y_pred, y=e_residuals, main='Residuals vs Fitted values',
     xlab='Fitted values', ylab='Residuals', pch=c('❤'))
dfModel <- data.frame(res=e_residuals, pred=y_pred)
ggplot(dfModel, aes(x=pred, y=res)) +
geom_point(size=3)+
geom_hline(aes(yintercept=0), linetype='dashed')+
geom_smooth(method='loess')+
ggtitle('Residuals vs Fitted values')
## `geom_smooth()` using formula 'y ~ x'
Discuss: the hand-calculated coefficients, fitted values, and R² match the lm() results up to rounding, and the residual plots lead to the same conclusions.
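A quick way to verify this (a sketch; the Beers and BAC columns are the ones loaded above):

bac_lm <- lm(BAC ~ Beers, data = bac)
coef(bac_lm)  # should match beta0 and beta1 computed by hand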
This exercise involves the diamonds dataset from the ggplot2 package.
• Using the lm() function, perform linear regression of price modelled as a function of weight (carat):
– Initially, without any transformation of variables
– Then repeat, but applying a log transformation to both input and output variables (the two model equations are written out below)
– Discuss the results
• Repeat the above now explicitly using the formulas, instead of calling the lm() function
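The two models being compared are

$$\text{price} = \beta_0 + \beta_1\,\text{carat} + \varepsilon \qquad \text{and} \qquad \log(\text{price}) = \beta_0 + \beta_1 \log(\text{carat}) + \varepsilon.$$

On the original scale the log-log model corresponds to a power law, price ≈ e^{β₀}·carat^{β₁}, so β₁ acts as an elasticity: a 1% increase in carat is associated with roughly a β₁% increase in price.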
diamonds.lm <- lm(data = diamonds, price~carat)
# names(summary(diamonds.lm))
summary(diamonds.lm)
##
## Call:
## lm(formula = price ~ carat, data = diamonds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18585.3 -804.8 -18.9 537.4 12731.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2256.36 13.06 -172.8 <2e-16 ***
## carat 7756.43 14.07 551.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1549 on 53938 degrees of freedom
## Multiple R-squared: 0.8493, Adjusted R-squared: 0.8493
## F-statistic: 3.041e+05 on 1 and 53938 DF, p-value: < 2.2e-16
plot(diamonds.lm,which=1)
diamonds_log.lm <- lm(data = diamonds, log(price)~log(carat))
# names(summary(diamonds.lm))
summary(diamonds_log.lm)
##
## Call:
## lm(formula = log(price) ~ log(carat), data = diamonds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.50833 -0.16951 -0.00591 0.16637 1.33793
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.448661 0.001365 6190.9 <2e-16 ***
## log(carat) 1.675817 0.001934 866.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2627 on 53938 degrees of freedom
## Multiple R-squared: 0.933, Adjusted R-squared: 0.933
## F-statistic: 7.51e+05 on 1 and 53938 DF, p-value: < 2.2e-16
plot(diamonds_log.lm,which=1)
Discuss: with and without log transformation.
The log-log model fits better (R² = 93%, median residual −0.006 on the log scale) than the untransformed model (R² = 85%, median residual −18.9); note the residual medians are on different scales, so the R² comparison is the more meaningful one.
Residual plots:
** Untransformed model: the residuals scatter in a clear pattern, and the lowess line deviates substantially from the residuals = 0 line.
** Log transformation: the residuals are randomly scattered from left to right (roughly constant variance), and the lowess line stays close to the 0 line.
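To compare the two residual plots side by side, one option (a sketch reusing the model objects fitted above):

par(mfrow = c(1, 2))              # two panels side by side
plot(diamonds.lm, which = 1)      # residuals vs fitted, original scale
plot(diamonds_log.lm, which = 1)  # residuals vs fitted, log-log scale
par(mfrow = c(1, 1))              # reset the plotting layout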
Calculating with the formulas: we reuse the closed-form expressions from exercise 1, just replacing x and y.
SLR_manual <- function(x,y) {
  # coefficients via the closed-form least-squares formulas
  beta0 <- (sum(x^2) * sum(y) - (sum(x) * sum(x*y))) / (length(x) * sum(x^2) - sum(x)^2)
  beta1 <- (length(x) * sum(x*y) - sum(x)*sum(y)) / (length(x) * sum(x^2) - sum(x)^2)
  cat('\nCoefficients: \n beta0 = ', beta0, ' \n beta1 = ', beta1)
  # fitted values
  y_pred <- beta0 + beta1 * x
  # cat('Fitted values: ', y_pred)
  # residuals
  e_residuals <- y - y_pred
  # cat('Residuals: ', e_residuals)
  summary(e_residuals)  # note: not auto-printed inside a function; wrap in print() to display
# R square
rss <- sum(e_residuals^2)
tss <- sum((y-mean(y))^2)
R_square <- 1 - (rss/tss)
cat('\nR square: ', R_square)
}
cat('\n### INITIAL MODEL ###')
##
## ### INITIAL MODEL ###
x <- diamonds$carat
y <- diamonds$price
SLR_manual(x,y)
##
## Coefficients:
## beta0 = -2256.361
## beta1 = 7756.426
## R square: 0.8493305
cat('\n\n### LOG TRANSFORMATION MODEL ###')
##
##
## ### LOG TRANSFORMATION MODEL ###
x <- log(diamonds$carat)
y <- log(diamonds$price)
SLR_manual(x,y)
##
## Coefficients:
## beta0 = 8.448661
## beta1 = 1.675817
## R square: 0.9329893
The coefficient matrix via the least-squares formula $\hat\beta = (X^T X)^{-1} X^T Y$:
set.seed(22)
x1 = runif(n = 100, min = 0, max = 1)
# 100 observations of predictor x1 randomly drawn from [0,1]
x2 = runif(n = 100, min = 0, max = 1)
# 100 observations of predictor x2 randomly drawn from [0,1]
Yi = 5 + 3*x1 + 7*x2 + rnorm(n = 100, mean = 0, sd = 1)
# Build vector Y = 5 + 3x1 + 7x2 + epsilon
dat = matrix(c(x1,x2,Yi), nrow = 100, ncol = 3, byrow = FALSE)
# Build data matrix
dim(dat)
## [1] 100 3
X= dat[,1:2]
Y= dat[,3]
beta_coeff = solve(t(X)%*%X) %*% (t(X) %*% Y)
beta_coeff
## [,1]
## [1,] 6.483094
## [2,] 11.746231
dat_df <- as.data.frame(dat)
colnames(dat_df) <- c('x1','x2','y')
dat_lm <- lm(data=dat_df,y~x1+x2 )
summary(dat_lm)
##
## Call:
## lm(formula = y ~ x1 + x2, data = dat_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.11170 -0.56708 0.06266 0.59903 1.96478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7249 0.2303 20.513 < 2e-16 ***
## x1 2.4977 0.3309 7.548 2.42e-11 ***
## x2 7.5838 0.3266 23.220 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9325 on 97 degrees of freedom
## Multiple R-squared: 0.8703, Adjusted R-squared: 0.8676
## F-statistic: 325.4 on 2 and 97 DF, p-value: < 2.2e-16
cat('\n*** Coefficients matrix using least square formula', beta_coeff)
##
## *** Coefficients matrix using least square formula 6.483094 11.74623
We can see that the coefficients we calculated differ from those returned by lm(). Were we wrong? No: our matrix calculation yields the coefficients of a model without an intercept. Calling lm() with a no-intercept formula (y ~ 0 + x1 + x2) shows that the slopes then agree.
summary(lm(data=dat_df,y~0+x1+x2 ))
##
## Call:
## lm(formula = y ~ 0 + x1 + x2, data = dat_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6832 -0.7092 0.8520 2.3164 4.9213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x1 6.4831 0.6157 10.53 <2e-16 ***
## x2 11.7462 0.5883 19.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.144 on 98 degrees of freedom
## Multiple R-squared: 0.9551, Adjusted R-squared: 0.9541
## F-statistic: 1041 on 2 and 98 DF, p-value: < 2.2e-16
So, can we calculate the coefficient matrix with an intercept? The answer is yes.
This is our model:

$$Y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \cdots + \beta_p x_p + \varepsilon = \beta_0 + X\beta + \varepsilon \quad (1)$$

Recall the least-squares estimation formula: $\hat\beta = (X^T X)^{-1} X^T Y$ gives the estimated coefficients for the model $Y = X\beta + \varepsilon \quad (2)$.

(1) can be rewritten as

$$Y = \beta_0 x_0 + X\beta + \varepsilon = X'\beta' + \varepsilon, \quad \text{where } x_0 = (1, 1, \ldots, 1)^T \text{ and } X' = \mathrm{cbind}(x_0, X),$$

which has exactly the form of (2). Thus, applying the least-squares formula to the augmented matrix X' gives the coefficient matrix of the initial model, intercept included.
# bind a column x0 = (1, 1, ..., 1) to X
X= cbind(rep(1,100),dat[,1:2])
Y= dat[,3]
beta_coeff = solve(t(X)%*%X) %*% (t(X) %*% Y)
cat('Estimated coefficient matrix with intercept,', beta_coeff)
## Estimated coefficient matrix with intercept, 4.724859 2.497686 7.583763
beta_coeff
## [,1]
## [1,] 4.724859
## [2,] 2.497686
## [3,] 7.583763
dat_df <- as.data.frame(dat)
colnames(dat_df) <- c('x1','x2','y')
dat_lm <- lm(data=dat_df,y~x1+x2 )
summary(dat_lm)
##
## Call:
## lm(formula = y ~ x1 + x2, data = dat_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.11170 -0.56708 0.06266 0.59903 1.96478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7249 0.2303 20.513 < 2e-16 ***
## x1 2.4977 0.3309 7.548 2.42e-11 ***
## x2 7.5838 0.3266 23.220 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9325 on 97 degrees of freedom
## Multiple R-squared: 0.8703, Adjusted R-squared: 0.8676
## F-statistic: 325.4 on 2 and 97 DF, p-value: < 2.2e-16
The coefficients from the matrix calculation and from lm() now agree: 4.7248587, 2.4976863, 7.5837634.
plot(dat_lm,which=1)
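To line the two sets of estimates up in one table (a quick check; both objects were computed above):

cbind(manual = beta_coeff[, 1], lm = coef(dat_lm))  # the columns should agree to numerical precision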
Recompute the least squares coefficients for the cherry dataset in our previous example, but now using the matrix formulation, and compare.
cherry <- read.csv('cherry.csv')
X <- cherry %>% select(Diam, Height) %>% mutate(Diam2 = Diam*Diam) %>%
mutate(intercept=1) %>% relocate(intercept)
X <- as.matrix(X)
Y <- cherry %>% select(Volume)
Y <- as.matrix(Y)
beta_coeff <- solve(t(X)%*%X) %*% (t(X) %*% Y)
cat('\nUsing formula\n')
##
## Using formula
beta_coeff
## Volume
## intercept -9.9204060
## Diam -2.8850787
## Height 0.3763873
## Diam2 0.2686224
cat('Using lm()')
## Using lm()
cherry_lm <- lm(Volume ~ Diam + I(Diam^2) + Height , data=cherry)
summary(cherry_lm)
##
## Call:
## lm(formula = Volume ~ Diam + I(Diam^2) + Height, data = cherry)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2928 -1.6693 -0.1018 1.7851 4.3489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.92041 10.07911 -0.984 0.333729
## Diam -2.88508 1.30985 -2.203 0.036343 *
## I(Diam^2) 0.26862 0.04590 5.852 3.13e-06 ***
## Height 0.37639 0.08823 4.266 0.000218 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.625 on 27 degrees of freedom
## Multiple R-squared: 0.9771, Adjusted R-squared: 0.9745
## F-statistic: 383.2 on 3 and 27 DF, p-value: < 2.2e-16
plot(cherry_lm,which = 1,pch=c('🍒'))
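To compare the two cherry fits directly, reorder lm()'s coefficients to match the column order of X (a quick check):

cbind(formula = beta_coeff[, 1],
      lm = coef(cherry_lm)[c('(Intercept)', 'Diam', 'Height', 'I(Diam^2)')])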
bp <- read.csv('bloodpressure.csv')
bp_lm <- lm(data=bp,SystolicBloodPressure~Age )
summary(bp_lm)
##
## Call:
## lm(formula = SystolicBloodPressure ~ Age, data = bp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.724 -6.994 -0.520 2.931 75.654
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.7147 10.0005 9.871 1.28e-10 ***
## Age 0.9709 0.2102 4.618 7.87e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.31 on 28 degrees of freedom
## Multiple R-squared: 0.4324, Adjusted R-squared: 0.4121
## F-statistic: 21.33 on 1 and 28 DF, p-value: 7.867e-05
plot(bp_lm,which=1)
plot(bp_lm,which=2)
The relationship is significant and moderate (R² = 0.43).
The residual plots show 3 outliers, some nonlinearity, and heteroscedasticity (the spread is narrower on the left and widens to the right).
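One way to flag the outlying points mentioned above (a sketch; the |standardized residual| > 2 cutoff is a common rule of thumb, not part of the original analysis):

bp_rstd <- rstandard(bp_lm)  # standardized residuals
which(abs(bp_rstd) > 2)      # indices of candidate outliers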
soap <- read.csv('soap.csv')
soap_lm <- lm(data = soap,Weight~Day)
summary(soap_lm)
##
## Call:
## lm(formula = Weight ~ Day, data = soap)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2436 -1.2950 0.3078 1.3942 5.5040
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 123.1408 1.3822 89.09 <2e-16 ***
## Day -5.5748 0.1068 -52.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.949 on 13 degrees of freedom
## Multiple R-squared: 0.9953, Adjusted R-squared: 0.9949
## F-statistic: 2724 on 1 and 13 DF, p-value: < 2.2e-16
plot(soap_lm,which=1)
sqrt(length(soap$Day))
## [1] 3.872983
Strong relationship: R² = 99.5%, p-value < 0.05.
Residual plot: a U-shaped pattern indicates a nonlinear relationship, and there are 3 outliers; however, with such a small sample (n = 15), it is hard to judge conclusively whether the L.I.N.E. assumptions are violated.
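Given the U-shaped residual pattern, a natural follow-up (a sketch, not part of the original exercise) is to add a quadratic term and re-inspect the residuals:

soap_lm2 <- lm(Weight ~ Day + I(Day^2), data = soap)  # quadratic in Day
plot(soap_lm2, which = 1)                             # check whether the U shape disappears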
drink <- read.csv('softdrink.csv')
pairs(drink)
cor(drink)
## Time Cases Distance
## Time 1.0000000 0.9646146 0.8916701
## Cases 0.9646146 1.0000000 0.8242150
## Distance 0.8916701 0.8242150 1.0000000
All pairwise correlations are high. Build the model:
mlr <- lm(data=drink,Time~Cases+Distance)
summary(mlr)
##
## Call:
## lm(formula = Time ~ Cases + Distance, data = drink)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7880 -0.6629 0.4364 1.1566 7.4197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.341231 1.096730 2.135 0.044170 *
## Cases 1.615907 0.170735 9.464 3.25e-09 ***
## Distance 0.014385 0.003613 3.981 0.000631 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 22 degrees of freedom
## Multiple R-squared: 0.9596, Adjusted R-squared: 0.9559
## F-statistic: 261.2 on 2 and 22 DF, p-value: 4.687e-16
plot(mlr,which=1,pch=c('🥤','🍷'))
The model is significant; delivery time is estimated as:
Delivery Time = 2.341 + 1.616*Cases + 0.014*Distance
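The fitted equation can be used directly for prediction; for example (the Cases and Distance values here are hypothetical, chosen only for illustration):

new_delivery <- data.frame(Cases = 10, Distance = 500)         # hypothetical delivery
predict(mlr, newdata = new_delivery, interval = 'prediction')  # point estimate plus prediction interval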