library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
rm(list = ls())
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
theme_set(theme_bw())
• Compute a linear regression solution for the BAC dataset explicitly using the equations, rather than calling lm():
– Compute the coefficients β₀ and β₁ (the closed-form expressions are given after this list)
– Compute the fitted values Ŷᵢ
– Compute the residuals eᵢ
– Compute the R² statistic as a function of the correlation (cor()) between x (Beers) and Y (BAC)
• Display residual plots:
– Residuals eᵢ versus predictor values xᵢ
– Residuals eᵢ versus fitted values Ŷᵢ
• Discuss the results:
– When comparing the model with the one obtained by calling lm(), is there any major difference?
– When comparing the residual plots in the previous item, are the conclusions any different?
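For reference, these are the closed-form least-squares estimates that the code below implements:

$$\hat\beta_1 = \frac{n\sum x_i y_i - \sum x_i \sum y_i}{n\sum x_i^2 - (\sum x_i)^2}, \qquad \hat\beta_0 = \frac{\sum x_i^2 \sum y_i - \sum x_i \sum x_i y_i}{n\sum x_i^2 - (\sum x_i)^2} = \bar y - \hat\beta_1 \bar x,$$

and, in simple linear regression, $R^2 = \mathrm{cor}(x, y)^2$.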
bac <- read.csv('bac.csv')
x <- bac$Beers
y <- bac$BAC
# coefficients via the closed-form least-squares formulas
beta0 <- (sum(x^2) * sum(y) - (sum(x) * sum(x*y))) / (length(x) * sum(x^2) - sum(x)^2)
beta1 <- (length(x) * sum(x*y) - sum(x)*sum(y)) / (length(x) * sum(x^2) - sum(x)^2)
cat('Coefficients: \n beta0 = ', beta0, ' \n beta1 = ', beta1)
## Coefficients:
## beta0 = -0.0127006
## beta1 = 0.01796376
# fitted values
y_pred <- beta0 + beta1 * x
cat('Predicted BAC: ', y_pred)
## Predicted BAC: 0.07711821 0.02322692 0.1489733 0.1310095 0.04119068 0.1130457 0.04119068 0.07711821 0.04119068 0.07711821 0.05915444 0.09508197 0.07711821 0.1130457 0.005263158 0.05915444
# residuals
e_residuals <- y - y_pred
cat('Residuals: ', e_residuals)
## Residuals: 0.02288179 0.00677308 0.04102675 -0.01100949 -0.001190682 -0.01804573 0.02880932 -0.01711821 -0.02119068 -0.02711821 0.01084556 0.004918033 0.007881795 -0.02304573 0.004736842 -0.009154443
summary(e_residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.027118 -0.017350 0.001773 0.000000 0.008623 0.041027
# R square
rss <- sum(e_residuals^2)
tss <- sum((y-mean(y))^2)
R_square <- 1 - (rss/tss)
cat('R square: ', R_square)
## R square: 0.7998407
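The exercise asks for R² as a function of the correlation; since in simple linear regression R² is exactly the squared correlation, a one-line check (which should reproduce the value above) is:

# R^2 as the squared correlation between x and y
cor(x, y)^2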
# Residuals plots
plot(x=x, y=e_residuals, main='Residuals vs Predictors',
     xlab='Predictor (Beers)', ylab='Residuals', pch=c('❤'))
plot(x=y_pred, y=e_residuals, main='Residuals vs Fitted values',
     xlab='Fitted values', ylab='Residuals', pch=c('❤'))
dfModel <- data.frame(res=e_residuals, pred=y_pred)
ggplot(dfModel, aes(x=pred, y=res)) +
geom_point(size=3)+
geom_hline(aes(yintercept=0), linetype='dashed')+
geom_smooth(method='loess')+
ggtitle('Residuals vs Fitted values')
## `geom_smooth()` using formula 'y ~ x'
Discuss: the hand-calculated coefficients, fitted values, and R² match the lm() results up to rounding, and the residual plots lead to the same conclusions.
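A quick way to verify this (a sketch; the Beers and BAC columns are the ones loaded above):

bac_lm <- lm(BAC ~ Beers, data = bac)
coef(bac_lm)  # should match beta0 and beta1 computed by hand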
This exercise involves the diamonds dataset from the ggplot2 package.
• Using the lm() function, perform linear regression of price modelled as a function of weight (carat):
– Initially, without any transformation of variables
– Then repeat, but applying a log transformation to both input and output variables (the two model equations are written out below)
– Discuss the results
• Repeat the above now explicitly using the formulas, instead of calling the lm() function
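The two models being compared are

$$\text{price} = \beta_0 + \beta_1\,\text{carat} + \varepsilon \qquad \text{and} \qquad \log(\text{price}) = \beta_0 + \beta_1 \log(\text{carat}) + \varepsilon.$$

On the original scale the log-log model corresponds to a power law, price ≈ e^{β₀}·carat^{β₁}, so β₁ acts as an elasticity: a 1% increase in carat is associated with roughly a β₁% increase in price.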
diamonds.lm <- lm(data = diamonds, price~carat)
# names(summary(diamonds.lm))
summary(diamonds.lm)
##
## Call:
## lm(formula = price ~ carat, data = diamonds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18585.3 -804.8 -18.9 537.4 12731.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2256.36 13.06 -172.8 <2e-16 ***
## carat 7756.43 14.07 551.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1549 on 53938 degrees of freedom
## Multiple R-squared: 0.8493, Adjusted R-squared: 0.8493
## F-statistic: 3.041e+05 on 1 and 53938 DF, p-value: < 2.2e-16
plot(diamonds.lm,which=1)
diamonds_log.lm <- lm(data = diamonds, log(price)~log(carat))
# names(summary(diamonds.lm))
summary(diamonds_log.lm)
##
## Call:
## lm(formula = log(price) ~ log(carat), data = diamonds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.50833 -0.16951 -0.00591 0.16637 1.33793
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.448661 0.001365 6190.9 <2e-16 ***
## log(carat) 1.675817 0.001934 866.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2627 on 53938 degrees of freedom
## Multiple R-squared: 0.933, Adjusted R-squared: 0.933
## F-statistic: 7.51e+05 on 1 and 53938 DF, p-value: < 2.2e-16
plot(diamonds_log.lm,which=1)
Discuss: with and without log transformation.
The log-log model fits better (R² = 93%, median residual −0.006 on the log scale) than the untransformed model (R² = 85%, median residual −18.9); note the residual medians are on different scales, so the R² comparison is the more meaningful one.
Residual plots:
** Untransformed model: the residuals scatter in a clear pattern, and the lowess line deviates substantially from the residuals = 0 line.
** Log transformation: the residuals are randomly scattered from left to right (roughly constant variance), and the lowess line stays close to the 0 line.
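To compare the two residual plots side by side, one option (a sketch reusing the model objects fitted above):

par(mfrow = c(1, 2))              # two panels side by side
plot(diamonds.lm, which = 1)      # residuals vs fitted, original scale
plot(diamonds_log.lm, which = 1)  # residuals vs fitted, log-log scale
par(mfrow = c(1, 1))              # reset the plotting layout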
Calculating with the formulas: we reuse the closed-form expressions from exercise 1, just replacing x and y.
SLR_manual <- function(x,y) {
  # coefficients via the closed-form least-squares formulas
  beta0 <- (sum(x^2) * sum(y) - (sum(x) * sum(x*y))) / (length(x) * sum(x^2) - sum(x)^2)
  beta1 <- (length(x) * sum(x*y) - sum(x)*sum(y)) / (length(x) * sum(x^2) - sum(x)^2)
  cat('\nCoefficients: \n beta0 = ', beta0, ' \n beta1 = ', beta1)
  # fitted values
  y_pred <- beta0 + beta1 * x
  # cat('Fitted values: ', y_pred)
  # residuals
  e_residuals <- y - y_pred
  # cat('Residuals: ', e_residuals)
  summary(e_residuals)  # note: not auto-printed inside a function; wrap in print() to display
# R square
rss <- sum(e_residuals^2)
tss <- sum((y-mean(y))^2)
R_square <- 1 - (rss/tss)
cat('\nR square: ', R_square)
}
cat('\n### INITIAL MODEL ###')
##
## ### INITIAL MODEL ###
x <- diamonds$carat
y <- diamonds$price
SLR_manual(x,y)
##
## Coefficients:
## beta0 = -2256.361
## beta1 = 7756.426
## R square: 0.8493305
cat('\n\n### LOG TRANSFORMATION MODEL ###')
##
##
## ### LOG TRANSFORMATION MODEL ###
x <- log(diamonds$carat)
y <- log(diamonds$price)
SLR_manual(x,y)
##
## Coefficients:
## beta0 = 8.448661
## beta1 = 1.675817
## R square: 0.9329893
The coefficient matrix via the least-squares formula $\hat\beta = (X^T X)^{-1} X^T Y$:
set.seed(22)
x1 = runif(n = 100, min = 0, max = 1)
# 100 observations of predictor x1 randomly drawn from [0,1]
x2 = runif(n = 100, min = 0, max = 1)
# 100 observations of predictor x2 randomly drawn from [0,1]
Yi = 5 + 3*x1 + 7*x2 + rnorm(n = 100, mean = 0, sd = 1)
# Build vector Y = 5 + 3x1 + 7x2 + epsilon
dat = matrix(c(x1,x2,Yi), nrow = 100, ncol = 3, byrow = FALSE)
# Build data matrix
dim(dat)
## [1] 100 3
X= dat[,1:2]
Y= dat[,3]
beta_coeff = solve(t(X)%*%X) %*% (t(X) %*% Y)
beta_coeff
## [,1]
## [1,] 6.483094
## [2,] 11.746231
dat_df <- as.data.frame(dat)
colnames(dat_df) <- c('x1','x2','y')
dat_lm <- lm(data=dat_df,y~x1+x2 )
summary(dat_lm)
##
## Call:
## lm(formula = y ~ x1 + x2, data = dat_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.11170 -0.56708 0.06266 0.59903 1.96478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7249 0.2303 20.513 < 2e-16 ***
## x1 2.4977 0.3309 7.548 2.42e-11 ***
## x2 7.5838 0.3266 23.220 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9325 on 97 degrees of freedom
## Multiple R-squared: 0.8703, Adjusted R-squared: 0.8676
## F-statistic: 325.4 on 2 and 97 DF, p-value: < 2.2e-16
cat('\n*** Coefficients matrix using least square formula', beta_coeff)
##
## *** Coefficients matrix using least square formula 6.483094 11.74623
We can see that the coefficients we calculated differ from those returned by lm(). Were we wrong? No: our matrix calculation yields the coefficients of a model without an intercept. Calling lm() with a no-intercept formula (y ~ 0 + x1 + x2) shows that the slopes then agree.
summary(lm(data=dat_df,y~0+x1+x2 ))
##
## Call:
## lm(formula = y ~ 0 + x1 + x2, data = dat_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6832 -0.7092 0.8520 2.3164 4.9213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x1 6.4831 0.6157 10.53 <2e-16 ***
## x2 11.7462 0.5883 19.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.144 on 98 degrees of freedom
## Multiple R-squared: 0.9551, Adjusted R-squared: 0.9541
## F-statistic: 1041 on 2 and 98 DF, p-value: < 2.2e-16
So, can we calculate the coefficient matrix with an intercept? The answer is yes.
This is our model:

$$Y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \cdots + \beta_p x_p + \varepsilon = \beta_0 + X\beta + \varepsilon \quad (1)$$

Recall the least-squares estimation formula: $\hat\beta = (X^T X)^{-1} X^T Y$ gives the estimated coefficients for the model $Y = X\beta + \varepsilon \quad (2)$.

(1) can be rewritten as

$$Y = \beta_0 x_0 + X\beta + \varepsilon = X'\beta' + \varepsilon, \quad \text{where } x_0 = (1, 1, \ldots, 1)^T \text{ and } X' = \mathrm{cbind}(x_0, X),$$

which has exactly the form of (2). Thus, applying the least-squares formula to the augmented matrix X' gives the coefficient matrix of the initial model, intercept included.
# bind a column x0 = (1, 1, ..., 1) to X
X= cbind(rep(1,100),dat[,1:2])
Y= dat[,3]
beta_coeff = solve(t(X)%*%X) %*% (t(X) %*% Y)
cat('Estimated coefficient matrix with intercept,', beta_coeff)
## Estimated coefficient matrix with intercept, 4.724859 2.497686 7.583763
beta_coeff
## [,1]
## [1,] 4.724859
## [2,] 2.497686
## [3,] 7.583763
dat_df <- as.data.frame(dat)
colnames(dat_df) <- c('x1','x2','y')
dat_lm <- lm(data=dat_df,y~x1+x2 )
summary(dat_lm)
##
## Call:
## lm(formula = y ~ x1 + x2, data = dat_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.11170 -0.56708 0.06266 0.59903 1.96478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7249 0.2303 20.513 < 2e-16 ***
## x1 2.4977 0.3309 7.548 2.42e-11 ***
## x2 7.5838 0.3266 23.220 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9325 on 97 degrees of freedom
## Multiple R-squared: 0.8703, Adjusted R-squared: 0.8676
## F-statistic: 325.4 on 2 and 97 DF, p-value: < 2.2e-16
The coefficients from the matrix calculation and from lm() now agree: 4.7248587, 2.4976863, 7.5837634.
plot(dat_lm,which=1)
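To line the two sets of estimates up in one table (a quick check; both objects were computed above):

cbind(manual = beta_coeff[, 1], lm = coef(dat_lm))  # the columns should agree to numerical precision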
Recompute the least squares coefficients for the cherry dataset in our previous example, but now using the matrix formulation, and compare.
cherry <- read.csv('cherry.csv')
X <- cherry %>% select(Diam, Height) %>% mutate(Diam2 = Diam*Diam) %>%
mutate(intercept=1) %>% relocate(intercept)
X <- as.matrix(X)
Y <- cherry %>% select(Volume)
Y <- as.matrix(Y)
beta_coeff <- solve(t(X)%*%X) %*% (t(X) %*% Y)
cat('\nUsing formula\n')
##
## Using formula
beta_coeff
## Volume
## intercept -9.9204060
## Diam -2.8850787
## Height 0.3763873
## Diam2 0.2686224
cat('Using lm()')
## Using lm()
cherry_lm <- lm(Volume ~ Diam + I(Diam^2) + Height , data=cherry)
summary(cherry_lm)
##
## Call:
## lm(formula = Volume ~ Diam + I(Diam^2) + Height, data = cherry)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2928 -1.6693 -0.1018 1.7851 4.3489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.92041 10.07911 -0.984 0.333729
## Diam -2.88508 1.30985 -2.203 0.036343 *
## I(Diam^2) 0.26862 0.04590 5.852 3.13e-06 ***
## Height 0.37639 0.08823 4.266 0.000218 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.625 on 27 degrees of freedom
## Multiple R-squared: 0.9771, Adjusted R-squared: 0.9745
## F-statistic: 383.2 on 3 and 27 DF, p-value: < 2.2e-16
plot(cherry_lm,which = 1,pch=c('🍒'))
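To compare the two cherry fits directly, reorder lm()'s coefficients to match the column order of X (a quick check):

cbind(formula = beta_coeff[, 1],
      lm = coef(cherry_lm)[c('(Intercept)', 'Diam', 'Height', 'I(Diam^2)')])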
bp <- read.csv('bloodpressure.csv')
bp_lm <- lm(data=bp,SystolicBloodPressure~Age )
summary(bp_lm)
##
## Call:
## lm(formula = SystolicBloodPressure ~ Age, data = bp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.724 -6.994 -0.520 2.931 75.654
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.7147 10.0005 9.871 1.28e-10 ***
## Age 0.9709 0.2102 4.618 7.87e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.31 on 28 degrees of freedom
## Multiple R-squared: 0.4324, Adjusted R-squared: 0.4121
## F-statistic: 21.33 on 1 and 28 DF, p-value: 7.867e-05
plot(bp_lm,which=1)
plot(bp_lm,which=2)
The relationship is significant and moderate (R² = 0.43).
The residual plots show 3 outliers, some nonlinearity, and heteroscedasticity (the spread is narrower on the left and widens to the right).
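One way to flag the outlying points mentioned above (a sketch; the |standardized residual| > 2 cutoff is a common rule of thumb, not part of the original analysis):

bp_rstd <- rstandard(bp_lm)  # standardized residuals
which(abs(bp_rstd) > 2)      # indices of candidate outliers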
soap <- read.csv('soap.csv')
soap_lm <- lm(data = soap,Weight~Day)
summary(soap_lm)
##
## Call:
## lm(formula = Weight ~ Day, data = soap)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2436 -1.2950 0.3078 1.3942 5.5040
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 123.1408 1.3822 89.09 <2e-16 ***
## Day -5.5748 0.1068 -52.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.949 on 13 degrees of freedom
## Multiple R-squared: 0.9953, Adjusted R-squared: 0.9949
## F-statistic: 2724 on 1 and 13 DF, p-value: < 2.2e-16
plot(soap_lm,which=1)
sqrt(length(soap$Day))
## [1] 3.872983
Strong relationship: R² = 99.5%, p-value < 0.05.
Residual plot: a U-shaped pattern indicates a nonlinear relationship, and there are 3 outliers; however, with such a small sample (n = 15), it is hard to judge conclusively whether the L.I.N.E. assumptions are violated.
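Given the U-shaped residual pattern, a natural follow-up (a sketch, not part of the original exercise) is to add a quadratic term and re-inspect the residuals:

soap_lm2 <- lm(Weight ~ Day + I(Day^2), data = soap)  # quadratic in Day
plot(soap_lm2, which = 1)                             # check whether the U shape disappears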
drink <- read.csv('softdrink.csv')
pairs(drink)
cor(drink)
## Time Cases Distance
## Time 1.0000000 0.9646146 0.8916701
## Cases 0.9646146 1.0000000 0.8242150
## Distance 0.8916701 0.8242150 1.0000000
All pairwise correlations are high. Build the model:
mlr <- lm(data=drink,Time~Cases+Distance)
summary(mlr)
##
## Call:
## lm(formula = Time ~ Cases + Distance, data = drink)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7880 -0.6629 0.4364 1.1566 7.4197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.341231 1.096730 2.135 0.044170 *
## Cases 1.615907 0.170735 9.464 3.25e-09 ***
## Distance 0.014385 0.003613 3.981 0.000631 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 22 degrees of freedom
## Multiple R-squared: 0.9596, Adjusted R-squared: 0.9559
## F-statistic: 261.2 on 2 and 22 DF, p-value: 4.687e-16
plot(mlr,which=1,pch=c('🥤','🍷'))
The model is significant; delivery time is estimated as:
Delivery Time = 2.341 + 1.616*Cases + 0.014*Distance
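The fitted equation can be used directly for prediction; for example (the Cases and Distance values here are hypothetical, chosen only for illustration):

new_delivery <- data.frame(Cases = 10, Distance = 500)         # hypothetical delivery
predict(mlr, newdata = new_delivery, interval = 'prediction')  # point estimate plus prediction interval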