Simple Linear Regression

1 Setting Up Working directory, clearing all data and memory

I will first set up my working directory.

# Clear the workspace
rm(list = ls()) # Clear environment
gc()            # Clear unused memory
          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells  597269 31.9    1356358 72.5         NA   700248 37.4
Vcells 1099361  8.4    8388608 64.0      49152  1963372 15.0
cat("\f")       # Clear the console
# Set working directory and path to data
setwd("/Users/arvindsharma/Dropbox/WCAS/Econometrics/")

1.1 Installing Package (required once only), then loading package

### install.packages("UsingR")
library(UsingR)
Loading required package: MASS
Loading required package: HistData
Loading required package: Hmisc

Attaching package: 'Hmisc'
The following objects are masked from 'package:base':

    format.pval, units
data(diamond)

2 Plotting Data

plot(x = diamond$carat,
     y = diamond$price,
     xlab = "Mass (carats)",
     ylab = "Price (Singapore $)",
     main = "Price of Diamonds By Weight",
     sub = "",
     bg  = "lightblue",     # a vector of background colors (Graphical Parameters)
     col = "black",         # the colors for lines and points (Graphical Parameters)
     cex = 1.1,             # a numerical vector giving the amount by which plotting characters and symbols should be scaled relative to the default = 1 (Graphical Parameters)
     pch = 21,              # a vector of plotting characters or symbols (Graphical Parameters) {triangle, empty circle, filled circle, square,...}
     frame = FALSE          # frame.plot    - a logical indicating whether a box should be drawn around the plot.
     )

?abline
abline(reg = lm(data = diamond,
                price ~ carat,
                ),
       lwd = 2,                    # line width, default = 1
       col = "blue"
       )

3 Fitting the linear regression model

fit <- lm(data = diamond,
          formula = price ~ carat
          )
coef(fit)
(Intercept)       carat 
  -259.6259   3721.0249 

We estimate an expected 3721.02 (SIN) dollar increase in price for every carat increase in mass of diamond.

The intercept -259.63 is the expected price of a 0 carat diamond.

4 A more interpretable intercept

fit2 <- lm(data=diamond,
           price ~ I(carat - mean(carat)) 
           )
coef(fit2)
           (Intercept) I(carat - mean(carat)) 
              500.0833              3721.0249 

Thus $500.1 is the expected price for the average sized diamond of the data (0.2042 carats).

5 Changing scale

fit3 <- lm( data=diamond,
            price ~ I(carat*10)
           )

coef(fit3)
  (Intercept) I(carat * 10) 
    -259.6259      372.1025 

6 Predicting the price of a diamond

### PREDICTED VALUES SPECIFIC VALUES
new_x <- c(0.16, 
          0.27, 
          0.34
          )

coef(fit)[1] + coef(fit)[2] * new_x
[1]  335.7381  745.0508 1005.5225
## USE THE PREDICT FUNCTION
?predict
predict(object  = fit,
        newdata = data.frame(carat = new_x)
        )
        1         2         3 
 335.7381  745.0508 1005.5225 
### PREDICTED VALUES FOR THE ENTIRE DATA
predict(object  = fit,
        data = data.frame(carat = new_x)
)
        1         2         3         4         5         6         7         8 
 372.9483  335.7381  372.9483  410.1586  670.6303  335.7381  298.5278  447.3688 
        9        10        11        12        13        14        15        16 
 521.7893  298.5278  410.1586  782.2611  335.7381  484.5791  596.2098  819.4713 
       17        18        19        20        21        22        23        24 
 186.8971  707.8406  670.6303  745.0508  410.1586  335.7381  372.9483  335.7381 
       25        26        27        28        29        30        31        32 
 372.9483  410.1586  372.9483  410.1586  372.9483  298.5278  372.9483  931.1020 
       33        34        35        36        37        38        39        40 
 931.1020  298.5278  335.7381  335.7381  596.2098  596.2098  372.9483  968.3123 
       41        42        43        44        45        46        47        48 
 670.6303 1042.7328  410.1586  670.6303  670.6303  298.5278  707.8406  298.5278 

7 Properties of the residuals

y <- diamond$price; x <- diamond$carat; n <- length(y)
fit   <- lm(formula = y ~ x)
# equivalent
fit   <- lm(data=diamond,
            formula = price ~ carat
            )

e     <-  resid(fit)

plot(x = diamond$carat,
     y = resid(lm(y~x)),
     xlab = "Mass (carats)",
     ylab = "Residuals (SIN $) ",
     main = "Residual vs X",
     sub = "price ~ carat"
     ); abline(h   = 0,
               col = "blue",
               lwd = 2) # y values for horizontal lines

?abline
yhat  <-  predict(fit)
max( abs( e - ( y - yhat ) ) ) # y = diamond$price ; yhat = predicted (diamond$price)
[1] 5.57776e-13
# equivalent
max(abs(e-(y-coef(fit)[1]-coef(fit)[2]*x)))
[1] 5.57776e-13

8 Non-linear data

x <- runif(100,-3,3) ; y <- x + sin(x) + rnorm(100,sd=.2);
plot(x = x,
     y = y, 
     xlab = "Real Number Line (Domain)",
     ylab = "Y axis (Range)",
     main = "Raw Data",
     col  = 'red',
     cex  = .8,
     pch  = 16);abline(reg = lm(y~x),
                       col = "blue",
                       lwd = 2)

plot(x = x,
     y = resid(lm(y ~ x)),
     main = "Residuals vs Domain (Real Number Line)",
     xlab = "Real Number Line (Domain)",
     ylab = "Residuals:  lm(y ~ x) ",
     col  = "red",
     cex  = .8,
     pch  = 16) ; abline(h   = 0,
                         col = "blue",
                         lwd = 2)

9 Heteroskedasticity

# Construct data 
x <- runif(n = 100,
          min = 0,
          max = 6);y <- x + rnorm(n   = 100, 
                                 mean = 0,
                                 sd   = .001 * x 
                                 )
# PLot raw data 
plot(x = x,
     y = y,
     xlab = "Real Number Line (Domain)",
     ylab = "Y axis (Range)",
     main = "Raw Data",);abline(reg = lm(y~x))

9.1 Heteroskedasticity: Getting rid of the blank space can be helpful

plot(x = x,
     y = resid(lm(y ~ x)),
     main = "Residuals vs Domain (Real Number Line)",
     xlab = "Real Number Line (Domain)",
     ylab = "Residuals:  lm(y ~ x) ",
     col  = "red",
     cex  = .8,
     pch  = 16) ; abline(h   = 0,
                         col = "blue",
                         lwd = 2)