Downloading and accessing R packages

Install R package

install.packages("dplyr")

## Installing package into 'C:/Users/hktse/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'dplyr' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'dplyr'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\hktse\AppData\Local\R\win-library\4.4\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\hktse\AppData\Local\R\win-library\4.4\dplyr\libs\x64\dplyr.dll:
## Permission denied

## Warning: restored 'dplyr'

## 
## The downloaded binary packages are in
##  C:\Users\hktse\AppData\Local\Temp\RtmpO6Iegl\downloaded_packages

Install multiple R packages

install.packages(c("dplyr", "ggplot2"))

## Installing packages into 'C:/Users/hktse/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'dplyr' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'dplyr'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\hktse\AppData\Local\R\win-library\4.4\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\hktse\AppData\Local\R\win-library\4.4\dplyr\libs\x64\dplyr.dll:
## Permission denied

## Warning: restored 'dplyr'

## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\hktse\AppData\Local\Temp\RtmpO6Iegl\downloaded_packages

# c() means combine values into a vector or list

Load installed R package

require(dplyr)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Alternatively, you can also use

library(dplyr)

Load multiple installed R packages

package_list <- c("dplyr", "ggplot2")
lapply(package_list, require, character.only = TRUE) # lapply() applies an action over values inside a list

## Loading required package: ggplot2

## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE

Basic functionalities

Print out characters

hello_world <- "Hello, world!"
paste(hello_world)   # print the values enclosed within "" on the console

## [1] "Hello, world!"

Basic calculation

1 + 1

## [1] 2

(2 * 1) / 2

## [1] 1

Saving calculation result as an “object,” and print it out

two <- (2 * 1) / 2
two

## [1] 1

Inquire object type

two <- (2 * 1) / 2
class(two)

## [1] "numeric"

class(hello_world)

## [1] "character"

Replace previously generated “object” with the same name. Note that you can name the “object” whatever you like, but be aware that once you name another output by the same object name, your previously generated object of the same name will be masked (by the new values)!

hello_world <- "Aloha!"
hello_world

## [1] "Aloha!"

hello_world <- "Hola!"
hello_world

## [1] "Hola!"

Square a number: \(2^2\)

2^2

## [1] 4

Taking the sequare root of a positive real number

four <- 2^2
sqrt(four)

## [1] 2

Log transformation

log(four)

## [1] 1.386294

Logical operation

two <- 2
one <- 1
two > one

## [1] TRUE

Remove all objects from the current workspace (R memory).

rm(list=ls())

Create a matrix object: note the sequence by which they appear in each cell entry

matrix(c(1,2,3,4), ncol=2, nrow=2)

##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4

Matrix multiplication

a <- matrix(c(1,2,3,4), ncol=2, nrow=2)
class(a)  # use class() to check out the "type/format" of an object

## [1] "matrix" "array"

b <- matrix(c(5,6,7,8), ncol=2, nrow=2)
b

##      [,1] [,2]
## [1,]    5    7
## [2,]    6    8

a * b   # element-wise multiplication

##      [,1] [,2]
## [1,]    5   21
## [2,]   12   32

a %*% b   # matrix multiplication

##      [,1] [,2]
## [1,]   23   31
## [2,]   34   46

Coding a linear model of height prediction:

\(Y = \beta_{0} + X\beta_{1} + \epsilon\)

Y = your height
X = your father’s height
Assuming a baseline effect: \(\beta_0\) = 70 (Your height can’t be zero!)
Two obs: \(X_1\) = 181, \(X_2\) = 159

## parameters: 1 + X
X <- matrix(c(1, 1, 181, 159), ncol = 2, nrow = 2)

## beta coefficient
beta <- matrix(c(70, 0.6), ncol = 1, nrow = 2)

## multiplication
Xbeta <- X %*% beta
Xbeta

##       [,1]
## [1,] 178.6
## [2,] 165.4

class(Xbeta)

## [1] "matrix" "array"

dim(Xbeta)

## [1] 2 1

## generate the error term: is the dimension of epsilon conformable to Xbeta?
epsilon <- matrix(c(2.4, -1.4), ncol = 1, nrow = 2)
dim(epsilon)

## [1] 2 1

## predict Y as a linear combination of Xbeta and epsilon
Y <- Xbeta + epsilon
Y

##      [,1]
## [1,]  181
## [2,]  164

Normal distribution

The consequences of different values of \(\sigma\) on data distribution.

x = seq(-8, 8, length=500)
## Generate three distributions of different sigma(s) but with the same mean

y1=dnorm(x, mean = 0, sd = 1)  # dnorm() gives the density
y2=dnorm(x, mean = 0, sd = 2)
y3 = dnorm(x, mean = 0, sd = 1/2)
plot(x, y1,type = "l", lwd = 2, col = "red")   # line type = "l" means "line," lwd means "line width," col means "color"
lines(x, y2, type = "l", lwd = 2, col = "blue")
lines(x, y3, type = "l", lwd = 2, col = "green")
legend("topright",c("sigma=1","sigma=2","sigma=1/2"),
lty=c(1,1,1),col=c("red","blue","green"))

The Area Under the Probability Density Function (PDF).

x = seq(70, 130, length = 200) # range between 70 and 130  #length = n means you divide the range of the sequence into n equal segments 
y = dnorm(x, mean = 100, sd = 10)  # dnorm returns the value of the PDF for the normal distribution given parameter values (x, mean, sd)
plot(x, y, type = "l", lwd = 2, col = "black")
## lower-tail distribution between 70 and 90
x = seq(70, 90, length = 100)
y = dnorm(x, mean = 100, sd = 10)
polygon(c(70, x, 90), c(0, y, 0), col = "gray")  # polygon(x, y) x-y coordinate, col = "gray": color this area in gray

The 68-95-99.7 Rule.

# Replicate the same distribution function as above
x = seq(70, 130, length = 200)
y = dnorm(x, mean = 100, sd = 10)
plot(x, y, type = "l", lwd = 2, col = "black")
## fill the area between 90 and 110
x = seq(90, 110, length = 200)  # x-axis values range
y = dnorm(x, mean = 100, sd = 10)   # y-axis values range
polygon(c(90, x, 110), c(0, y, 0), col = "gray")

To find the area between x = (90, 110), we simply subtract the area to the left of x = 90 (i.e., the CDF) from the area to the left of x = 110. Note that the probability that x will occur between 90 and 110 is the PDF.

To express formally (by the fundamental theorem of calculus)

\[\begin{equation*} \int_{0}^{110}p(x)dx - \int_{0}^{90}p(x)dx \end{equation*}\]

In R

pnorm(110, mean = 100, sd = 10) - pnorm(90, mean = 100, sd = 10)  # pnorm() gives the cumulative distribution function (CDF)

## [1] 0.6826895

Draw a sample of size = 200 with mean = 10 and s.d. = 2

x.norm <- rnorm(n = 200, m = 10, sd = 2) # rnorm() generates multivariate normal random variables in the space X
## plot the sample
hist(x.norm,main="Histogram of sample data")

## calculate z-score (Note: z-score is unconditional on any X)
z.norm <- (x.norm - mean(x.norm))/sd(x.norm)
## show first 6 entries
head(z.norm)

## [1]  0.47174865  0.01974108  1.02327699 -0.48074668 -0.28566615 -0.66648011

Programming your first linear model

Programming your first linear model, visualize, and interpret the output

# Load required packages and data
# install.packages(c("MASS", "ISLR"))
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(ISLR)
data(Boston)

# For a description of the data, please refer to the link below: https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/Boston.html

# Take a look at what's inside the Boston data
names(Boston)

##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

head(Boston)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

# How many rows and columns?
dim(Boston)

## [1] 506  14

# Display the structure of each data column
str(Boston)

## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

# How many rows of missing value (NA) in the variable "crim" (crime rates)?
sum(is.null(Boston$crim))

## [1] 0

# Are there NA values in "crim"? Which rows contain NA values?
which(is.na(Boston$crim), arr.ind=TRUE)  # arr.ind = array index

## integer(0)

# Fitting your first linear regression
# First model, "medv" as a function of "crim" and "ptratio"
# There are a number of ways to fit a R functions, let's find out.
# First, the standard and easy way. Specify your data source and type in variable names without subsetting
lm.fit1 = lm(medv ~ crim + ptratio, data = Boston)
# Second way, subset variable names from the data
lm.fit1a <- lm(Boston$medv ~ Boston$crim +　Boston$ptratio)
# Third way, attach the data to your current R session. That way, you don't need to subset variables nor specify data source, but mindful of its pitfalls!

attach(Boston)
lm.fit1b = lm(medv ~ crim + ptratio)
detach(Boston)  # Be sure to detach the dataframe after each computing session.

# You should obtain the same result from all three models.

# Second model, same as the first one but only regress "medv" on the first 350 obs, remember the format for subsetting is df[row, col]
lm.fit2 = lm(medv ~ crim + ptratio, data = Boston[1:350, ])

# Third model, regress "medv" on ALL other variables (represented by .)
lm.fit3 = lm(medv ~ ., data = Boston)


# Take a look at what's inside your model output
names(lm.fit1)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

length(lm.fit1$coefficients)  # 3 coefficients (one for each variable plus the constant)

## [1] 3

length(lm.fit1$residuals)   # one for each obs in the data

## [1] 506

length(lm.fit1$fitted.values)  # one predicted value (y_hat) for each obs

## [1] 506

# You should also examine the outputs of lm.fit2 and lm.fit3. I'll leave it to you as an exercise. 

# Check out the result of the first model
summary(lm.fit1)

## 
## Call:
## lm(formula = medv ~ crim + ptratio, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.851  -4.637  -1.051   2.582  32.246 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 57.37835    2.98874  19.198  < 2e-16 ***
## crim        -0.28142    0.04104  -6.857 2.06e-11 ***
## ptratio     -1.83298    0.16305 -11.242  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.592 on 503 degrees of freedom
## Multiple R-squared:  0.3213, Adjusted R-squared:  0.3186 
## F-statistic: 119.1 on 2 and 503 DF,  p-value: < 2.2e-16

summary(lm.fit2)

## 
## Call:
## lm(formula = medv ~ crim + ptratio, data = Boston[1:350, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.702  -4.611  -0.998   2.751  25.555 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  61.4592     3.2626  18.838  < 2e-16 ***
## crim         -4.0548     0.6141  -6.602 1.52e-10 ***
## ptratio      -1.9635     0.1794 -10.944  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.187 on 347 degrees of freedom
## Multiple R-squared:  0.2847, Adjusted R-squared:  0.2806 
## F-statistic: 69.06 on 2 and 347 DF,  p-value: < 2.2e-16

summary(lm.fit3)

## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16

# Alternatively, you can use the print() method
print(lm.fit1)

## 
## Call:
## lm(formula = medv ~ crim + ptratio, data = Boston)
## 
## Coefficients:
## (Intercept)         crim      ptratio  
##     57.3783      -0.2814      -1.8330

print(lm.fit2)

## 
## Call:
## lm(formula = medv ~ crim + ptratio, data = Boston[1:350, ])
## 
## Coefficients:
## (Intercept)         crim      ptratio  
##      61.459       -4.055       -1.963

print(lm.fit3)

## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Coefficients:
## (Intercept)         crim           zn        indus         chas          nox  
##   3.646e+01   -1.080e-01    4.642e-02    2.056e-02    2.687e+00   -1.777e+01  
##          rm          age          dis          rad          tax      ptratio  
##   3.810e+00    6.922e-04   -1.476e+00    3.060e-01   -1.233e-02   -9.527e-01  
##       black        lstat  
##   9.312e-03   -5.248e-01

# Now, the most important part, interpretation!
# The dependent variable (phenomenon to be explained), medv, is "Median value of owner-occupied homes measured in $1000's (USD)."
# Here we see that, in the first model (lm.fit1), the coefficients of crim and ptratio are -0.28 and -1.83, respectively, meaning that one unit increases in crime rate (Pupil/Teacher ratio) is associated with -0.28 (-1.83) * 1000 (USD) dollars decrease in medv (edian value of owner-occupied homes), holding other things constant (or you can say "controlling for other variables"). The very last line "holding other things constant" is crucial, because you need to take into account whatever is inside the error term or variables you have not included in your analysis.

# Organize and present your output as a table (in plublishable grade).
# install.packages(c("flextable", "stargazer")) 
library(flextable)

ft <- as_flextable(lm.fit1)
ft

	Estimate	Standard Error	t value	Pr(>\|t\|)
(Intercept)	57.378	2.989	19.198	0.0000	***
crim	-0.281	0.041	-6.857	0.0000	***
ptratio	-1.833	0.163	-11.242	0.0000	***
Signif. codes: 0 <= '*' < 0.001 < '' < 0.01 < '*' < 0.05

Residual standard error: 7.592 on 503 degrees of freedom
Multiple R-squared: 0.3213, Adjusted R-squared: 0.3186
F-statistic: 119.1 on 503 and 2 DF, p-value: 0.0000

# In as_flextable function, the variable names that appear in the resulting flextable will be directly taken from the column names of the data frame you are converting, meaning you should ensure your data frame has descriptive column names to accurately represent your variables in the table. Let's do that.

# First, we can replicate lm.fit1 as another object
lm.fit1_temp <- lm.fit1
# We then renames the names of its coefficients
names(lm.fit1$coefficients)  # Original names

## [1] "(Intercept)" "crim"        "ptratio"

names(lm.fit1_temp$coefficients) <- c("Constant", "Crime rate", "P/T ratio")  # Rename

# Now we are good to go
ft <- as_flextable(lm.fit1_temp)
ft

	Estimate	Standard Error	t value	Pr(>\|t\|)
Constant	57.378	2.989	19.198	0.0000	***
Crime rate	-0.281	0.041	-6.857	0.0000	***
P/T ratio	-1.833	0.163	-11.242	0.0000	***
Signif. codes: 0 <= '*' < 0.001 < '' < 0.01 < '*' < 0.05

Residual standard error: 7.592 on 503 degrees of freedom
Multiple R-squared: 0.3213, Adjusted R-squared: 0.3186
F-statistic: 119.1 on 503 and 2 DF, p-value: 0.0000

# Stargazer provides an even more handy way to do the same trick
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

sg <- stargazer(lm.fit1, 
                   type = "text", 
                  title = "Linear Regression Results",
             # omit.stat = c("LL","ser","f"),  relevant statistics you prefer not to display
        dep.var.caption = "Dependent variable",
        dep.var.labels  = c("Median value of owner-occupied homes in $1000's"),  # Y variable name
       covariate.labels = c("Crime rate", "P/T ratio"))  # X variable names

## 
## Linear Regression Results
## ==================================================================
##                                   Dependent variable              
##                     ----------------------------------------------
##                     Median value of owner-occupied homes in 1000's
## ------------------------------------------------------------------
## Crime rate                            -0.281***                   
##                                        (0.041)                    
##                                                                   
## P/T ratio                             -1.833***                   
##                                        (0.163)                    
##                                                                   
## Constant                              57.378***                   
##                                        (2.989)                    
##                                                                   
## ------------------------------------------------------------------
## Observations                             506                      
## R2                                      0.321                     
## Adjusted R2                             0.319                     
## Residual Std. Error                7.592 (df = 503)               
## F Statistic                    119.060*** (df = 2; 503)           
## ==================================================================
## Note:                                  *p<0.1; **p<0.05; ***p<0.01

sg

##  [1] ""                                                                  
##  [2] "Linear Regression Results"                                         
##  [3] "=================================================================="
##  [4] "                                  Dependent variable              "
##  [5] "                    ----------------------------------------------"
##  [6] "                    Median value of owner-occupied homes in 1000's"
##  [7] "------------------------------------------------------------------"
##  [8] "Crime rate                            -0.281***                   "
##  [9] "                                       (0.041)                    "
## [10] "                                                                  "
## [11] "P/T ratio                             -1.833***                   "
## [12] "                                       (0.163)                    "
## [13] "                                                                  "
## [14] "Constant                              57.378***                   "
## [15] "                                       (2.989)                    "
## [16] "                                                                  "
## [17] "------------------------------------------------------------------"
## [18] "Observations                             506                      "
## [19] "R2                                      0.321                     "
## [20] "Adjusted R2                             0.319                     "
## [21] "Residual Std. Error                7.592 (df = 503)               "
## [22] "F Statistic                    119.060*** (df = 2; 503)           "
## [23] "=================================================================="
## [24] "Note:                                  *p<0.1; **p<0.05; ***p<0.01"

# Visualize your result with coefplot() to get a glimpse of coefficient estimate (beta), s.e., and the sign of beta (positive or negative)

# Source the coefplot() function externally: 
source("http://www.r-statistics.com/wp-content/uploads/2010/07/coefplot.r.txt")

# You can also use coefplot function directly from
#https://www.rdocumentation.org/packages/arm/versions/1.12-2/topics/coefplot

# Install.packages("coefplot") # Do not run unless you haven't downloaded this package already
# library(coefplot)

# Get coef estimates and s.e. without the constant term
coefplot(lm.fit1, parm = -1)   # -1 means minus the constant term (i.e., the b0 in the model), it's the same as with all lm() and glm() family of models

# Not happy with the labels on the y-axis? You can change them manually
coefplot::coefplot(lm.fit2, parm = -1, newNames = c(crim = "crime rate", ptratio = "pupil-teacher ratio"))  # the reason for using :: before a command is to tell R that you will be using the function from a certain package instead of others that also use the same name(s) for their functions. In that case, you just type package::function()

# What if I only want to focus on a few variables of interest (specify them into a list c() inside the "predictors" argument)
coefplot::coefplot.lm(lm.fit3, predictors=c("crim", "ptratio"), parm = -1, newNames = c(crim = "crime rate", ptratio = "Pupil-teacher ratio"))

# If you want a traditional black-and-white (or grayish) plotting theme, you will need to require the ggplot2 package (one of the greatest inventions in human history)
# install.packages("ggplot2") # If you haven't installed it already
library(ggplot2)
coefplot::coefplot.lm(lm.fit3, predictors=c("crim", "ptratio"), parm = -1, newNames = c(crim = "crime rate", ptratio = "pupil-teacher ratio")) + theme_bw()

Supplementary materials 1

Code to replicate the 3D plot shown in our lecture slides

require(plot3D)

## Loading required package: plot3D

attach(mtcars)  # Note that I attached the package in the environment, it's not recommended though. If you want to override this setting, just use detach() (shown at the bottom of this code chunk)

## The following object is masked from package:ggplot2:
## 
##     mpg

# linear fit
fit <- lm(mpg ~ wt + disp)
 
# Predict on x-y grid, for surface
wt.pred <- seq(1.5, 5.5, length.out = 30)
disp.pred <- seq(71, 472, length.out = 30)
xy <- expand.grid(wt = wt.pred, 
                 disp = disp.pred)  # Each position on the x-y coordinate represents a predicted value from a unique combination of wt and disp
 
mpg.pred <- matrix (nrow = 30, ncol = 30, 
  data = predict(fit, newdata = data.frame(xy), interval = "prediction"))  # Use the fitted model to predict on the new data

## Warning in matrix(nrow = 30, ncol = 30, data = predict(fit, newdata =
## data.frame(xy), : data length differs from size of matrix: [2700 != 30 x 30]

# Predicted z-values, fitted points for droplines to surface
fitpoints <- predict(fit)   # Get predicted values
 
scatter3D(z = mpg, x = wt, y = disp, pch = 18, cex = 2, 
      theta = 20, phi = 20, ticktype = "detailed",
      xlab = "wt", ylab = "disp", zlab = "mpg", clab = "mpg", 
      surf = list(x = wt.pred, y = disp.pred, z = mpg.pred, 
                  facets = NA, fit = fitpoints),
      colkey = list(length = 0.8, width = 0.4),            
      main = "Miles per gallon as a function of wt. and disp.")

detach(mtcars)  # Remember to detach the data

Supplementary materials 2

A more handy tool for plotting coefficients and CI: dotwhisker (Dot-and-Whisker Plots)

# load required packages and data
# install.packages(c("dotwhisker", "broom", "dplyr"))
library(dotwhisker)
library(broom)
library(dplyr)
data(mtcars)

# Fit a simple model of two variables
model1 <- lm(mpg ~ wt + cyl, data = mtcars)

# Plot model1, the default CI is 95%
dwplot(model1)

# If you want to change the width of the CI to 99%, a more stringent standard.
dwplot(model1, conf.level = .99)

# Include two more models to display
model2 <- lm(mpg ~ wt + cyl + disp, data = mtcars)
model3 <- lm(mpg ~ wt + cyl + disp + gear, data = mtcars)

# You can use dwplot to compare the three models
dwplot(list(model1, model2, model3))

# for more details, check out the documentation of dotwhisker package: #https://cran.r-project.org/web/packages/dotwhisker/vignettes/dotwhisker-vignette.html

Statistical Learning: Week 2

H. K. Tseng

02/25/2025

Installing R and RStudio