library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Set the working directory

# Switch to the course folder given by the hard-coded path below. The path only
# exists on one machine, so a failure is reported as a message instead of
# stopping the script; nothing in the steps shown below reads from disk.
data_dir <- "/Users/se776257/OneDrive - University of Central Florida/Desktop/Prof. An/02 Teaching/2024 Spring/PAD 7754 Quantitative Methods/Class Materials/R data/"
invisible(tryCatch(
  setwd(data_dir),
  error = function(e) {
    message("Could not change working directory: ", conditionMessage(e))
  }
))

Making data (see Slide 5)

# Identifier for each of the five families, stored as a numeric vector.
family <- as.numeric(1:5)
family
## [1] 1 2 3 4 5

Make income and savings variables

# Income and savings, one value per family in the same order as `family`.
savings <- c(0.6, 1.2, 1.0, 0.7, 0.3)
income <- c(8, 11, 9, 6, 6)

Make a dataframe with the three variables

# Combine the three vectors into one data frame (one row per family), then
# print it.
df <- data.frame(
  family = family,
  income = income,
  savings = savings
)
df
##   family income savings
## 1      1      8     0.6
## 2      2     11     1.2
## 3      3      9     1.0
## 4      4      6     0.7
## 5      5      6     0.3

You want to examine the relationship between income and savings

# Scatter plot of savings (y) against income (x).
ggplot(df, aes(x = income, y = savings)) +
  geom_point()

Let’s add a regression line

# Same scatter plot with the fitted least-squares line drawn on top;
# se = FALSE leaves out the confidence band around the line.
ggplot(df, aes(x = income, y = savings)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Least Squares Regression

Let’s call our fitted model `model1`. Then we have:

# Fit the simple linear regression of savings on income by least squares,
# then print the coefficient table and fit statistics.
model1 <- lm(savings ~ income, data = df)

summary(model1)
## 
## Call:
## lm(formula = savings ~ income, data = df)
## 
## Residuals:
##         1         2         3         4         5 
## -0.160000  0.006667  0.095556  0.228889 -0.171111 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.39556    0.38180  -1.036   0.3764  
## income       0.14444    0.04644   3.111   0.0529 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.197 on 3 degrees of freedom
## Multiple R-squared:  0.7633, Adjusted R-squared:  0.6844 
## F-statistic: 9.676 on 1 and 3 DF,  p-value: 0.05286

“lm” stands for “linear model”, so its first character is a lowercase letter L, not the digit 1. The first line fits the linear regression. The second line prints the results.

# Residuals and plots of the residuals

We can compute the residuals after fitting the model:

# Store the residuals from model1 as a new column of df, one per family.
df$ResidSavings <- resid(model1)

We can plot the residuals as a function of ‘income’ to check for linearity, and for common standard deviation (i.e. that the spread of the residuals about 0 is similar for all values of income):

# Residuals against income: points, a red reference line at zero, and a blue
# loess curve to reveal any remaining pattern.
ggplot(df, aes(x = income, y = ResidSavings)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  geom_smooth(method = loess, se = FALSE, color = "blue")
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small.  fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 5.975
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2.025
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 9.1506

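geom_point above creates a scatter plot. The geom_hline adds a horizontal line at y=0; its color is black by default, so I switched it to red to make it more salient. Finally, geom_smooth adds a smooth fit (not necessarily linear) using the loess smoothing method (method=loess). The warnings printed above appear because loess has only five observations to work with, which is too few for its local fits (“fewer data values than degrees of freedom”).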

Finally, we can do a QQ plot of the residuals to check that the response varies normally about the regression line:

# Normal Q-Q plot of the residuals, with a red reference line added by qqline().
qqnorm(df[["ResidSavings"]])
qqline(df[["ResidSavings"]], col = "red")