library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Switch to the course data folder only when it exists on this machine. The
# path is specific to the author's computer, and the visible part of this
# script builds its data inline rather than reading a file, so it can run
# from any working directory when the folder is absent.
data_dir <- "/Users/se776257/OneDrive - University of Central Florida/Desktop/Prof. An/02 Teaching/2024 Spring/PAD 7754 Quantitative Methods/Class Materials/R data/"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
Making data (see Slide 5)
# Identifier for each of the five families in the example data.
family <- c(1, 2, 3, 4, 5)
print(family)
## [1] 1 2 3 4 5
Make income and savings variables
# Income and savings for the same five families, one value per family.
savings <- c(0.6, 1.2, 1, 0.7, 0.3)
income <- c(8, 11, 9, 6, 6)
Make a dataframe with the three variables
# Combine the three vectors column-wise; each row describes one family.
df <- data.frame(family = family, income = income, savings = savings)
print(df)
## family income savings
## 1 1 8 0.6
## 2 2 11 1.2
## 3 3 9 1.0
## 4 4 6 0.7
## 5 5 6 0.3
You want to examine the relationship between income and savings
# Scatter plot of savings (y) against income (x), one point per family.
ggplot(df, aes(x = income, y = savings)) +
  geom_point()
Let’s add a regression line
# Same scatter plot with the fitted straight line (method = lm) drawn on top.
# The aesthetics are declared once in ggplot() and shared by both layers;
# se = FALSE leaves out the confidence band around the line.
ggplot(df, aes(x = income, y = savings)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Least Squares Regression
Let’s call our fitted model model1. Then we have:
# Regress savings on income with lm() and keep the fitted model.
model1 <- lm(savings ~ income, data = df)
# Print the residuals, coefficient table, R-squared and F-statistic.
summary(model1)
##
## Call:
## lm(formula = savings ~ income, data = df)
##
## Residuals:
## 1 2 3 4 5
## -0.160000 0.006667 0.095556 0.228889 -0.171111
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.39556 0.38180 -1.036 0.3764
## income 0.14444 0.04644 3.111 0.0529 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.197 on 3 degrees of freedom
## Multiple R-squared: 0.7633, Adjusted R-squared: 0.6844
## F-statistic: 9.676 on 1 and 3 DF, p-value: 0.05286
“lm” stands for linear model, so the first letter is a lowercase L (not the digit 1). The first line fits the linear regression and stores the result in model1. The second line prints a summary of the results.
# Residuals and plots of the residuals
We can compute the residuals after fitting the model:
# Extract the residuals of model1 and store them as a new column of df so
# they can be plotted against income and checked for normality.
df$ResidSavings <- residuals(model1)
We can plot the residuals as a function of ‘income’ to check for linearity, and for common standard deviation (i.e. that the spread of the residuals about 0 is similar for all values of income):
# Residuals against income. The red horizontal line marks zero, and the blue
# curve is a loess smooth of the residuals drawn without a confidence band.
ggplot(df, aes(x = income, y = ResidSavings)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  geom_smooth(method = loess, se = FALSE, color = "blue")
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 5.975
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2.025
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 9.1506
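geom_point above creates a scatter plot. geom_hline adds a
horizontal line at y = 0; its color is black by default, so I
switched it to red to make it more salient. Finally, geom_smooth adds a
smooth fit (not necessarily linear) using the loess smoothing method
(method=loess). The simpleLoess warnings shown above appear because the
data set has only five observations, which is too few for a stable
loess fit; they do not indicate a problem with the regression itself.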
Finally, we can do a QQ plot of the residuals to check that the response varies normally about the regression line:
# Normal Q-Q plot of the residuals, with a red reference line added by qqline().
qqnorm(df[["ResidSavings"]])
qqline(df[["ResidSavings"]], col = "red")