library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.1
## v readr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(broom)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.0.5
# Load the built-in `cars` dataset (columns `speed` and `dist`, 50 rows).
# The data frame is deliberately not attach()ed: every call below names
# `cars` explicitly, so nothing is added to the search path and no other
# object can silently mask `speed` or `dist`.
data(cars)
dim(cars)
## [1] 50 2
head(cars)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
# Simple linear regression of stopping distance on speed
model <- lm(dist ~ speed, data = cars)
model
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Coefficients:
## (Intercept) speed
## -17.579 3.932
summary(model)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
# Rule of thumb: SE should be 5-10x smaller than the coefficient
# speed: 3.9 / 0.42 = 9, so ok
# intercept: 17.6 / 6.7 = 2.6; the intercept may vary
# beta0 and beta1 coefficients are significant
# Residual std error = 15.4
# |1Q| * 1.5 = 9.5 * 1.5 = 14.2, close to 15.4, so ok
# 3Q * 1.5 = 9.2 * 1.5 = 13.8, close to 15.4, so ok
# model accounts for 65% of the variation
# Scatter plot with the fitted regression line. The formula interface looks
# up `speed` and `dist` inside `cars`, so no attach() is required.
plot(dist ~ speed, data = cars)
abline(model)

# Residuals against fitted values: a visual check for leftover structure
plot(fitted(model), resid(model))

# residuals are uniformly scattered, no patterns, so ok
# Normal Q-Q plot of the residuals. qqline() adds the reference line through
# the quartiles, which is what makes departures in the tails visible; calling
# qqnorm() a second time would only redraw the same points.
qqnorm(resid(model))
qqline(resid(model))

# QQ plot is ok; however, the points at both ends may be worth investigating
# here is some code that gives more diagnostic plots
# Per-observation diagnostics for the fitted model: augment() returns the
# original columns plus .fitted, .resid, .hat, .sigma, .cooksd and .std.resid
# (see the printed tibble below).
model.diag.metrics <- broom::augment(model)
head(model.diag.metrics)
## # A tibble: 6 x 8
## dist speed .fitted .resid .hat .sigma .cooksd .std.resid
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 4 -1.85 3.85 0.115 15.5 0.00459 0.266
## 2 10 4 -1.85 11.8 0.115 15.4 0.0435 0.819
## 3 4 7 9.95 -5.95 0.0715 15.5 0.00620 -0.401
## 4 22 7 9.95 12.1 0.0715 15.4 0.0255 0.813
## 5 16 8 13.9 2.12 0.0600 15.5 0.000645 0.142
## 6 10 9 17.8 -7.81 0.0499 15.5 0.00713 -0.521
# Observed points, the least-squares line (no confidence band), and a thin red
# vertical segment from each observation to its fitted value.
ggplot(data = model.diag.metrics, mapping = aes(x = speed, y = dist)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  geom_segment(
    mapping = aes(xend = speed, yend = .fitted),
    color = "red",
    size = 0.3
  )
## `geom_smooth()` using formula 'y ~ x'

# Diagnostic plots for the fitted model via autoplot(); presumably this uses
# ggfortify's lm method (ggfortify is loaded at the top) - confirm. The notes
# further down discuss the residuals-vs-fitted, normal Q-Q, scale-location
# and leverage panels.
autoplot(model)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

# this shows no pattern in residuals vs fitted, so ok
# normal QQ plot is ok
# scale-location plot, which addresses heteroscedasticity, is ok
# Leverage - may investigate the 2 leverage points