library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Simulate a noise-free linear relationship: y is an exact linear function of
# x, so every point lies on the fitted line even though x itself is random.
x <- rnorm(100)
y <- 1.5 * x + 3
dat <- tibble(x, y)

# Scatter plot with the least-squares line drawn in grey (no confidence band).
ggplot(data = dat, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "grey") +
  ggtitle("Notice the perfect fit to the Data, also noting the x variable is random") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# Pearson correlation between the two simulated vectors.
cor(x, y, method = "pearson")
## [1] 1
# Linear signal 1.5 * x + 3 plus Gaussian noise whose standard deviation is
# `sigma`.
x <- rnorm(100)
sigma <- 2
noise <- rnorm(n = 100, sd = sigma)
y <- 1.5 * x + 3 + noise
dat <- tibble(x, y)

# Plain scatter plot of the noisy data.
ggplot(dat, aes(x = x, y = y)) +
  geom_point() +
  theme_bw()
What is going on here?
# Linear signal 1.5 * x + 3 plus Gaussian noise with standard deviation 0.5.
x <- rnorm(100)
y <- 1.5 * x + 3 + rnorm(100, sd = 0.5)
dat <- tibble(x, y)

# Scatter plot with the least-squares line and its default confidence band.
ggplot(dat, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# Fit a simple linear regression of y on x and print its summary.
fit <- lm(formula = y ~ x, data = dat)
summary(fit)
##
## Call:
## lm(formula = y ~ x, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.30917 -0.27336 0.00675 0.27185 1.03212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.07268 0.04535 67.75 <2e-16 ***
## x 1.48324 0.03977 37.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4533 on 98 degrees of freedom
## Multiple R-squared: 0.9342, Adjusted R-squared: 0.9335
## F-statistic: 1391 on 1 and 98 DF, p-value: < 2.2e-16
# Collect the residuals and fitted values from `fit` next to the raw data.
rz <- residuals(fit)
pd <- fitted(fit)
data2 <- tibble(x = x, y = y, rz = rz, pd = pd)
data2
## # A tibble: 100 × 4
## x y rz pd
## <dbl> <dbl> <dbl> <dbl>
## 1 -1.06 1.08 -0.427 1.50
## 2 -0.689 1.42 -0.633 2.05
## 3 -0.321 2.77 0.173 2.60
## 4 0.150 2.98 -0.313 3.30
## 5 -1.92 0.338 0.115 0.223
## 6 -1.85 0.205 -0.123 0.328
## 7 0.520 3.96 0.118 3.84
## 8 -0.899 2.05 0.313 1.74
## 9 -1.09 1.98 0.523 1.46
## 10 -1.54 0.689 -0.106 0.795
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
# Residual plot: filled points are observations, open circles are fitted
# values, and the faint vertical segments joining them are the residuals.
ggplot(data2, mapping = aes(x = x, y = y)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
  geom_segment(mapping = aes(xend = x, yend = pd), alpha = 0.2) +
  geom_point() +
  geom_point(mapping = aes(y = pd), shape = 1) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
Let’s look at some data with which you are already familiar.
# City mileage against engine displacement, with the least-squares line in
# pink and no confidence band.
ggplot(mpg, aes(x = displ, y = cty)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "pink") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# Keep only the two columns used by the model. `select` is namespace-qualified
# because attaching UsingR later in this script loads MASS, whose `select()`
# masks dplyr's; the bare call would fail if this chunk were re-run afterwards.
d <- mpg %>% dplyr::select(displ, cty)

# Regress city mileage on displacement and keep the fitted values.
fit <- lm(cty ~ displ, data = d)
pred <- predict(fit)

# Residual plot: filled points are observations, open circles are fitted
# values, and the faint vertical segments joining them are the residuals.
ggplot(d, aes(x = displ, y = cty)) +
  geom_smooth(method = "lm", se = FALSE, color = "pink") +
  geom_segment(aes(xend = displ, yend = pred), alpha = 0.2) +
  geom_point() +
  geom_point(aes(y = pred), shape = 1) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# Print the coefficient table and fit statistics for `fit`.
summary(object = fit)
##
## Call:
## lm(formula = cty ~ displ, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3109 -1.4695 -0.2566 1.1087 14.0064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.9915 0.4821 53.91 <2e-16 ***
## displ -2.6305 0.1302 -20.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.567 on 232 degrees of freedom
## Multiple R-squared: 0.6376, Adjusted R-squared: 0.6361
## F-statistic: 408.2 on 1 and 232 DF, p-value: < 2.2e-16
#install.packages("UsingR")
library(UsingR)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
##
## cancer
# Preview the first six rows of the galton data.
head(galton, n = 6L)
## child parent
## 1 61.7 70.5
## 2 61.7 68.5
## 3 61.7 65.5
## 4 61.7 64.5
## 5 61.7 64.0
## 6 62.2 67.5
# Child height against parent height, with the least-squares line and its
# default confidence band.
ggplot(galton, aes(x = parent, y = child)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
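Mathematical expression: child = 1.4952 (parent) + 2.98713. (Note: these coefficients match the simulated `x`/`y` data rather than `galton`; the commonly reported least-squares fit of child on parent height for this data set is approximately child = 0.646 (parent) + 23.94.)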
# Regress child height on parent height. The formula must name the columns of
# `galton` (`child`, `parent`); with `y ~ x`, lm() finds no such columns in
# the data and silently falls back to the global simulated `x` and `y`.
# NOTE(review): the summary output recorded below (98 degrees of freedom,
# slope ~1.48) was produced by the old `y ~ x` call and needs regenerating.
fitgalton <- lm(child ~ parent, data = galton)
summary(fitgalton)
##
## Call:
## lm(formula = y ~ x, data = galton)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.30917 -0.27336 0.00675 0.27185 1.03212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.07268 0.04535 67.75 <2e-16 ***
## x 1.48324 0.03977 37.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4533 on 98 degrees of freedom
## Multiple R-squared: 0.9342, Adjusted R-squared: 0.9335
## F-statistic: 1391 on 1 and 98 DF, p-value: < 2.2e-16
Parent height explains 89% of the variation in people's heights, so yes, it is useful.
(71 in + 1.08 × 63 in) / 2 = 69.52 in, or about 5 ft 9.5 in.
It is off by 5 inches from my current height.