library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Simulate a noise-free linear relationship: y is an exact linear function
# of 100 standard-normal draws of x.
x <- rnorm(n = 100)
y <- 3 + 1.5 * x
dat <- tibble(x = x, y = y)

# Scatter the points and overlay the least-squares line without its
# confidence band.
dat %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "grey") +
  ggtitle("Notice the perfect fit to the Data, aslo noting the x variable is random") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# Correlation between x and y; with no noise term it is exactly 1.
cor(x, y)
## [1] 1
# Linear signal 1.5 * x + 3 plus Gaussian noise whose standard deviation is
# `sigma`. x is drawn first, then the noise, so the RNG stream order matters.
x <- rnorm(n = 100)
sigma <- 2
y <- 3 + 1.5 * x + rnorm(n = 100, sd = sigma)
dat <- tibble(x = x, y = y)

# Plain scatter plot of the noisy sample.
dat %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point() +
  theme_bw()

What is going on here?

# Fresh sample: linear signal 1.5 * x + 3 plus Gaussian noise with sd 0.5.
x <- rnorm(n = 100)
y <- 3 + 1.5 * x + rnorm(n = 100, sd = 0.5)
dat <- tibble(x = x, y = y)

# Scatter plot with the fitted least-squares line; the confidence band is
# left at geom_smooth()'s default (shown).
dat %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# Regress y on x using the columns of `dat`, then print the coefficient
# table and the overall fit statistics.
fit <- lm(formula = y ~ x, data = dat)
summary(fit)
## 
## Call:
## lm(formula = y ~ x, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.30917 -0.27336  0.00675  0.27185  1.03212 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.07268    0.04535   67.75   <2e-16 ***
## x            1.48324    0.03977   37.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4533 on 98 degrees of freedom
## Multiple R-squared:  0.9342, Adjusted R-squared:  0.9335 
## F-statistic:  1391 on 1 and 98 DF,  p-value: < 2.2e-16
# Collect the observed values together with the model's residuals (rz) and
# fitted values (pd). `x` and `y` here are the global vectors that were used
# to build `dat`, so the rows line up with the model frame.
rz <- resid(fit)
pd <- fitted.values(fit)
data2 <- tibble(x = x, y = y, rz = rz, pd = pd)
data2
## # A tibble: 100 × 4
##         x     y     rz    pd
##     <dbl> <dbl>  <dbl> <dbl>
##  1 -1.06  1.08  -0.427 1.50 
##  2 -0.689 1.42  -0.633 2.05 
##  3 -0.321 2.77   0.173 2.60 
##  4  0.150 2.98  -0.313 3.30 
##  5 -1.92  0.338  0.115 0.223
##  6 -1.85  0.205 -0.123 0.328
##  7  0.520 3.96   0.118 3.84 
##  8 -0.899 2.05   0.313 1.74 
##  9 -1.09  1.98   0.523 1.46 
## 10 -1.54  0.689 -0.106 0.795
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
# Visualise the residuals: grey regression line, a faint vertical segment
# from each observation to its fitted value, solid points for the observed
# y and hollow points (shape 1) for the fitted y.
data2 %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
  geom_segment(mapping = aes(xend = x, yend = pd), alpha = 0.2) +
  geom_point() +
  geom_point(mapping = aes(y = pd), shape = 1) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

Let’s look at some data with which you are already familiar.

# City mileage against engine displacement for the mpg data, with the
# least-squares line drawn in pink and no confidence band.
ggplot(data = mpg, mapping = aes(x = displ, y = cty)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "pink") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# Keep only the two columns used by the model. `select` is namespace-
# qualified because attaching UsingR later in this file loads MASS, whose
# `select()` masks dplyr's; an unqualified call then fails if this chunk is
# re-run in the same session.
d <- mpg %>% dplyr::select(displ, cty)

# Regress city mileage on displacement. Note this overwrites the `fit`
# object created for the simulated data.
fit <- lm(cty ~ displ, data = d)

# Fitted values for the rows of `d` (predict() with no newdata).
pred <- predict(fit)

# Residual plot: pink regression line, faint segments from each observation
# to its fitted value, solid observed points and hollow fitted points.
# `pred` is not a column of `d`; aes() picks it up from the calling
# environment, so it must stay in the same row order as `d`.
ggplot(d, aes(x = displ, y = cty)) +
  geom_smooth(method = "lm", se = FALSE, color = "pink") +
  geom_segment(aes(xend = displ, yend = pred), alpha = 0.2) +
  geom_point() +
  geom_point(aes(y = pred), shape = 1) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# Coefficient table and fit statistics for the mpg model.
summary(fit)
## 
## Call:
## lm(formula = cty ~ displ, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3109 -1.4695 -0.2566  1.1087 14.0064 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.9915     0.4821   53.91   <2e-16 ***
## displ        -2.6305     0.1302  -20.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.567 on 232 degrees of freedom
## Multiple R-squared:  0.6376, Adjusted R-squared:  0.6361 
## F-statistic: 408.2 on 1 and 232 DF,  p-value: < 2.2e-16
#install.packages("UsingR")
library(UsingR)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
## 
##     cancer
# Preview the first six rows of galton (columns `child` and `parent`).
head(galton, n = 6L)
##   child parent
## 1  61.7   70.5
## 2  61.7   68.5
## 3  61.7   65.5
## 4  61.7   64.5
## 5  61.7   64.0
## 6  62.2   67.5
# Child height against parent height, with the least-squares line and its
# default confidence band.
galton %>%
  ggplot(mapping = aes(x = parent, y = child)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

Question 1&2

Mathmatical Expression: child = 1.4952 (parent) + 2.98713

# Regress child height on parent height using the columns of `galton`.
#
# The formula must name columns of `galton`. The earlier formula `y ~ x`
# matched no column, so lm() silently fell back to the simulated global
# vectors `x` and `y` and simply refit the simulated model. The summary
# printed below (formula `y ~ x`, 98 residual degrees of freedom, identical
# coefficients to the simulated fit) is that stale result; re-run this chunk
# to regenerate it.
fitgalton <- lm(child ~ parent, data = galton)
summary(fitgalton)
## 
## Call:
## lm(formula = y ~ x, data = galton)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.30917 -0.27336  0.00675  0.27185  1.03212 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.07268    0.04535   67.75   <2e-16 ***
## x            1.48324    0.03977   37.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4533 on 98 degrees of freedom
## Multiple R-squared:  0.9342, Adjusted R-squared:  0.9335 
## F-statistic:  1391 on 1 and 98 DF,  p-value: < 2.2e-16

Question 3

The parent ratio is it explains 89% of the variation of folks heights. So, yes it is useful.

Question 4

(71 in + 1.08 (63in))/2 = 69.52 or 5ft 9.5in

Question 5

It is off by 5 inches off my current height.