Loading data and basic analysis

Loading galton data - Height of Parent Vs Height of Child

library(UsingR)
data(galton)
head(galton)
##   child parent
## 1  61.7   70.5
## 2  61.7   68.5
## 3  61.7   65.5
## 4  61.7   64.5
## 5  61.7   64.0
## 6  62.2   67.5
summary(galton)
##      child           parent     
##  Min.   :61.70   Min.   :64.00  
##  1st Qu.:66.20   1st Qu.:67.50  
##  Median :68.20   Median :68.50  
##  Mean   :68.09   Mean   :68.31  
##  3rd Qu.:70.20   3rd Qu.:69.50  
##  Max.   :73.70   Max.   :73.00
plot(galton$parent,galton$child,pch=19,col="blue")
abline(h=mean(galton$child),col="blue",lwd=2)
abline(v=mean(galton$parent),col="red",lwd=3)

Fitting the model

model fit : y = beta0 + beta1 * X

outcome<-galton$child
predictor<-galton$parent
cor_value<-cor(outcome,predictor)
sd_predictor<-sd(predictor)
sd_outcome<-sd(outcome)
beta1<-cor_value* (sd_outcome/sd_predictor)
beta0<-mean(outcome) - (beta1 * mean(predictor)) 
cat("beta1: ",beta1," , beta0: ",beta0)
## beta1:  0.6462906  , beta0:  23.94153

It should be same as the model fit from lm

fit<-lm(child ~ parent, data = galton)
coef(fit)
## (Intercept)      parent 
##  23.9415302   0.6462906
summary(fit)
## 
## Call:
## lm(formula = child ~ parent, data = galton)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8050 -1.3661  0.0487  1.6339  5.9264 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.94153    2.81088   8.517   <2e-16 ***
## parent       0.64629    0.04114  15.711   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.239 on 926 degrees of freedom
## Multiple R-squared:  0.2105, Adjusted R-squared:  0.2096 
## F-statistic: 246.8 on 1 and 926 DF,  p-value: < 2.2e-16

Regression through Mean

If we fit regression through mean, it will have the same slope

x_m<-galton$parent - mean(galton$parent)
y_m<-galton$child - mean(galton$child)  
beta1_m<-cor(y_m,x_m)*sd(y_m)/sd(x_m)
beta1_m
## [1] 0.6462906
fit_m<-lm(y_m ~ x_m)
coef(fit_m)[2]
##       x_m 
## 0.6462906

Plotting Linear Model

library(ggplot2)
qplot(x=predictor,y=outcome)+geom_smooth(method = "lm")

Statistics of Linear Regression

Consider the model : Yi = β0 + β1 * Xi + εi
where, ε = N(0,σ^2) is the error term.

Residuals

Residuals: Variation left unexplained = Observed Value - Predicted Value

e <- outcome - predicted(outcome) or (y - ŷ)

y_hat <- predict(fit)
e <- resid(fit)
e_test <- galton$child - y_hat
cat("Mean from resid: ",mean(e))
## Mean from resid:  -2.359884e-15
cat("Mean from manual computation: ",mean(e_test))
## Mean from manual computation:  -8.67773e-13
plot(y=e,x=predictor)
abline(h=0,col="red",lwd=2)

Variance of Residual around the regression line = Avg squared Residual = (1/n-2) * Σ(ei2)
Std Error of Residual (σ) = sqrt (Avg squared Residual)
In R, Std Error of Residual = summary(fit)$sigma

summary(fit)$sigma
## [1] 2.238547

Total Variation = Residual Variation + Regression Variation

cat("Total Variation: ",var(outcome))
## Total Variation:  6.340029
cat("Residual Varaition: ",var(fit$residuals))
## Residual Varaition:  5.005688
cat("Regression Variation: ",var(predict(fit)))
## Regression Variation:  1.334341

R Squared: % total variability explained by a regression model
Rsquared = Regression Variation / Total Variation

Predicting With Linear Regression

newParents <- data.frame(parent=c(65,66,67))
predict(fit,newdata=newParents)
##        1        2        3 
## 65.95042 66.59671 67.24300

Confidence Interval:
- Shows the confidence interval around the estimated line at that particular value of X

predict(fit,newdata=newParents,interval = ("confidence"))
##        fit      lwr      upr
## 1 65.95042 65.64690 66.25394
## 2 66.59671 66.36108 66.83234
## 3 67.24300 67.06425 67.42175

Prediction Interval:
- Shows the cofidence interval for new Y at that particular value of X.
- Prediction intervals must account for both the uncertainty in knowing the value of the population mean, plus data scatter.
- So a prediction interval is always wider than a confidence interval.

predict(fit,newdata=newParents,interval = ("prediction"))
##        fit      lwr      upr
## 1 65.95042 61.54673 70.35410
## 2 66.59671 62.19718 70.99624
## 3 67.24300 62.84615 71.63985