Loading galton data - Height of Parent Vs Height of Child
library(UsingR)
data(galton)
head(galton)
## child parent
## 1 61.7 70.5
## 2 61.7 68.5
## 3 61.7 65.5
## 4 61.7 64.5
## 5 61.7 64.0
## 6 62.2 67.5
summary(galton)
## child parent
## Min. :61.70 Min. :64.00
## 1st Qu.:66.20 1st Qu.:67.50
## Median :68.20 Median :68.50
## Mean :68.09 Mean :68.31
## 3rd Qu.:70.20 3rd Qu.:69.50
## Max. :73.70 Max. :73.00
plot(galton$parent,galton$child,pch=19,col="blue")
abline(h=mean(galton$child),col="blue",lwd=2)
abline(v=mean(galton$parent),col="red",lwd=3)
model fit : y = beta0 + beta1 * X
outcome<-galton$child
predictor<-galton$parent
cor_value<-cor(outcome,predictor)
sd_predictor<-sd(predictor)
sd_outcome<-sd(outcome)
beta1<-cor_value* (sd_outcome/sd_predictor)
beta0<-mean(outcome) - (beta1 * mean(predictor))
cat("beta1: ",beta1," , beta0: ",beta0)
## beta1: 0.6462906 , beta0: 23.94153
It should be same as the model fit from lm
fit<-lm(child ~ parent, data = galton)
coef(fit)
## (Intercept) parent
## 23.9415302 0.6462906
summary(fit)
##
## Call:
## lm(formula = child ~ parent, data = galton)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8050 -1.3661 0.0487 1.6339 5.9264
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.94153 2.81088 8.517 <2e-16 ***
## parent 0.64629 0.04114 15.711 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.239 on 926 degrees of freedom
## Multiple R-squared: 0.2105, Adjusted R-squared: 0.2096
## F-statistic: 246.8 on 1 and 926 DF, p-value: < 2.2e-16
If we fit regression through mean, it will have the same slope
x_m<-galton$parent - mean(galton$parent)
y_m<-galton$child - mean(galton$child)
beta1_m<-cor(y_m,x_m)*sd(y_m)/sd(x_m)
beta1_m
## [1] 0.6462906
fit_m<-lm(y_m ~ x_m)
coef(fit_m)[2]
## x_m
## 0.6462906
library(ggplot2)
qplot(x=predictor,y=outcome)+geom_smooth(method = "lm")
Consider the model : Yi = β0 + β1 * Xi + εi
where, ε = N(0,σ^2) is the error term.
Residuals: Variation left unexplained = Observed Value - Predicted Value
Σ(ei * Xi) = 0
Residual Plots should be random and should not have a pattern otherwise it means that the model is not accurate.
e <- outcome - predicted(outcome) or (y - ŷ)
y_hat <- predict(fit)
e <- resid(fit)
e_test <- galton$child - y_hat
cat("Mean from resid: ",mean(e))
## Mean from resid: -2.359884e-15
cat("Mean from manual computation: ",mean(e_test))
## Mean from manual computation: -8.67773e-13
plot(y=e,x=predictor)
abline(h=0,col="red",lwd=2)
Variance of Residual around the regression line = Avg squared Residual = (1/n-2) * Σ(ei2)
Std Error of Residual (σ) = sqrt (Avg squared Residual)
In R, Std Error of Residual = summary(fit)$sigma
summary(fit)$sigma
## [1] 2.238547
Total Variation = Residual Variation + Regression Variation
cat("Total Variation: ",var(outcome))
## Total Variation: 6.340029
cat("Residual Varaition: ",var(fit$residuals))
## Residual Varaition: 5.005688
cat("Regression Variation: ",var(predict(fit)))
## Regression Variation: 1.334341
R Squared: % total variability explained by a regression model
Rsquared = Regression Variation / Total Variation
newParents <- data.frame(parent=c(65,66,67))
predict(fit,newdata=newParents)
## 1 2 3
## 65.95042 66.59671 67.24300
Confidence Interval:
- Shows the confidence interval around the estimated line at that particular value of X
predict(fit,newdata=newParents,interval = ("confidence"))
## fit lwr upr
## 1 65.95042 65.64690 66.25394
## 2 66.59671 66.36108 66.83234
## 3 67.24300 67.06425 67.42175
Prediction Interval:
- Shows the cofidence interval for new Y at that particular value of X.
- Prediction intervals must account for both the uncertainty in knowing the value of the population mean, plus data scatter.
- So a prediction interval is always wider than a confidence interval.
predict(fit,newdata=newParents,interval = ("prediction"))
## fit lwr upr
## 1 65.95042 61.54673 70.35410
## 2 66.59671 62.19718 70.99624
## 3 67.24300 62.84615 71.63985