https://github.com/DataScienceSpecialization/courses https://github.com/DataScienceSpecialization/courses/blob/master/07_RegressionModels%2F01_01_introduction%2Findex.md
https://class.coursera.org/regmods-006/wiki/Lecture_note_links
# Side-by-side histograms of the `child` and `parent` columns of `galton`.
# NOTE(review): `galton` is not created anywhere in the visible file; it
# presumably comes from an attached data package. Confirm it is loaded first.
par(mfrow = c(1, 2))
hist(galton$child, col = "blue", breaks = 100)
hist(galton$parent, col = "blue", breaks = 100)
What is the “center of mass”? It is the point where you could put your finger to balance the histogram.
Equivalently, pick the value mu that minimizes the total squared distance to all the points:
$\sum_{i=1}^{n} (Y_i - \mu)^2$, where $Y_i$ is the $i$-th child value.
This is least squares!
Copy and paste this:
library(manipulate)
library(ggplot2)
# Draw a histogram of galton$child with a vertical line at `mu`, and report
# the mean squared error of `mu` as a summary of the child values in the title.
#
# mu: candidate centre value (a single number).
# Reads the global `galton` data frame.
# Returns the ggplot object (printed when the call is evaluated at top level).
myHist <- function(mu) {
  squared_error <- (galton$child - mu)^2
  mse <- mean(squared_error)
  plot_title <- paste0("mu = ", mu, ", MSE (Mean Sq Err) = ", round(mse, 2))

  ggplot(galton, aes(x = child)) +
    geom_histogram(fill = "salmon", colour = "black", binwidth = 1) +
    geom_vline(xintercept = mu, size = 3) +
    ggtitle(plot_title)
}
# Interactive control: re-evaluates myHist(mu) as the slider moves mu
# between 62 and 74 in steps of 0.5.
manipulate(myHist(mu), mu = slider(min = 62, max = 74, step = 0.5))

# Plain scatterplot of child against parent.
plot(galton$parent, galton$child, pch = 19)

# Count how often each (child, parent) combination occurs.
freqData <- setNames(
  as.data.frame(table(galton$child, galton$parent)),
  c("child", "parent", "freq")
)

# Bubble plot: table() turned the values into factor levels, so convert them
# back to numbers; the marker size grows with the count of each combination.
plot(
  as.numeric(as.vector(freqData$parent)),
  as.numeric(as.vector(freqData$child)),
  pch = 21,
  col = "black",
  bg = "lightblue",
  cex = .15 * freqData$freq,
  xlab = "parent",
  ylab = "child"
)
Approach.
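Think of a line through the origin, y = x * beta. For each x_i the line predicts x_i * beta; choose beta to minimize the squared differences between the actual y_i and that prediction, i.e. $\sum_{i=1}^{n} (y_i - x_i \beta)^2$.
In other words: “actual Y minus the fitted line’s Y”.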
Play with beta. Copy and paste this:
# Bubble plot of the mean-centred galton data with a candidate regression
# line through the origin, child = beta * parent, and its MSE in the title.
#
# beta: slope of the candidate line (a single number).
# Reads the global `galton` data frame (columns `child` and `parent`).
# Returns, invisibly, the frequency table that was plotted
# (columns `child`, `parent`, `freq`).
myPlot <- function(beta) {
  y <- galton$child - mean(galton$child)
  x <- galton$parent - mean(galton$parent)

  # as.data.frame(table(...)) emits its columns in argument order, so the
  # child deviations (y) must be passed first to match the names assigned
  # below. Passing (x, y) labelled the parent values "child" and vice versa,
  # which put the data on the wrong axes relative to the line and the MSE.
  freqData <- as.data.frame(table(y, x))
  names(freqData) <- c("child", "parent", "freq")

  # table() stores the values as factor levels; convert them back to numbers.
  plot(
    as.numeric(as.vector(freqData$parent)),
    as.numeric(as.vector(freqData$child)),
    pch = 21, col = "black", bg = "lightblue",
    cex = .15 * freqData$freq,
    xlab = "parent",
    ylab = "child"
  )
  abline(0, beta, lwd = 3)
  # Mark the origin, i.e. the point of means after centring.
  points(0, 0, cex = 2, pch = 19)

  mse <- mean((y - beta * x)^2)
  title(paste("beta = ", beta, "mse = ", round(mse, 3)))

  invisible(freqData)
}
# Interactive control: re-evaluates myPlot(beta) as the slider moves beta
# between 0.6 and 1.2 in steps of 0.02.
manipulate(myPlot(beta), beta = slider(min = 0.6, max = 1.2, step = 0.02))
Y=B0+B1X
B1=Cor(Y,X)*Sd(Y)/Sd(X)
B0=Ybar-B1*Xbar
# Hand-computed least-squares fit of y on x using the father.son data:
# slope = cor * sd(y) / sd(x), intercept = mean(y) - slope * mean(x).
data(father.son)
x <- father.son$fheight  # predictor
y <- father.son$sheight  # outcome
# Alternative data (disabled):
# x <- galton$child
# y <- galton$parent
b1 <- cor(y, x) * sd(y) / sd(x)
b0 <- mean(y) - b1 * mean(x)
c(b0, b1)
## [1] 33.8866 0.5141
Is this the same as the result of the lm function? Verify:
# Coefficients from lm() for y regressed on x, for comparison with c(b0, b1).
coef(lm(y ~ x))
## (Intercept) x
## 33.8866 0.5141
Outcome vs. predictor relationship: now swap the roles, so that x is the outcome and y is the predictor.
# Same formulas with the roles swapped: x is the outcome, y the predictor.
b1 <- cor(y, x) * sd(x) / sd(y)
b0 <- mean(x) - b1 * mean(y)
c(b0, b1)
## [1] 34.1075 0.4889
Is this the same as the following?
# Coefficients from lm() for x regressed on y, for comparison with c(b0, b1).
coef(lm(x ~ y))
## (Intercept) y
## 34.1075 0.4889
Regression through the origin
# Centre both variables, then compute the slope of the least-squares line
# through the origin for the centred data: sum(yc * xc) / sum(xc^2).
yc <- y - mean(y)
xc <- x - mean(x)
b1 <- sum(yc * xc) / sum(xc^2)
b1
## [1] 0.5141
Is this the same as the following?
# Slope from lm() for y on x, for comparison with the centred-data slope b1.
coef(lm(y ~ x))[2]
## x
## 0.5141
# Correlation of the centred variables.
rho <- cor(xc, yc)
rho
## [1] 0.5013
# Regression-to-the-mean plot on standardised axes.
# The axis labels say "normalized" and the lines below use slopes rho and
# 1 / rho, which are the two regression slopes only when both variables are in
# standard-deviation units. xc and yc are merely centred, so scale them first.
x_std <- xc / sd(xc)
y_std <- yc / sd(yc)
plot(x_std, y_std, xlab = "Father (normalized)", ylab = "Son (normalized)")
# Identity line = perfect correlation.
abline(0, 1, col = "blue", lwd = 3)
# Father predicts son: if the father is tall, the son tends to be tall,
# but not as tall as the father.
abline(0, rho, col = "green", lwd = 3)
# Son predicts father: if the son is tall, the father tends to be tall,
# but not as tall as the son.
abline(0, 1 / rho, col = "red", lwd = 3)
# No relationship: reference axes through the origin.
abline(h = 0, col = "gray", lwd = 1)
abline(v = 0, col = "gray", lwd = 1)