https://github.com/DataScienceSpecialization/courses https://github.com/DataScienceSpecialization/courses/blob/master/07_RegressionModels%2F01_01_introduction%2Findex.md

https://class.coursera.org/regmods-006/wiki/Lecture_note_links

# Side-by-side histograms of the child and parent heights.
# NOTE(review): assumes the `galton` data frame is already loaded -- confirm.
par(mfrow = c(1, 2))
hist(galton$child, col = "blue", breaks = 100)
hist(galton$parent, col = "blue", breaks = 100)

plot of chunk unnamed-chunk-2

What is the “center of mass”? It is the point where you could put your finger to balance the histogram.

Minimize the distance from all other points.

Choose $\mu$ to minimize $\sum_{i=1}^{n} (Y_i - \mu)^2$, where $Y_i$ is the $i$-th child height.

Least Square!

Plotting and Manipulating

copy paste this

library(manipulate)
library(ggplot2)
# Histogram of the child heights with a vertical marker at `mu`.
# The title reports `mu` and the mean squared distance of the child heights
# from `mu`, rounded to 2 decimals.
#   mu: candidate value for the "center" of the child heights.
# Returns the ggplot object.
myHist <- function(mu){
    mean_sq_err <- mean((galton$child - mu)^2)
    plot_title <- paste0("mu = ", mu, ", MSE (Mean Sq Err) = ", round(mean_sq_err, 2))
    ggplot(galton, aes(x = child)) +
        geom_histogram(fill = "salmon", colour = "black", binwidth = 1) +
        geom_vline(xintercept = mu, size = 3) +
        ggtitle(plot_title)
}
# Interactive slider for mu from 62 to 74 in steps of 0.5.
manipulate(myHist(mu), mu = slider(min = 62, max = 74, step = 0.5))

Comparing

# Scatter plot of child height against parent height, drawn with solid dots.
plot(x = galton$parent, y = galton$child, pch = 19)

plot of chunk unnamed-chunk-3

# Count how often each (child, parent) height pair occurs, then draw one
# bubble per pair whose size is proportional to that count.
freqData <- as.data.frame(table(galton$child, galton$parent))
colnames(freqData) <- c("child", "parent", "freq")
# The tabulated heights come back as factors; go through character to
# recover the numeric values for plotting.
with(freqData, plot(
  as.numeric(as.character(parent)),
  as.numeric(as.character(child)),
  pch = 21, col = "black", bg = "lightblue",
  cex = .15 * freq,
  xlab = "parent", ylab = "child"
))

plot of chunk unnamed-chunk-4

Fit - Regression from the origin

Approach.

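Think of a line through the origin, $y = x\beta$. For each $x_i$ the line predicts $x_i\beta$; choose $\beta$ to minimize the squared differences $\sum_{i=1}^{n} (y_i - x_i\beta)^2$.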

“Actual Y - Predicted Line’s Y ?”

plot of chunk freqGalton

Play with beta. copy paste this.

# Bubble plot of mean-centered parent heights (x axis) against mean-centered
# child heights (y axis), overlaid with a line through the origin of slope
# `beta`. The title shows `beta` and the mean squared error of that line.
#   beta: slope of the candidate regression-through-the-origin line.
# Draws on the current graphics device; called for its side effect.
myPlot <- function(beta){
  y <- galton$child - mean(galton$child)
  x <- galton$parent - mean(galton$parent)
  # table(x, y) produces its columns in argument order, i.e. (x, y) =
  # (parent, child). The names must follow that order, otherwise the axes
  # are swapped relative to the line and the MSE computed below.
  freqData <- as.data.frame(table(x, y))
  names(freqData) <- c("parent", "child", "freq")
  plot(
    as.numeric(as.vector(freqData$parent)),
    as.numeric(as.vector(freqData$child)),
    pch = 21, col = "black", bg = "lightblue",
    cex = .15 * freqData$freq,
    xlab = "parent",
    ylab = "child"
    )
  abline(0, beta, lwd = 3)
  # Mark the origin, which every candidate line passes through.
  points(0, 0, cex = 2, pch = 19)
  mse <- mean( (y - beta * x)^2 )
  title(paste("beta = ", beta, "mse = ", round(mse, 3)))
}
# Interactive slider for beta from 0.6 to 1.2 in steps of 0.02.
manipulate(myPlot(beta), beta = slider(min = 0.6, max = 1.2, step = 0.02))

Linear Least Sq Calculation

Y=B0+B1X

B1=Cor(Y,X)*Sd(Y)/Sd(X)
B0=Ybar-B1*Xbar
# Least-squares intercept and slope computed by hand for son height (y)
# against father height (x).
# NOTE(review): assumes the package providing `father.son` is attached -- confirm.
data(father.son)
x <- father.son[["fheight"]]
y <- father.son[["sheight"]]

# Alternative data set:
# x <- galton$child
# y <- galton$parent

# Slope: the correlation rescaled by the ratio of standard deviations.
b1 <- cor(y, x) * sd(y) / sd(x)
# Intercept: chosen so the line passes through (mean(x), mean(y)).
b0 <- mean(y) - b1 * mean(x)
c(b0, b1)
## [1] 33.8866  0.5141

Is this the same as the result from the lm function? Verify:

# Cross-check the hand-computed coefficients against lm().
coef(lm(formula = y ~ x))
## (Intercept)           x 
##     33.8866      0.5141

Outcome vs. predictor relationship (swap the roles: x is now the outcome and y the predictor)

# Same formulas with the roles swapped: x is the outcome, y the predictor.
b1 <- cor(y, x) * sd(x) / sd(y)
b0 <- mean(x) - b1 * mean(y)
c(b0, b1)
## [1] 34.1075  0.4889

Is this the same as the lm result for x regressed on y?

# Cross-check the swapped-role coefficients against lm().
coef(lm(formula = x ~ y))
## (Intercept)           y 
##     34.1075      0.4889

Regression through the origin (on centered data)

# Center both variables, then compute the slope of the regression through
# the origin on the centered data.
yc <- y - mean(y)
xc <- x - mean(x)
b1 <- sum(yc * xc) / sum(xc^2)
b1
## [1] 0.5141

Is this the same as the slope from lm on the uncentered data?

# Slope (second coefficient) of the ordinary fit on the uncentered data.
coef(lm(formula = y ~ x))[2]
##      x 
## 0.5141

Normalizing and Correlation

# Correlation between father and son heights. It is unaffected by centering
# or rescaling, so the centered variables can be used directly.
rho <- cor(xc, yc)
rho
## [1] 0.5013

# Standardize: the centered variables still have to be divided by their
# standard deviations before they are "normalized". Only on standardized
# data are the lines of slope rho and 1/rho drawn below the two regression
# lines.
xn <- xc / sd(xc)
yn <- yc / sd(yc)
plot(xn, yn, xlab = "Father (normalized)", ylab = "Son (normalized)")

# Identity line = perfect correlation.
abline(0, 1, col = "blue", lwd = 3)

# Father predicts son: if the father is tall, the son tends to be tall but
# not as tall as the father.
abline(0, rho, col = "green", lwd = 3)

# Son predicts father: if the son is tall, the father tends to be tall but
# not as tall as the son. Regressing father on son has slope rho, which
# appears as slope 1/rho with father on the horizontal axis.
abline(0, 1 / rho, col = "red", lwd = 3)

# No relationship.
abline(h = 0, col = "gray", lwd = 1)
abline(v = 0, col = "gray", lwd = 1)

plot of chunk unnamed-chunk-11