library(UsingR)
## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: replacing previous import by 'ggplot2::arrow' when loading 'Hmisc'
## Warning: replacing previous import by 'ggplot2::unit' when loading 'Hmisc'
## Warning: replacing previous import by 'scales::alpha' when loading 'Hmisc'
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
##
##
## Attaching package: 'UsingR'
##
## The following object is masked from 'package:survival':
##
## cancer
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:Hmisc':
##
## combine, src, summarize
##
## The following object is masked from 'package:MASS':
##
## select
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(UsingR)
library(manipulate)
library(reshape)
##
## Attaching package: 'reshape'
##
## The following object is masked from 'package:dplyr':
##
## rename
data(galton)
par(mfrow=c(1,2))
hist(galton$child,col="blue",breaks=100)
hist(galton$parent,col="blue",breaks=100)
##or the ggplot way
long <- melt(galton)
## Using as id variables
g <- ggplot(long, aes(x = value, fill = variable))
g <- g + geom_histogram(colour = "black", binwidth=1)
g <- g + facet_grid(. ~ variable)
g
hist(galton$child,col="blue",breaks=100)
myHist<- function(mu){
hist(galton$child,col="blue",breaks=100)
lines(c(mu,mu),c(0,150),col="red",lwd=5)
mse<- mean((galton$child-mu)^2)
text(63,150,paste("MSE",round(mse,2)))
}
##manipulate(myHist(mu),mu=slider(62,74,step=0.5))
hist(galton$child,col="blue",breaks=100)
meanChild <- mean(galton$child)
lines(rep(meanChild,100),seq(0,150,length=100),col="red",lwd=5)
sum of the squared deviations.
myHist <- function(mu){
mse <- mean((galton$child - mu)^2)
g<-ggplot(galton,aes(x=child))+
geom_histogram(fill="salmon",colour="black",binwidth=1)
g <- g + geom_vline(xintercept = mu, size = 3)
g <- g + ggtitle(paste("mu= ",mu,",MSE= ",round(mse,2),sep = ""))
g
}
#manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
myHist(mean(galton$child))
####old version minimise mu for childrens heights
myHist <- function(mu){
hist(galton$child,col="blue",breaks=100)
lines(c(mu, mu), c(0, 150),col="red",lwd=5)
mse <- mean((galton$child - mu)^2)
text(63, 150, paste("mu = ", mu))
text(63, 140, paste("MSE = ", round(mse, 2)))
}
#manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
plot(galton$parent,galton$child,pch=19,col="blue")
##better plot
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g+geom_point(colour="grey50",aes(size = freq+20,show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
g
## Regression through the origin * Suppose that \(X_i\) are the parents’ heights. * Consider picking the slope \(\beta\) that minimizes \[\sum_{i=1}^n (Y_i - X_i \beta)^2\] * This is exactly using the origin as a pivot point picking the line that minimizes the sum of the squared vertical distances of the points to the line * Use R studio’s manipulate function to experiment * Subtract the means so that the origin is the mean of the parent and children’s heights
y <- galton$child - mean(galton$child)
x <- galton$parent - mean(galton$parent)
freqData <- as.data.frame(table(x, y))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
myPlot <- function(beta){
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
g <- g + geom_abline(intercept = 0, slope = beta, size = 3)
mse <- mean( (y - beta * x) ^2 )
g <- g + ggtitle(paste("beta = ", beta, "mse = ", round(mse, 3)))
g
}
#manipulate(myPlot(beta), beta = slider(0.6, 1.2, step = 0.02))
Consider again the parent and child height data from Galton
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
g
beta1 <- cor(y, x) * sd(x) / sd(y)
beta0 <- mean(x) - beta1 * mean(y)
rbind(c(beta0, beta1), coef(lm(x ~ y)))
## (Intercept) y
## [1,] 46.13535 0.3256475
## [2,] 46.13535 0.3256475
yn <- (y - mean(y))/sd(y)
xn <- (x - mean(x))/sd(x)
c(cor(y, x), cor(yn, xn), coef(lm(yn ~ xn))[2])
## xn
## 0.4587624 0.4587624 0.4587624
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
g <- g + geom_smooth(method="lm", formula=y~x)
g
set.seed(1234)
plot(jitter(galton$parent,factor=2),jitter(galton$child,factor=2),pch=19,col="blue")
ggplot(galton, aes(x = parent, y = child)) + geom_point()
## Average parent = 65 inches tall
plot(galton$parent,galton$child,pch=19,col="blue")
near65 <- galton[abs(galton$parent - 65)<1, ]
points(near65$parent,near65$child,pch=19,col="red")
lines(seq(64,66,length=100),rep(mean(near65$child),100),col="red",lwd=4)
## Average parent = 71 inches tall
plot(galton$parent,galton$child,pch=19,col="blue")
near71 <- galton[abs(galton$parent - 71)<1, ]
points(near71$parent,near71$child,pch=19,col="red")
lines(seq(70,72,length=100),rep(mean(near71$child),100),col="red",lwd=4)
plot(galton$parent,galton$child,pch=19,col="blue")
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent,lm1$fitted,col="red",lwd=3)
plot(galton$parent,galton$child,pch=19,col="blue")
lines(galton$parent, 26 + 0.646*galton$parent)
y <- galton$child - mean(galton$child)
x <- galton$parent - mean(galton$parent)
freqData <- as.data.frame(table(x, y))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
myPlot <- function(beta){
g <- ggplot(filter(freqData,freq > 0),aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g +geom_point(colour="grey50",aes(size =freq+20,show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
g <- g + geom_abline(intercept = 0, slope = beta, size = 3)
mse <- mean( (y - beta * x) ^2 )
g <- g + ggtitle(paste("beta = ", beta, "mse = ", round(mse, 3)))
g
}
#manipulate(myPlot(beta),beta = slider(0.6, 1.2,step = 0.02))
If \(C_i\) is the height of child \(i\) and \(P_i\) is the height of the average parent, then we can imagine writing the equation for a line
\[C_i = b_0 + b_1 P_i\]
plot(galton$parent,galton$child,pch=19,col="blue")
lines(galton$parent,lm1$fitted,col="red",lwd=3)
# load father.son data
data(father.son)
# normalize son's height
y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight)
# normalize father's height
x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight)
# calculate correlation
rho <- cor(x, y)
# plot the relationship between the two
g = ggplot(data.frame(x = x, y = y), aes(x = x, y = y))
g = g + geom_point(size = 3, alpha = .2, colour = "black")
g = g + geom_point(size = 2, alpha = .2, colour = "red")
g = g + xlim(-4, 4) + ylim(-4, 4)
# reference line for perfect correlation between
# variables (data points will like on diagonal line)
g = g + geom_abline(intercept = 0, slope = 1)
#g = g + geom_abline(position = "identity")
# if there is no correlation between the two variables,
# the data points will lie on horizontal/vertical lines
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
# plot the actual correlation for both
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
# add appropriate labels
g = g + xlab("Father's height, normalized")
g = g + ylab("Son's height, normalized")
g = g + geom_text(x = 3.8, y = 1.6, label="x~y", angle = 25) +
geom_text(x = 3.2, y = 3.6, label="cor(y,x)=1", angle = 35) +
geom_text(x = 1.6, y = 3.8, label="y~x", angle = 60)
g
data(father.son)
y<-(father.son$sheight-mean(father.son$sheight))/sd(father.son$sheight)
x<-(father.son$fheight-mean(father.son$fheight))/sd(father.son$fheight)
rho <- cor(x, y)
g = ggplot(data.frame(x = x, y = y), aes(x = x, y = y))
g = g + geom_point(size = 6, colour = "black", alpha = 0.2)
g = g + geom_point(size = 4, colour = "salmon", alpha = 0.2)
g = g + xlim(-4, 4) + ylim(-4, 4)
g = g + geom_abline(intercept = 0, slope = 1)
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
g
g = ggplot(data.frame(x, y), aes(x = x, y = y))
g = g + geom_point(size = 5, alpha = .2, colour = "black")
g = g + geom_point(size = 4, alpha = .2, colour = "red")
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
g = g + geom_abline(intercept = 0, slope = 1)
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
g = g + xlab("Father's height, normalized")
g = g + ylab("Son's height, normalized")
g
library(UsingR)
data(father.son)
y<-(father.son$sheight-mean(father.son$sheight))/sd(father.son$sheight)
x<-(father.son$fheight-mean(father.son$fheight))/sd(father.son$fheight)
rho <- cor(x, y)
myPlot <- function(x, y) {
plot(x, y,
xlab = "Father's height, normalized",
ylab = "Son's height, normalized",
xlim = c(-3, 3), ylim = c(-3, 3),
bg = "lightblue", col = "black", cex = 1.1, pch = 21,
frame = FALSE)
}
## Plot the data, code
myPlot(x, y)
abline(0, 1) # if there were perfect correlation
abline(0, rho, lwd = 2) # father predicts son
abline(0, 1 / rho, lwd = 2) # son predicts father, son on vertical axis
abline(h = 0); abline(v = 0) # reference lines for no relathionship
If \(C_i\) is the height of child \(i\) and \(P_i\) is the height of the average parent, then we can imagine writing the equation for a line
\[C_i = b_0 + b_1 P_i + e_i\]
\(e_i\) is everything we didn t measure (how much they eat, where they live, do they stretch in the morning…)
If \(C_i\) is the height of child \(i\) and \(P_i\) is the height of the average parent, pick the line that makes the child values \(C_i\) and our guesses
\[ \sum_{i=1}^{928}(C_i - \{b_0 + b_1 P_i\})^2 \]
par(mfrow=c(1,2))
plot(galton$parent,galton$child,pch=19,col="blue")
lines(galton$parent,lm1$fitted,col="red",lwd=3)
plot(galton$parent,lm1$residuals,col="blue",pch=19)
abline(c(0,0),col="red",lwd=3)
## The least squares est. is the empirical mean
library(manipulate)
myHist <- function(mu){
mse <- mean((galton$child - mu)^2)
g <- ggplot(galton, aes(x = child)) +
geom_histogram(fill = "salmon", colour = "black", binwidth=1)
g <- g + geom_vline(xintercept = mu, size = 3)
g <- g + ggtitle(paste("mu = ", mu, ", MSE = ", round(mse, 2), sep = ""))
g
}
#manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
####The solution
lm(I(child - mean(child))~ I(parent - mean(parent)) - 1, data = galton)
##
## Call:
## lm(formula = I(child - mean(child)) ~ I(parent - mean(parent)) -
## 1, data = galton)
##
## Coefficients:
## I(parent - mean(parent))
## 0.6463
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
lm1 <- lm(galton$child ~ galton$parent)
g <- g + geom_abline(intercept = coef(lm1)[1], slope = coef(lm1)[2], size = 3, colour = grey(.5))
g
data(father.son)
y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight)
x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight)
rho <- cor(x, y)
myPlot <- function(x, y) {
plot(x, y,
xlab = "Father's height, normalized",
ylab = "Son's height, normalized",
xlim = c(-3, 3), ylim = c(-3, 3),
bg ="lightblue", col = "black", cex =1.1, pch = 21,
frame = FALSE)
}
myPlot(x, y)
abline(0, 1) # if there were perfect correlation
abline(0, rho, lwd =2) # father predicts son
abline(0, 1 / rho, lwd = 2) # son predicts father, son on vertical axis
abline(h = 0); abline(v = 0) # reference lines for no relathionship
### Dalton’s Investigation on Regression to the Mean ## Basic regression model with additive Gaussian errors. ## Plot of the data
fit2 <- lm(price ~ I(carat - mean(carat)), data = diamond)
coef(fit2)
## (Intercept) I(carat - mean(carat))
## 500.0833 3721.0249
Thus $500.1 is the expected price for the average sized diamond of the data (0.2041667 carats).
newx <- c(0.16, 0.27, 0.34)
coef(fit)[1] + coef(fit)[2] * newx
## [1] 335.7381 745.0508 1005.5225
predict(fit, newdata = data.frame(carat = newx))
## 1 2 3
## 335.7381 745.0508 1005.5225
Predicted values at the observed Xs (red) and at the new Xs (lines)