1:
library(UsingR)
library(reshape)
library(manipulate)
data("galton")
str(galton)
## 'data.frame': 928 obs. of 2 variables:
## $ child : num 61.7 61.7 61.7 61.7 61.7 62.2 62.2 62.2 62.2 62.2 ...
## $ parent: num 70.5 68.5 65.5 64.5 64 67.5 67.5 67.5 66.5 66.5 ...
## child parent
## 1 61.7 70.5
## 2 61.7 68.5
## 3 61.7 65.5
## 4 61.7 64.5
## 5 61.7 64.0
## 6 62.2 67.5


# We are unable to compare two variable(child & parent) one-by-one
# So, we need to melt the data set,
long <- melt(galton)
str(long)
## 'data.frame': 1856 obs. of 2 variables:
## $ variable: Factor w/ 2 levels "child","parent": 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 61.7 61.7 61.7 61.7 61.7 62.2 62.2 62.2 62.2 62.2 ...
## variable value
## 1 child 61.7
## 2 child 61.7
## 3 child 61.7
## 4 child 61.7
## 5 child 61.7
## 6 child 62.2
## [1] 928 2
## [1] 1856 2
# Plotting
g <- ggplot(long, aes(x = value, fill = variable))
g <- g + geom_histogram(colour = "black", binwidth = 1)
g <- g + facet_grid(. ~ variable)
g

# Scatter plot
library(dplyr)
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
g

# Model fitting
library(manipulate)
y <- galton$child - mean(galton$child)
x <- galton$parent - mean(galton$parent)
freqData <- as.data.frame(table(x, y))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
myPlot <- function(beta){
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
g <- g + geom_abline(intercept = 0, slope = beta, size = 3)
mse <- mean( (y - beta * x) ^2 )
g <- g + ggtitle(paste("beta = ", beta, "mse = ", round(mse, 3)))
g
}
lm(I(child - mean(child)) ~ I(parent - mean(parent)) - 1, data = galton)
##
## Call:
## lm(formula = I(child - mean(child)) ~ I(parent - mean(parent)) -
## 1, data = galton)
##
## Coefficients:
## I(parent - mean(parent))
## 0.6463
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
lm1 <- lm(galton$child ~ galton$parent)
g <- g + geom_abline(intercept = coef(lm1)[1], slope = coef(lm1)[2], size = 3, colour = grey(.5))
g

lmd1 <- lm(child ~ parent, data = galton)
par(mfrow = c(2, 2))
plot(lmd1)

##
## Call:
## lm(formula = child ~ parent, data = galton)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8050 -1.3661 0.0487 1.6339 5.9264
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.94153 2.81088 8.517 <2e-16 ***
## parent 0.64629 0.04114 15.711 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.239 on 926 degrees of freedom
## Multiple R-squared: 0.2105, Adjusted R-squared: 0.2096
## F-statistic: 246.8 on 1 and 926 DF, p-value: < 2.2e-16
# Ordering
x <- rnorm(100)
y <- rnorm(100)
odr <- order(x) # Ordering data
x[odr[100]] # Maximum
## [1] 2.71211
## [1] -2.175858
## [1] -1.265532
## [1] -0.8484868
# RTTM
library(UsingR)
data(father.son)
y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight)
x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight)
rho <- cor(x, y)
library(ggplot2)
g <- ggplot(data.frame(x = x, y = y), aes(x = x, y = y))
g <- g + geom_point(size = 6, colour = "black", alpha = 0.2)
g <- g + geom_point(size = 4, colour = "salmon", alpha = 0.2)
g <- g + xlim(-4, 4) + ylim(-4, 4)
g <- g + geom_abline(intercept = 0, slope = 1)
g <- g + geom_vline(xintercept = 0)
g <- g + geom_hline(yintercept = 0)
g <- g + geom_abline(intercept = 0, slope = rho, size = 2)
g <- g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
g

g <- ggplot(data.frame(x, y), aes(x = x, y = y))
g <- g + geom_point(size = 5, alpha = .2, colour = "black")
g <- g + geom_point(size = 4, alpha = .2, colour = "red")
g <- g + geom_vline(xintercept = 0)
g <- g + geom_hline(yintercept = 0)
g <- g + geom_abline(position = "identity")
g <- g + geom_abline(intercept = 0, slope = rho, size = 2)
g <- g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
g <- g + xlab("Father's height, normalized")
g <- g + ylab("Son's height, normalized")
g

# Middle (MSE)
myHist <- function(mu){
g <- ggplot(galton, aes(x = child))
g <- g + geom_histogram(fill = "green",
binwidth = 1,
aes(y = ..density..),
color = "black")
g <- g + geom_density(size = 2)
g <- g + geom_vline(xintercept = mu, size = 2)
mse <- round(mean((galton$child - mu)^2), 3)
g <- g + labs(title = paste('mu = ', mu, ' MSE = ', mse))
g
}
# Manipulate work or not ?
# manipulate(
# {
# if(resetSeed)
# set.seed(sample(1:1000))
#
# hist(rnorm(n=100, mean=0, sd=3), breaks=bins)
# },
# bins = slider(1, 20, step=1, initial =5, label="Bins"),
# resetSeed = button("Reset Seed")
#)
# manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
# manipulate(myPlot(beta), beta = slider(0.6, 1.2, step = 0.02))
2:
library(UsingR)
data("diamond")
str(diamond)
## 'data.frame': 48 obs. of 2 variables:
## $ carat: num 0.17 0.16 0.17 0.18 0.25 0.16 0.15 0.19 0.21 0.15 ...
## $ price: int 355 328 350 325 642 342 322 485 483 323 ...
## carat price
## 1 0.17 355
## 2 0.16 328
## 3 0.17 350
## 4 0.18 325
## 5 0.25 642
## 6 0.16 342
g <- ggplot(diamond, aes(x = carat, y = price))
g <- g + geom_point(size = 5, col = "black", alpha = .2)
g <- g + geom_smooth(method = "lm", col = "red")
g

# Fitting
fit1 <- lm(price ~ carat, diamond)
summary(fit1)
##
## Call:
## lm(formula = price ~ carat, data = diamond)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.159 -21.448 -0.869 18.972 79.370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.63 17.32 -14.99 <2e-16 ***
## carat 3721.02 81.79 45.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.84 on 46 degrees of freedom
## Multiple R-squared: 0.9783, Adjusted R-squared: 0.9778
## F-statistic: 2070 on 1 and 46 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(fit1)

# We estimate an expect 3721.02 (SIN $) increase in price
# for every carat increase in mass of diamond
fit2 <- lm(price ~ I(carat - mean(carat)), data = diamond)
summary(fit2)
##
## Call:
## lm(formula = price ~ I(carat - mean(carat)), data = diamond)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.159 -21.448 -0.869 18.972 79.370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 500.083 4.596 108.8 <2e-16 ***
## I(carat - mean(carat)) 3721.025 81.786 45.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.84 on 46 degrees of freedom
## Multiple R-squared: 0.9783, Adjusted R-squared: 0.9778
## F-statistic: 2070 on 1 and 46 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(fit2)

fit3 <- lm(price ~ I(carat * 10), data = diamond)
summary(fit3)
##
## Call:
## lm(formula = price ~ I(carat * 10), data = diamond)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.159 -21.448 -0.869 18.972 79.370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.626 17.319 -14.99 <2e-16 ***
## I(carat * 10) 372.102 8.179 45.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.84 on 46 degrees of freedom
## Multiple R-squared: 0.9783, Adjusted R-squared: 0.9778
## F-statistic: 2070 on 1 and 46 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(fit3)

# Predict
predict(fit1, newdata = data.frame(carat = c(0.16, 0.27, 0.34)))
## 1 2 3
## 335.7381 745.0508 1005.5225
predict(fit2, newdata = data.frame(carat = c(0.16, 0.27, 0.34)))
## 1 2 3
## 140.3843 549.6970 810.1687
predict(fit3, newdata = data.frame(carat = c(0.16, 0.27, 0.34)))
## 1 2 3
## 335.7381 745.0508 1005.5225
# Residuals
data("diamond")
y <- diamond$price
x <- diamond$carat
fit <- lm(y ~ x)
e <- resid(fit)
yhat <- predict(fit)
max(abs(e - (y - yhat)))
## [1] 9.485746e-13
plot(x = x, y = y)
abline(fit, lwd = 2, col = "red")
for(i in 1:length(y)){
lines(c(x[i], x[i]), c(y[i], yhat[i]), lwd = 2, col = "blue")
}
plot(x, e)
abline(h = 0, col = "red")
for(i in 1:length(y)){
lines(c(x[i], x[i]), c(e[i], 0), lwd = 2, col = "blue")
}

3:
# Estimation
n = 100; x = rnorm(n); x2 = rnorm(n); x3 = rnorm(n)
y = 1 + x + x2 + x3 + rnorm(n, sd = .1)
ey = resid(lm(y ~ x2 + x3))
ex = resid(lm(x ~ x2 + x3))
sum(ey * ex) / sum(ex ^ 2)
## [1] 0.9867069
## ex
## 0.9867069
coef(lm(y ~ x + x2 + x3))
## (Intercept) x x2 x3
## 1.0080901 0.9867069 1.0051380 0.9993590
## Example
library(datasets)
data(swiss)
head(swiss)
## Fertility Agriculture Examination Education Catholic
## Courtelary 80.2 17.0 15 12 9.96
## Delemont 83.1 45.1 6 9 84.84
## Franches-Mnt 92.5 39.7 5 5 93.40
## Moutier 85.8 36.5 12 7 33.77
## Neuveville 76.9 43.5 17 15 5.16
## Porrentruy 76.1 35.3 9 7 90.57
## Infant.Mortality
## Courtelary 22.2
## Delemont 22.2
## Franches-Mnt 20.2
## Moutier 20.3
## Neuveville 20.6
## Porrentruy 26.6
## 'data.frame': 47 obs. of 6 variables:
## $ Fertility : num 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
## $ Agriculture : num 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
## $ Examination : int 15 6 5 12 17 9 16 14 12 16 ...
## $ Education : int 12 9 5 7 15 7 7 8 7 13 ...
## $ Catholic : num 9.96 84.84 93.4 33.77 5.16 ...
## $ Infant.Mortality: num 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
library(GGally)
library(ggplot2)
g <- ggpairs(data = swiss, lower = list(continuous = "smooth")) # pairs()
g

# Fitting
summary(lm(Fertility ~ . , data = swiss))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.9151817 10.70603759 6.250229 1.906051e-07
## Agriculture -0.1721140 0.07030392 -2.448142 1.872715e-02
## Examination -0.2580082 0.25387820 -1.016268 3.154617e-01
## Education -0.8709401 0.18302860 -4.758492 2.430605e-05
## Catholic 0.1041153 0.03525785 2.952969 5.190079e-03
## Infant.Mortality 1.0770481 0.38171965 2.821568 7.335715e-03
summary(lm(Fertility ~ Agriculture, data = swiss))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.3043752 4.25125562 14.185074 3.216304e-18
## Agriculture 0.1942017 0.07671176 2.531577 1.491720e-02
# Try a simulation
n <- 100; x2 <- 1 : n; x1 <- .01 * x2 + runif(n, -.1, .1); y = -x1 + x2 + rnorm(n, sd = .01)
summary(lm(y ~ x1))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.80534 1.101669 1.638731 1.044759e-01
## x1 94.67050 1.870753 50.605547 4.706694e-72
summary(lm(y ~ x1 + x2))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.193221e-05 0.0016687774 0.0251275 9.800049e-01
## x1 -9.856695e-01 0.0147084902 -67.0136419 4.813523e-83
## x2 9.998259e-01 0.0001509347 6624.2293238 4.127510e-276
dat = data.frame(y = y, x1 = x1, x2 = x2, ey = resid(lm(y ~ x2)), ex1 = resid(lm(x1 ~ x2)))
library(ggplot2)
g = ggplot(dat, aes(y = y, x = x1, colour = x2))
g = g + geom_point(colour="grey50", size = 5) + geom_smooth(method = lm, se = FALSE, colour = "black")
g = g + geom_point(size = 4)
g

g2 = ggplot(dat, aes(y = ey, x = ex1, colour = x2))
g2 = g2 + geom_point(colour="grey50", size = 5) + geom_smooth(method = lm, se = FALSE, colour = "black") + geom_point(size = 4)
g2

z <- swiss$Agriculture + swiss$Education
lm(Fertility ~ . + z, data = swiss)
##
## Call:
## lm(formula = Fertility ~ . + z, data = swiss)
##
## Coefficients:
## (Intercept) Agriculture Examination Education
## 66.9152 -0.1721 -0.2580 -0.8709
## Catholic Infant.Mortality z
## 0.1041 1.0770 NA
require(datasets);data(InsectSprays); require(stats); require(ggplot2)
g = ggplot(data = InsectSprays, aes(y = count, x = spray, fill = spray))
g = g + geom_violin(colour = "black", size = 2)
g = g + xlab("Type of spray") + ylab("Insect count")
g

summary(lm(count ~ spray, data = InsectSprays))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.5000000 1.132156 12.8074279 1.470512e-19
## sprayB 0.8333333 1.601110 0.5204724 6.044761e-01
## sprayC -12.4166667 1.601110 -7.7550382 7.266893e-11
## sprayD -9.5833333 1.601110 -5.9854322 9.816910e-08
## sprayE -11.0000000 1.601110 -6.8702352 2.753922e-09
## sprayF 2.1666667 1.601110 1.3532281 1.805998e-01
summary(lm(count ~
I(1 * (spray == 'B')) + I(1 * (spray == 'C')) +
I(1 * (spray == 'D')) + I(1 * (spray == 'E')) +
I(1 * (spray == 'F')) + I(1 * (spray == 'A')), data = InsectSprays))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.5000000 1.132156 12.8074279 1.470512e-19
## I(1 * (spray == "B")) 0.8333333 1.601110 0.5204724 6.044761e-01
## I(1 * (spray == "C")) -12.4166667 1.601110 -7.7550382 7.266893e-11
## I(1 * (spray == "D")) -9.5833333 1.601110 -5.9854322 9.816910e-08
## I(1 * (spray == "E")) -11.0000000 1.601110 -6.8702352 2.753922e-09
## I(1 * (spray == "F")) 2.1666667 1.601110 1.3532281 1.805998e-01
summary(lm(count ~ spray - 1, data = InsectSprays))$coef
## Estimate Std. Error t value Pr(>|t|)
## sprayA 14.500000 1.132156 12.807428 1.470512e-19
## sprayB 15.333333 1.132156 13.543487 1.001994e-20
## sprayC 2.083333 1.132156 1.840148 7.024334e-02
## sprayD 4.916667 1.132156 4.342749 4.953047e-05
## sprayE 3.500000 1.132156 3.091448 2.916794e-03
## sprayF 16.666667 1.132156 14.721181 1.573471e-22
library(dplyr)
summarise(group_by(InsectSprays, spray), mn = mean(count))
## # A tibble: 6 x 2
## spray mn
## <fct> <dbl>
## 1 A 14.5
## 2 B 15.3
## 3 C 2.08
## 4 D 4.92
## 5 E 3.5
## 6 F 16.7
spray2 <- relevel(InsectSprays$spray, "C")
summary(lm(count ~ spray2, data = InsectSprays))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.083333 1.132156 1.840148 7.024334e-02
## spray2A 12.416667 1.601110 7.755038 7.266893e-11
## spray2B 13.250000 1.601110 8.275511 8.509776e-12
## spray2D 2.833333 1.601110 1.769606 8.141205e-02
## spray2E 1.416667 1.601110 0.884803 3.794750e-01
## spray2F 14.583333 1.601110 9.108266 2.794343e-13
swiss = mutate(swiss, CatholicBin = 1 * (Catholic > 50))
g = ggplot(swiss, aes(x = Agriculture, y = Fertility, colour = factor(CatholicBin)))
g = g + geom_point(size = 6, colour = "black") + geom_point(size = 4)
g = g + xlab("% in Agriculture") + ylab("Fertility")
g

fit = lm(Fertility ~ Agriculture, data = swiss)
g1 = g
g1 = g1 + geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2], size = 2)
g1

summary(lm(Fertility ~ Agriculture + factor(CatholicBin), data = swiss))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.8322366 4.1058630 14.815944 1.032493e-18
## Agriculture 0.1241776 0.0810977 1.531210 1.328763e-01
## factor(CatholicBin)1 7.8843292 3.7483622 2.103406 4.118221e-02
fit = lm(Fertility ~ Agriculture + factor(CatholicBin), data = swiss)
g1 = g
g1 = g1 + geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2], size = 2)
g1 = g1 + geom_abline(intercept = coef(fit)[1] + coef(fit)[3], slope = coef(fit)[2], size = 2)
g1

fit = lm(Fertility ~ Agriculture * factor(CatholicBin), data = swiss)
g1 = g
g1 = g1 + geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2], size = 2)
g1 = g1 + geom_abline(intercept = coef(fit)[1] + coef(fit)[3],
slope = coef(fit)[2] + coef(fit)[4], size = 2)
g1

## Adjustment
# Simulation 1
n <- 100
t <- rep(c(0, 1), c(n/2, n/2))
x <- c(runif(n/2), runif(n/2))
beta0 <- 0
beta1 <- 2
tau <- 1
sigma <- .2
y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma)
plot(x, y, type = "n", frame = FALSE)
abline(lm(y ~ x), lwd = 2)
abline(h = mean(y[1 : (n/2)]), lwd = 3)
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3)
fit <- lm(y ~ x + t)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2)

# Simulation 2
n <- 100
t <- rep(c(0, 1), c(n/2, n/2))
x <- c(runif(n/2), 1.5 + runif(n/2))
beta0 <- 0
beta1 <- 2
tau <- 0
sigma <- .2
y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma)
plot(x, y, type = "n", frame = FALSE)
abline(lm(y ~ x), lwd = 2)
abline(h = mean(y[1 : (n/2)]), lwd = 3)
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3)
fit <- lm(y ~ x + t)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2)
