#install.packages('car')
#install.packages('corrplot')
#install.packages('MPV')
library(car)
## Loading required package: carData
library(corrplot)
## corrplot 0.92 loaded
library(MPV)
## Loading required package: lattice
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
prestige <- data.frame(Prestige[,4],Prestige[,-c(4,6)])
colnames(prestige)[1] <- 'prestige'
plotlm <- function(lm.mod)
{
windows();
quartz();
par(mfrow=c(2,2))
plot(lm.mod)
}
Exercise 1 1.
help(Prestige)
## starting httpd help server ... done
pairs(prestige, panel = panel.smooth, main = 'Prestige Data Set', col=3)
#The 'education' is the strongest predictors of the response since it is almost a straight line. There are some noise variables, such as 'income', 'women', and 'census'.
corrplot(cor(prestige))
#The darker the blue the higher the correlation, the darker the red the higher the negative correlation.
lm.education <- lm(prestige~education, data = prestige)
summary(lm.education)
##
## Call:
## lm(formula = prestige ~ education, data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.0397 -6.5228 0.6611 6.7430 18.1636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.732 3.677 -2.919 0.00434 **
## education 5.361 0.332 16.148 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.103 on 100 degrees of freedom
## Multiple R-squared: 0.7228, Adjusted R-squared: 0.72
## F-statistic: 260.8 on 1 and 100 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm.education)
The number of the data in this dataset is 102. The M-education is significant and it needs an intercept. Residuals are randomly scattered around zero without any clear trends. Since in the Normal Q-Q plot, it is almost a straight line, the residuals are normal distribution. The residuals are independent. In the Residual vs Leverage chart, it is homoscedastic. So, the model is fit.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(car)
prestige <- data.frame(Prestige[,4], Prestige[,-c(4,6)])
colnames(prestige)[1] <- 'prestige'
lm.mod <- lm(prestige~income, data=prestige)
xxnew <- data.frame(income=prestige$income)
xnew <- prestige$income
ynew <- prestige$prestige
pred.band <- predict(lm.mod, xxnew, interval="prediction")
conf.band <- predict(lm.mod, xxnew, interval="confidence")
datadone <- data.frame(xnew,ynew, conf.band[,1], conf.band[,2], conf.band[,3], pred.band[,2], pred.band[,3])
colnames(datadone) <- c('x', 'y', 'yhat', 'lc', 'uc', 'lp','up')
p <- ggplot(datadone, aes(x, y)) +
geom_point() +
geom_line(color='black', aes(x=x, y=yhat))
p + geom_line(aes(y = lc), color = "blue", linetype = "dotted")+
geom_line(aes(y = uc), color = "blue", linetype = "dotted") +
geom_line(aes(y = lp), color = "red", linetype = "dashed")+
geom_line(aes(y = up), color = "red", linetype = "dashed")
lm.mod <- lm(prestige~women, data=prestige)
xxnew <- data.frame(women=prestige$women)
xnew <- prestige$women
ynew <- prestige$prestige
pred.band <- predict(lm.mod, xxnew, interval="prediction")
conf.band <- predict(lm.mod, xxnew, interval="confidence")
datadone <- data.frame(xnew,ynew, conf.band[,1], conf.band[,2], conf.band[,3], pred.band[,2], pred.band[,3])
colnames(datadone) <- c('x', 'y', 'yhat', 'lc', 'uc', 'lp','up')
p <- ggplot(datadone, aes(x, y)) +
geom_point() +
geom_line(color='black', aes(x=x, y=yhat))
p + geom_line(aes(y = lc), color = "blue", linetype = "dotted")+
geom_line(aes(y = uc), color = "blue", linetype = "dotted") +
geom_line(aes(y = lp), color = "red", linetype = "dashed")+
geom_line(aes(y = up), color = "red", linetype = "dashed")
The confidence bands and the prediction bands of ‘income’ is smaller
than the confidence bands and the prediction bands of ‘women’. But
‘income’ has some outliers and its range greater than the range of
‘women’.
beta.0.hat <- coefficients(lm.education)[1]
beta.1.hat <- coefficients(lm.education)[2]
ese.beta_1_hat <- summary(lm.education)$coefficients[2,2]
n <- nrow(prestige)
p <- ncol(prestige)-1
df <- n-2
beta_1_0 <- 6
t.beta_1_hat <- (beta.1.hat - beta_1_0)/ese.beta_1_hat
alpha <- 0.1
Pvalue <- pt(t.beta_1_hat, df=df)
decision <- ifelse(Pvalue < alpha, 'Reject H_o', 'Do not reject H_o')
cat('The Pvalue in this case is', Pvalue, '. Therefore', decision, '\n\n')
## The Pvalue in this case is 0.02852685 . Therefore Reject H_o
decision
## education
## "Reject H_o"
confint(lm.education, level = 0.9) #confidence interval
## 5 % 95 %
## (Intercept) -16.83681 -4.627154
## education 4.80970 5.912056
mu.hat.y.x <- predict(lm.education, newdata = data.frame(education=c(10)))
mu.hat.y.x
## 1
## 42.8768
conf.mu.hat.y.x <- predict(lm.education, newdata = data.frame(education=c(10)), interval = "prediction")
conf.mu.hat.y.x
## fit lwr upr
## 1 42.8768 24.7213 61.03229
Exercise 2 1.
library(MPV)
help(cement)
#The cement data frame has 13 rows and 5 columns. They are all numeric vectors. There is no strong correlation between explanatory variable(x) and response variable(y) in this data.
pairs(cement, panel = panel.smooth, col=3)
library(corrplot)
corrplot(cor(cement))
x_1 and x_2 are positively correlated with y, and x_2 has a stronger
correlation with y. x_3 and x_4 are negatively correlated with y, and
x_4 has a stronger negative correlation.
boxplot(cement)
x_1, x_3 have a small range, and x_1 is mostly concentrated in the
middle of the range. x_2 mostly in the upper part of the range, and x_3,
x_4 are mostly in the lower part of the range.
summary(lm(y~., data=cement))
##
## Call:
## lm(formula = y ~ ., data = cement)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1750 -1.6709 0.2508 1.3783 3.9254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.4054 70.0710 0.891 0.3991
## x1 1.5511 0.7448 2.083 0.0708 .
## x2 0.5102 0.7238 0.705 0.5009
## x3 0.1019 0.7547 0.135 0.8959
## x4 -0.1441 0.7091 -0.203 0.8441
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.446 on 8 degrees of freedom
## Multiple R-squared: 0.9824, Adjusted R-squared: 0.9736
## F-statistic: 111.5 on 4 and 8 DF, p-value: 4.756e-07
The number of the data is 10. x_1 is significant. x_2, x_3, x_4, and intercept are not significant. Since the p-value is too small, reject H_o.
plotlm <- function(lm.mod)
{
op <- par(mfrow=c(2,2))
plot(lm.mod)
par(op)
}
plotlm(lm(y~., data=cement))
Residuals are randomly scattered around zero without any clear trends.
Since in the Normal Q-Q plot, it is almost a straight line, the
residuals are normal distribution. The residuals are independent. In the
Residual vs Leverage chart, it is fanning in. So, the model is fit.
shapiro.test(resid(lm(y~., data=cement)))
##
## Shapiro-Wilk normality test
##
## data: resid(lm(y ~ ., data = cement))
## W = 0.96967, p-value = 0.8903
H_0: the residuals are normally distribution H_1: the residuals are not normally distribution
w = 0.96967, it is close to 1, the data is a normal distribution. p-value = 0.8903, cannot reject H_o, we have sufficient evidence that the residuals are normally distribution.
Exercise 3 1. (a) #Since (x-x_bar)^2 is always positive, when (x-x_bar)^2 = 0 we can get the lowest leverage equals to 1/n. The lowest leverage can be obtained at the point (x_bar, y_bar).
#Sxx = (n-1)Sx^2, Sx is the standard deviation #x-x_bar = +-3Sx #leverage(x) = 1/n + (x* - x_bar)^2/Sxx = 1/n + (+-3Sx)2/((n-1)Sx2) = 1/n + (9 Sx2)/((n-1)Sx2) = 1/n + 9/(n-1) = 10/n
#Leverage is highest when (x-x_bar)^2 is highest. The highest leverage occurs when the x has the largest distance from x_bar.
#z score = (-2.33-0)/1 = -2.33
p_sresidual_less_than_minus_2.33 <- pnorm(-2.33)
p_sresidual_less_than_minus_2.33
## [1] 0.009903076