library("alr4")
## Loading required package: car
## Loading required package: carData
## Loading required package: effects
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
attach(UN11)
plot(fertility~ppgdp, xlab = 'Per Capita GDP', ylab = 'Fertility Rate')
The trend is not linear.
plot(log(fertility)~log(ppgdp), xlab = 'log(ppgdp)', ylab = 'log(fertility)')
A simple linear regression model is plausible for the summary of this graph given that it shows a straight line pattern.
xbar=mean(log(ppgdp))
ybar=mean(log(fertility))
#Sample means
c(xbar,ybar)
## [1] 8.4637943 0.9122342
SXX=sum((log(ppgdp)-log(xbar))^2)
SYY=sum((log(fertility)-log(ybar)^2))
SXY=sum((log(ppgdp)-xbar)*log(fertility))
#Sums of squares
c(SXX,SYY,SXY)
## [1] 8449.13984 179.85544 -99.53026
b1 = SXY/SXX
b0 = ybar - b1*xbar
# Intercept and Slope
c(Intercept = b0, Slope = b1)
## Intercept Slope
## 1.01193704 -0.01177993
plot(log(fertility)~log(ppgdp), xlab = 'log(ppgdp)', ylab = 'log(fertility)')
abline(b0, b1, col = 'red')
detach(UN11)
library(faraway)
##
## Attaching package: 'faraway'
## The following objects are masked from 'package:alr4':
##
## cathedral, pipeline, twins
## The following objects are masked from 'package:car':
##
## logit, vif
attach(prostate)
plot(lcavol~lpsa, xlab = 'log prostate specific antigen', ylab = 'log cancer volume')
A simple linear regression model seems reasonable because it satisfies linearity, independence, normality, and equal variances.
xbar=mean(lpsa)
ybar=mean(lcavol)
#Sample means
c(xbar,ybar)
## [1] 2.478387 1.350010
SXX=sum((lpsa-xbar)^2)
SYY=sum((lcavol-ybar)^2)
SXY=sum((lpsa-xbar)*lcavol)
#Sums of squares
c(SXX,SYY,SXY)
## [1] 127.91758 133.35903 95.92784
b1 = SXY/SXX
b0 = ybar - b1*xbar
# Intercept and Slope
c(Intercept = b0, Slope = b1)
## Intercept Slope
## -0.5085802 0.7499191
plot(lcavol~lpsa, xlab = 'log prostate specific antigen', ylab = 'log cancer volume')
abline(b0, b1, col = 'blue')
n = length(lpsa)
SSE = SYY - SXY^2/SXX
sighat_sq = SSE/(n - 2)
# Variance Estimate
sighat_sq
## [1] 0.646536
sighat = sqrt(sighat_sq)
se_b0 = sighat*sqrt(1/n + xbar^2/SXX)
se_b1 = sighat/sqrt(SXX)
# Standard Errors
c(SE_b0 = se_b0, SE_b1 = se_b1)
## SE_b0 SE_b1
## 0.19419311 0.07109372
The estimate of σˆ2= 0.6465 and the estimated standard errors of βˆ0 and βˆ1 are 0.1941 and 0.0710, respectively.
cov_b0_b1 = -sighat_sq*xbar/SXX
cov_b0_b1
## [1] -0.01252655
The estimated covariance between βˆ0 and βˆ1 is -0.0125.
t_b1 = b1/se_b1
t_b0 = b0/se_b0
p_b0 = 2*pt(abs(t_b0), n - 2, lower.tail = F)
p_b1 = 2*pt(abs(t_b1), n - 2, lower.tail = F)
c(p_b0, p_b1)
## [1] 1.026687e-02 1.118616e-17
The p-values for the intercept and slope are .01027 and 1.12e-17, respectively, so null hypothesis is rejected at level 0.05.
prostate.lm <- lm(lcavol~lpsa)
summary(prostate.lm)
##
## Call:
## lm(formula = lcavol ~ lpsa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.15948 -0.59383 0.05034 0.50826 1.67751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.50858 0.19419 -2.619 0.0103 *
## lpsa 0.74992 0.07109 10.548 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8041 on 95 degrees of freedom
## Multiple R-squared: 0.5394, Adjusted R-squared: 0.5346
## F-statistic: 111.3 on 1 and 95 DF, p-value: < 2.2e-16
detach(prostate)
library('alr4')
data(wblake)
Y <- wblake$Length
x <- wblake$Age
Ybar = mean(Y)
xbar = mean(x)
Sxy = sum( (x - xbar)*(Y - Ybar) )
Sxx = sum( (x - xbar)**2 )
b1 = Sxy / Sxx #this is our slope estimate
b0 = Ybar - b1*xbar #this is our intercept estimate
c(b1,b0)
## [1] 30.32389 65.52716
The slope and intercept estimates are -0.012 and 1.011, respectively.
Yhat = b0 + b1*x #this vector contains the estimated values of Y from our model
SSE = sum( (Y - Yhat)**2 )
SSR = sum( (Yhat - Ybar)**2 )
SSTO = SSE + SSR
n = length(Y)
MSE = SSE / (n-2) #MSE is our mean square error need to evaluate standard error of b0 and
The values of SSE, SSR, SSTO, and MSE are 358595.53, 1595359.09, 1953954.62, and 820.58 respectively.
s2 = SSTO / (n-1)
R2 = SSR / SSTO #coefficient of determination
se_b1 = sqrt( MSE / Sxx ) #standard error of slope estimate b1
se_b0 = sqrt( MSE*( (1/n) + ((xbar^2)/Sxx) ) ) #standard error of intercept estimate b0
c(s2, se_b1, se_b0)
## [1] 4461.0835960 0.6877291 3.1973881
The estimate of variance, standard error of slope b1, and standard error of intercept b0 are 4461.084, 0.6877, 3.197, respectively. So, for every unit increase of age, small mouth bass will increase 3.197 unites in length.
a = 1 - 0.90
t_score = qt(1 - a/2, n-2)
upper_bound = b1 + t_score * se_b1
lower_bound = b1 - t_score * se_b1
c(lower_bound,upper_bound)
## [1] 29.19027 31.45751
We are 90% confident that the true slope which relates the age and length of small mouth bass is between -1.1454 and 1.1218.
Y_h = b0 + b1*(1)
lwr_bound = Y_h - t_score * sqrt(MSE) * sqrt( 1 + (1/n) + ((1 - xbar)**2)/Sxx )
upr_bound = Y_h + t_score * sqrt(MSE) * sqrt( 1 + (1/n) + ((1 - xbar)**2)/Sxx )
c(lwr_bound,upr_bound)
## [1] 48.43975 143.26235
Our prediction for the length of a small mouth bass at age 1 is 95.851mm. Our lower bound is -47.02316 and upper bound is 49.02348, giving us the 90% confidence interval: (21.43775, 170.2644). So we can confidently say that a 1 year old small mouth bass with be between -47.02316mm and 49.02348mm.
library(alr4)
data(Heights)
y = Heights$dheight
x = Heights$mheight
fit = lm(y~x)
summary(fit)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.397 -1.529 0.036 1.492 9.053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.91744 1.62247 18.44 <2e-16 ***
## x 0.54175 0.02596 20.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.266 on 1373 degrees of freedom
## Multiple R-squared: 0.2408, Adjusted R-squared: 0.2402
## F-statistic: 435.5 on 1 and 1373 DF, p-value: < 2.2e-16
plot(y~x, xlab = 'mheight', ylab = 'dheight')
abline(fit, col='blue')
attach(Heights)
ht.lm = lm(dheight~mheight)
summary(ht.lm)
##
## Call:
## lm(formula = dheight ~ mheight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.397 -1.529 0.036 1.492 9.053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.91744 1.62247 18.44 <2e-16 ***
## mheight 0.54175 0.02596 20.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.266 on 1373 degrees of freedom
## Multiple R-squared: 0.2408, Adjusted R-squared: 0.2402
## F-statistic: 435.5 on 1 and 1373 DF, p-value: < 2.2e-16
The estimates are βˆ0 = 29.917 and βˆ1 = 0.542, with standard errors 1.62 and 0.026, respectively. Additionally, R2 = 0.2408 and σˆ2 = 2.2662 = 5.135. Since both estimates are very large compared to their standard errors,they are both very statistically significant, so that the mother’s height is linearly related to the daughter’s. Since 24% of the variability in daughter’s height can be explained by the mother’s, this is a moderately strong relationship.
confint(ht.lm, level = 0.90)
## 5 % 95 %
## (Intercept) 27.2469103 32.5879633
## mheight 0.4990166 0.5844774
We are 90% confident that the true slope which relates the heights of mothers and daughters is between 0.499 and 0.584.
confint(ht.lm, newdata = data.frame(mheight = 65), level = 0.99)
## 0.5 % 99.5 %
## (Intercept) 25.7324151 34.1024585
## mheight 0.4747836 0.6087104
We are 99% confident that the true slope which relates heights of daughters and mothers who are 65 inches tall is between 0.4748 and 0.6087.
predict(ht.lm, newdata = data.frame(mheight = 60), level = 0.95, interval = 'predict')
## fit lwr upr
## 1 62.42226 57.97308 66.87144
There is a 95% probability that the daughter will be between 57.97 and 66.87 inches tall if her mother is 60 inches tall.
cor(x,y)
## [1] 0.4907094
Since the correlation coefficient is 0.4907, there is a moderately positive relationship between the heights of mothers and their daughters.
detach(Heights)