getwd()
## [1] "C:/Users/Caleb/Documents/Civil Engineering degree coursework/Applied Statistical Methods/Labs/Lab 4"
spruce <- read.table(file.choose(),header=TRUE,sep=",")
tail(spruce)
## BHDiameter Height
## 31 17.7 19.9
## 32 20.7 19.4
## 33 21.0 20.4
## 34 13.3 15.5
## 35 15.9 17.6
## 36 22.9 19.2
library(s20x)
x <- spruce$BHDiameter
y <- spruce$Height
trendscatter(x,y,f=0.5)
spruce.lm=with(spruce,lm(Height~BHDiameter))
spruce.lm
##
## Call:
## lm(formula = Height ~ BHDiameter)
##
## Coefficients:
## (Intercept) BHDiameter
## 9.1468 0.4815
height.res=residuals(spruce.lm)
height.fit=fitted(spruce.lm)
plot(height.fit,height.res)
trendscatter( height.fit,height.res)
#The shape is a quadratic function
#This new curve has a definite maximum while the previous curve doesn't have a clear maximum
plot(spruce.lm,which=1)
normcheck(spruce.lm,shapiro.wilk = TRUE)
The Shapiro-Wilk test gives a p-value of 0.29. The NULL hypothesis is that a variable’s values are a simple random sample from a normal distribution.
The straight line doesn’t model this well. This is because there is a quadratic term that should be in the approximation, but is instead showing up in the error.
quad.lm=lm(Height~BHDiameter + I(BHDiameter^2),data=spruce)
summary(quad.lm)
##
## Call:
## lm(formula = Height ~ BHDiameter + I(BHDiameter^2), data = spruce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2966 -0.6245 -0.0707 0.7442 3.2541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.860896 2.205022 0.390 0.698731
## BHDiameter 1.469592 0.243786 6.028 8.88e-07 ***
## I(BHDiameter^2) -0.027457 0.006635 -4.138 0.000227 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.382 on 33 degrees of freedom
## Multiple R-squared: 0.7741, Adjusted R-squared: 0.7604
## F-statistic: 56.55 on 2 and 33 DF, p-value: 2.182e-11
plot(Height~BHDiameter,bg="Blue",pch=21,cex=1.2,
ylim=c(0,max(Height)),xlim=c(0,max(BHDiameter)),
main="Spruce height prediction",data=spruce)
myplot=function(x){
quad.lm$coef[1] +quad.lm$coef[2]*x + quad.lm$coef[3]*x^2
}
curve(myplot, lwd=2, col="steelblue",add=TRUE)
quad.fit=fitted(quad.lm)
plot(quad.lm, which=1)
library(s20x)
normcheck(quad.lm,shapiro.wilk = TRUE)
The p-value is 0.684. I can conclude that the quadratic fit is a better fit for the data than a linear fit.
summary(quad.lm)
##
## Call:
## lm(formula = Height ~ BHDiameter + I(BHDiameter^2), data = spruce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2966 -0.6245 -0.0707 0.7442 3.2541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.860896 2.205022 0.390 0.698731
## BHDiameter 1.469592 0.243786 6.028 8.88e-07 ***
## I(BHDiameter^2) -0.027457 0.006635 -4.138 0.000227 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.382 on 33 degrees of freedom
## Multiple R-squared: 0.7741, Adjusted R-squared: 0.7604
## F-statistic: 56.55 on 2 and 33 DF, p-value: 2.182e-11
The formula for the fitted is \[y=-0.027457*(Breast\ Height\ Diameter)^2+1.469592*Breast\ Height\ Diameter+0.860896\]
predict(quad.lm,data.frame(BHDiameter=c(15,18,20)))
## 1 2 3
## 16.72690 18.41740 19.26984
predict(spruce.lm,data.frame(BHDiameter=c(15,18,20)))
## 1 2 3
## 16.36895 17.81338 18.77632
summary(quad.lm)
##
## Call:
## lm(formula = Height ~ BHDiameter + I(BHDiameter^2), data = spruce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2966 -0.6245 -0.0707 0.7442 3.2541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.860896 2.205022 0.390 0.698731
## BHDiameter 1.469592 0.243786 6.028 8.88e-07 ***
## I(BHDiameter^2) -0.027457 0.006635 -4.138 0.000227 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.382 on 33 degrees of freedom
## Multiple R-squared: 0.7741, Adjusted R-squared: 0.7604
## F-statistic: 56.55 on 2 and 33 DF, p-value: 2.182e-11
summary(spruce.lm)
##
## Call:
## lm(formula = Height ~ BHDiameter)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9394 -0.9763 0.2829 0.9950 2.6644
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.14684 1.12131 8.157 1.63e-09 ***
## BHDiameter 0.48147 0.05967 8.069 2.09e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.678 on 34 degrees of freedom
## Multiple R-squared: 0.6569, Adjusted R-squared: 0.6468
## F-statistic: 65.1 on 1 and 34 DF, p-value: 2.089e-09
quad.lm has an \(R^2\) value of 0.7604, while spruce.lm has an \(R^2\) value of 0.6569. Thus, quad.lm is a better model.
anova(spruce.lm,quad.lm)
## Analysis of Variance Table
##
## Model 1: Height ~ BHDiameter
## Model 2: Height ~ BHDiameter + I(BHDiameter^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 34 95.703
## 2 33 63.007 1 32.696 17.125 0.0002269 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
height.qfit=fitted(quad.lm)
TSS = with(spruce, sum((Height-mean(Height))^2))
TSS
## [1] 278.9475
MSS = with(spruce, sum((height.qfit-mean(Height))^2))
MSS
## [1] 215.9407
RSS=with(spruce, sum((Height-height.qfit)^2))
RSS
## [1] 63.00683
MSS/TSS
## [1] 0.7741266
cooks20x(quad.lm)
Cook’s distance is used to identify points that will negatively affect your regression model. It tells us that our model doesn’t fit well at 24, 21, and 18.
quad2.lm=lm(Height~BHDiameter + I(BHDiameter^2) , data=spruce[-24,])
summary(quad2.lm)
##
## Call:
## lm(formula = Height ~ BHDiameter + I(BHDiameter^2), data = spruce[-24,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.11233 -0.48227 0.01253 0.71727 2.59146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.341500 2.068479 -0.165 0.87
## BHDiameter 1.564793 0.226102 6.921 7.78e-08 ***
## I(BHDiameter^2) -0.029242 0.006114 -4.782 3.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.266 on 32 degrees of freedom
## Multiple R-squared: 0.8159, Adjusted R-squared: 0.8044
## F-statistic: 70.91 on 2 and 32 DF, p-value: 1.74e-12
summary(quad.lm)
##
## Call:
## lm(formula = Height ~ BHDiameter + I(BHDiameter^2), data = spruce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2966 -0.6245 -0.0707 0.7442 3.2541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.860896 2.205022 0.390 0.698731
## BHDiameter 1.469592 0.243786 6.028 8.88e-07 ***
## I(BHDiameter^2) -0.027457 0.006635 -4.138 0.000227 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.382 on 33 degrees of freedom
## Multiple R-squared: 0.7741, Adjusted R-squared: 0.7604
## F-statistic: 56.55 on 2 and 33 DF, p-value: 2.182e-11
This new quadratic is an even better fit than the first. ## Task 7
Piece-wise Regression:
Assume our two piece-wise regression lines are given by, \[l_{1}:\ y= \beta_{0} + \beta_{1}x\] \[l_{2}:\ y= \beta_{0} + \delta+ (\beta_{1}+\beta_{2})x\] These two lines meet at the point \((x_{k},y_{k})\). Thus, \[y_{k}=\beta_{0} + \beta_{1}x_{k}=\beta_{0} + \delta+ (\beta_{1}+\beta_{2})x_{k} \\ \Rightarrow 0=\delta+\beta_{2}x_{k} \\ \Rightarrow \delta=-\beta_{2}x_{k}\]
Then, line 2 has the following equation: \[l_{2}:\ y=\beta_{0}-\beta_{2}x_{k}+(\beta_{1}+\beta_{2})x \\ \Rightarrow y=\beta_{0} + \beta_{1}x+\beta_{2}(x-x_{k})\] Finally, to combine lines 1 and 2 we include the function I(). Which is 1 when the included statment is true and 0 when it is false. So, the equation for our piece-wise line is \[y=\beta_{0} + \beta_{1}x+\beta_{2}(x-x_{k})*I(x>x_{k})\]
I <- I(x>15)
plot(I)
sp2.df=within(spruce, X<-(BHDiameter-20)*(BHDiameter>20)) # this makes a new variable and places it within the same df
lmp=lm(Height~BHDiameter + X,data=sp2.df)
tmp=summary(lmp)
myf = function(x,coef){
coef[1]+coef[2]*(x) + coef[3]*(x-18)*(x-18>0)
}
plot(spruce,main="Piecewise regression")
curve(myf(x,coef=tmp$coefficients[,"Estimate"] ),add=TRUE, lwd=2,col="Blue")
abline(v=18)
text(18,16,paste("R sq.=",round(tmp$r.squared,4) ))
MATH4753GRAY::cheby(3)
## [1] 88.88889
This function takes an input value x which is the number of standard deviations from the mean of a data set. Then using Chebyshev’s formula it calculates the minimum percentage of the data that is contained in that interval.