HW 2
Please submit both you code and resulting output. For simplicity and to ensure that your output matches your textual explanations and answers, I will accept copies of your output in Microsoft Word for this assignment. However if you feel more comfortable doing so, you can also alternatively submit an HTML document produced in R Markdown. Just be sure that the HTML document displays your output and that your written answers match the displayed output. Be sure to use either R or R Markdown to answer all of the following questions.
windmill <- read.csv("C:/Users/raze1/OneDrive/Desktop/UIndy/MSDA 622/Homework/Homework 2/data_Windmill.csv")
summary(windmill)
## Velocity Output
## Min. : 2.450 Min. :0.123
## 1st Qu.: 3.950 1st Qu.:1.144
## Median : 6.000 Median :1.800
## Mean : 6.132 Mean :1.610
## 3rd Qu.: 8.150 3rd Qu.:2.166
## Max. :10.200 Max. :2.386
windmill_lm <- lm(windmill$Output~windmill$Velocity, data = windmill)
summary(windmill_lm)
##
## Call:
## lm(formula = windmill$Output ~ windmill$Velocity, data = windmill)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59869 -0.14099 0.06059 0.17262 0.32184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13088 0.12599 1.039 0.31
## windmill$Velocity 0.24115 0.01905 12.659 7.55e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2361 on 23 degrees of freedom
## Multiple R-squared: 0.8745, Adjusted R-squared: 0.869
## F-statistic: 160.3 on 1 and 23 DF, p-value: 7.546e-12
library(MASS)
windmill_quad <- lm(windmill$Output~windmill$Velocity + I(windmill$Velocity^2), data = windmill)
summary(windmill_quad)
##
## Call:
## lm(formula = windmill$Output ~ windmill$Velocity + I(windmill$Velocity^2),
## data = windmill)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26347 -0.02537 0.01264 0.03908 0.19903
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.155898 0.174650 -6.618 1.18e-06 ***
## windmill$Velocity 0.722936 0.061425 11.769 5.77e-11 ***
## I(windmill$Velocity^2) -0.038121 0.004797 -7.947 6.59e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1227 on 22 degrees of freedom
## Multiple R-squared: 0.9676, Adjusted R-squared: 0.9646
## F-statistic: 328.3 on 2 and 22 DF, p-value: < 2.2e-16
plot(windmill$Velocity, windmill$Output, xlab = "Velocity", main = "Windmill Quadratic Polynomial Model", ylab = "Output", cex = 0.5)
lines(windmill$Velocity, windmill_quad$fitted.values, col="blue", lwd = 1)
lines(windmill$Velocity, windmill_lm$fitted.values, col = "red", lwd = 1)
library(splines)
windmill_regsp <- lm(windmill$Output~bs(windmill$Velocity), data = windmill)
summary(windmill_regsp)
##
## Call:
## lm(formula = windmill$Output ~ bs(windmill$Velocity), data = windmill)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.199016 -0.054316 0.008116 0.064812 0.164210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.26965 0.06606 4.082 0.000534 ***
## bs(windmill$Velocity)1 1.95653 0.21358 9.161 8.79e-09 ***
## bs(windmill$Velocity)2 1.65272 0.15323 10.786 5.07e-10 ***
## bs(windmill$Velocity)3 2.09467 0.10267 20.401 2.52e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1059 on 21 degrees of freedom
## Multiple R-squared: 0.9769, Adjusted R-squared: 0.9737
## F-statistic: 296.6 on 3 and 21 DF, p-value: < 2.2e-16
windmill_conf <- predict(windmill_regsp, interval = "confidence", level = 0.99)
windmill_conf
## fit lwr upr
## 1 0.2696470 0.08262017 0.4566739
## 2 0.4520332 0.30651363 0.5975527
## 3 0.5881880 0.46637661 0.7099993
## 4 0.6848176 0.57510566 0.7945296
## 5 0.8927897 0.79458536 0.9909941
## 6 1.0011300 0.90233038 1.0999296
## 7 1.1734678 1.06934939 1.2775862
## 8 1.2409421 1.13455111 1.3473330
## 9 1.4402880 1.33068924 1.5498868
## 10 1.5738840 1.46706326 1.6807048
## 11 1.7000160 1.59957174 1.8004603
## 12 1.7826002 1.68676662 1.8784337
## 13 1.8244050 1.73012221 1.9186877
## 14 1.8626800 1.76876779 1.9565923
## 15 1.8892565 1.79475828 1.9837546
## 16 1.9866828 1.88239870 2.0909669
## 17 2.0355655 1.92292206 2.1482089
## 18 2.0841803 1.96472520 2.2036354
## 19 2.1145093 1.99371608 2.2353025
## 20 2.1800943 2.06663299 2.2935557
## 21 2.2126060 2.10414035 2.3210716
## 22 2.2670795 2.15330387 2.3808552
## 23 2.2872821 2.16430018 2.4102640
## 24 2.3315389 2.17519551 2.4878822
## 25 2.3643163 2.17461293 2.5540196
plot(windmill$Velocity, windmill$Output, xlab = "Velocity", main = "Regression Spline", ylab = "Output", cex = 0.5)
lines(windmill$Velocity, windmill_regsp$fitted.values, col = "blue", lwd = 1)
lines(windmill$Velocity, windmill_conf[,2], col = "red", lty = 2)
lines(windmill$Velocity, windmill_conf[,3], col = "red", lty = 2)
anova(windmill_quad, windmill_regsp)
## Analysis of Variance Table
##
## Model 1: windmill$Output ~ windmill$Velocity + I(windmill$Velocity^2)
## Model 2: windmill$Output ~ bs(windmill$Velocity)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 22 0.33108
## 2 21 0.23543 1 0.095649 8.5318 0.008168 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The regression splines model has the smallest Sum of Squared Error, otherwise known as the Residual Sum of Squares; the regression splines RSS is 30% smaller than the quadratic model. This 30% smaller RSS is statistically signifcant and would indicate that the Regression Splines is the better model.
windmill_regsp2 <- lm(windmill$Output~bs(windmill$Velocity, df = 10), data = windmill)
anova(windmill_regsp, windmill_regsp2)
## Analysis of Variance Table
##
## Model 1: windmill$Output ~ bs(windmill$Velocity)
## Model 2: windmill$Output ~ bs(windmill$Velocity, df = 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 21 0.23543
## 2 14 0.17660 7 0.058827 0.6662 0.6973
plot(windmill$Velocity, windmill$Output, xlab = "Velocity", main = "Regression Spline with df = 10", ylab = "Output", cex = 0.5)
lines(windmill$Velocity, windmill_regsp2$fitted.values, col = "blue", lwd = 1)
windmill_conf2 <- predict(windmill_regsp2, interval = "confidence", level = 0.99)
lines(windmill$Velocity, windmill_conf2[,2], col = "red", lty = 2)
lines(windmill$Velocity, windmill_conf2[,3], col = "red", lty = 2)
The regression splines model with 10 degrees of freedom has
the smallest Sum of Squared Error, but by only 25%. This 35% smaller RSS
is statistically significant and would indicate that the Regression
Splines with 10 degrees of freedom is the better model. However, it
seems that we are approaching the point of limited
returns.
windmill_ns <- lm(windmill$Output~ns(windmill$Velocity, df = 6), data = windmill)
anova(windmill_regsp, windmill_ns)
## Analysis of Variance Table
##
## Model 1: windmill$Output ~ bs(windmill$Velocity)
## Model 2: windmill$Output ~ ns(windmill$Velocity, df = 6)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 21 0.23543
## 2 18 0.18604 3 0.049386 1.5927 0.226
windmill_confns <- predict(windmill_ns, interval = "confidence", level = 0.85)
windmill_confns
## fit lwr upr
## 1 0.1813098 0.06231097 0.3003086
## 2 0.4105225 0.33124873 0.4897963
## 3 0.5870989 0.52131612 0.6528817
## 4 0.7121712 0.64428747 0.7800550
## 5 0.9662633 0.88576389 1.0467628
## 6 1.0809850 1.00264919 1.1593208
## 7 1.2350628 1.16583929 1.3042863
## 8 1.2874445 1.21642980 1.3584592
## 9 1.4309811 1.33798159 1.5239806
## 10 1.5378558 1.45190860 1.6238029
## 11 1.6544810 1.58615440 1.7228076
## 12 1.7389925 1.66827392 1.8097112
## 13 1.7836864 1.71003464 1.8573381
## 14 1.8252945 1.75235358 1.8982354
## 15 1.8546607 1.78385263 1.9254689
## 16 1.9689915 1.89076504 2.0472179
## 17 2.0340475 1.94481569 2.1232793
## 18 2.1063603 2.02573978 2.1869809
## 19 2.1521916 2.07838469 2.2259985
## 20 2.2358039 2.14908546 2.3225223
## 21 2.2633765 2.17576106 2.3509919
## 22 2.2890786 2.22021602 2.3579413
## 23 2.2944607 2.22821502 2.3607063
## 24 2.3023578 2.21711962 2.3875961
## 25 2.3065215 2.19394116 2.4191019
plot(windmill$Velocity, windmill$Output, xlab = "Velocity", main = "Natural Cubic Spline with df = 6", ylab = "Output", cex = 0.5)
lines(windmill$Velocity, windmill_ns$fitted.values, col = "blue", lwd = 1)
lines(windmill$Velocity, windmill_confns[,2], col = "red", lty = 2)
lines(windmill$Velocity, windmill_confns[,3], col = "red", lty = 2)