#Homework 2 - Week 3 1. Load the data_Windmill csv file into R.
Windmill <- read.csv("/Users/jusimioni/Desktop/MSDA 2021-23/Spring 2023/Big Data/Data/data_Windmill.csv")
Windmill
## Velocity Output
## 1 2.45 0.123
## 2 2.70 0.500
## 3 2.90 0.653
## 4 3.05 0.558
## 5 3.40 1.057
## 6 3.60 1.137
## 7 3.95 1.144
## 8 4.10 1.194
## 9 4.60 1.562
## 10 5.00 1.582
## 11 5.45 1.501
## 12 5.80 1.737
## 13 6.00 1.822
## 14 6.20 1.866
## 15 6.35 1.930
## 16 7.00 1.800
## 17 7.40 2.088
## 18 7.85 2.179
## 19 8.15 2.166
## 20 8.80 2.112
## 21 9.10 2.303
## 22 9.55 2.294
## 23 9.70 2.386
## 24 10.00 2.236
## 25 10.20 2.310
Y <- Windmill$Output
X <- Windmill$Velocity
plot(Y~X, xlab = "Velocity", ylab = "Output")
lm_mod <- lm(Y~X, data = Windmill)
summary(lm_mod)
##
## Call:
## lm(formula = Y ~ X, data = Windmill)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59869 -0.14099 0.06059 0.17262 0.32184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13088 0.12599 1.039 0.31
## X 0.24115 0.01905 12.659 7.55e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2361 on 23 degrees of freedom
## Multiple R-squared: 0.8745, Adjusted R-squared: 0.869
## F-statistic: 160.3 on 1 and 23 DF, p-value: 7.546e-12
plot(X, Y, xlab = "Velocity", ylab = "Output", main = "Simple Linear Regression")
abline(lm_mod, col = "blue", lwd = 1)
quad_mod <- lm(Y~X + I(X^2), data = Windmill)
summary(quad_mod)
##
## Call:
## lm(formula = Y ~ X + I(X^2), data = Windmill)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26347 -0.02537 0.01264 0.03908 0.19903
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.155898 0.174650 -6.618 1.18e-06 ***
## X 0.722936 0.061425 11.769 5.77e-11 ***
## I(X^2) -0.038121 0.004797 -7.947 6.59e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1227 on 22 degrees of freedom
## Multiple R-squared: 0.9676, Adjusted R-squared: 0.9646
## F-statistic: 328.3 on 2 and 22 DF, p-value: < 2.2e-16
#Note that cex in the below plot determines how big the circles appear in the graph
plot(X, Y, xlab = "Velocity", main = "Quadratic", ylab = "Output", cex = 0.6)
lines(X, quad_mod$fitted.values, col="blue", lwd = 1)
anova(lm_mod, quad_mod)
## Analysis of Variance Table
##
## Model 1: Y ~ X
## Model 2: Y ~ X + I(X^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 23 1.28157
## 2 22 0.33108 1 0.9505 63.16 6.586e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(splines)
reg_sp <- lm(Y~bs(X, df= 4), data = Windmill)
summary(reg_sp)
##
## Call:
## lm(formula = Y ~ bs(X, df = 4), data = Windmill)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16694 -0.06493 0.01365 0.06848 0.11538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.18646 0.07063 2.640 0.0157 *
## bs(X, df = 4)1 1.18434 0.15531 7.626 2.42e-07 ***
## bs(X, df = 4)2 1.57874 0.14060 11.229 4.36e-10 ***
## bs(X, df = 4)3 2.15893 0.15095 14.302 5.78e-12 ***
## bs(X, df = 4)4 2.10989 0.09407 22.430 1.20e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09676 on 20 degrees of freedom
## Multiple R-squared: 0.9817, Adjusted R-squared: 0.978
## F-statistic: 267.7 on 4 and 20 DF, p-value: < 2.2e-16
#The below code can be performed for prediction intervals as well as confidence intervals
conf_interval <- predict(reg_sp, interval = "confidence", level = 0.99)
conf_interval
## fit lwr upr
## 1 0.1864577 -0.01449874 0.3874141
## 2 0.4221021 0.28329480 0.5609094
## 3 0.5905826 0.47867879 0.7024865
## 4 0.7059901 0.60179775 0.8101825
## 5 0.9416193 0.83260905 1.0506296
## 6 1.0569047 0.94234145 1.1714679
## 7 1.2288118 1.11066078 1.3469628
## 8 1.2920829 1.17521044 1.4089555
## 9 1.4654803 1.35998979 1.5709708
## 10 1.5714619 1.47331772 1.6696061
## 11 1.6679435 1.56731392 1.7685730
## 12 1.7341634 1.62722791 1.8410988
## 13 1.7711014 1.66171542 1.8804874
## 14 1.8085503 1.69879563 1.9183050
## 15 1.8369613 1.72818576 1.9457368
## 16 1.9603534 1.85905310 2.0616537
## 17 2.0336677 1.93019605 2.1371394
## 18 2.1105235 1.99595708 2.2250900
## 19 2.1569932 2.03393302 2.2800535
## 20 2.2397156 2.11146810 2.3679630
## 21 2.2675948 2.14644449 2.3887452
## 22 2.2945499 2.18453279 2.4045671
## 23 2.2991143 2.18520514 2.4130234
## 24 2.3009263 2.15230667 2.4495460
## 25 2.2963479 2.10240137 2.4902944
plot(X, Y, xlab = "Velocity", main = "Regression Spline", ylab = "Output", cex = .6)
lines(X, reg_sp$fitted.values, col = "blue", lwd = 1)
lines(X, conf_interval[,2], col = "red", lty = 2)
lines(X, conf_interval[,3], col = "red", lty = 2)
anova(quad_mod, reg_sp)
## Analysis of Variance Table
##
## Model 1: Y ~ X + I(X^2)
## Model 2: Y ~ bs(X, df = 4)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 22 0.33108
## 2 20 0.18724 2 0.14384 7.6822 0.003347 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The spline model has the lower sum of squared errors. The results are statistically significant since their Pr(>F) is 0.003347 lower than 0.5. The spline is better to use as a prediction output since it thinks better the data than the quadrantic model. It also shows a considerable lower sum of squares error.
library(splines)
reg_sp10 <- lm(Y~bs(X, df= 10), data = Windmill)
summary(reg_sp10)
##
## Call:
## lm(formula = Y ~ bs(X, df = 10), data = Windmill)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16701 -0.08491 0.02911 0.05992 0.12740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1335 0.1117 1.196 0.25174
## bs(X, df = 10)1 0.3156 0.2198 1.436 0.17302
## bs(X, df = 10)2 0.6719 0.2140 3.140 0.00723 **
## bs(X, df = 10)3 1.1806 0.2041 5.785 4.72e-05 ***
## bs(X, df = 10)4 1.3768 0.1930 7.133 5.06e-06 ***
## bs(X, df = 10)5 1.6983 0.1688 10.064 8.63e-08 ***
## bs(X, df = 10)6 1.7851 0.2000 8.925 3.75e-07 ***
## bs(X, df = 10)7 2.1110 0.2234 9.451 1.87e-07 ***
## bs(X, df = 10)8 2.0912 0.2303 9.080 3.04e-07 ***
## bs(X, df = 10)9 2.2270 0.1960 11.363 1.88e-08 ***
## bs(X, df = 10)10 2.1483 0.1566 13.723 1.64e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1123 on 14 degrees of freedom
## Multiple R-squared: 0.9827, Adjusted R-squared: 0.9704
## F-statistic: 79.55 on 10 and 14 DF, p-value: 1.437e-10
anova(reg_sp, reg_sp10)
## Analysis of Variance Table
##
## Model 1: Y ~ bs(X, df = 4)
## Model 2: Y ~ bs(X, df = 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 20 0.18724
## 2 14 0.17660 6 0.010637 0.1405 0.9881
There is not a significant difference between the two models. The model performed on question 3 may have better predictions since the model will does not fit the model so specifically.
cubic <- lm(Y~ns(X, df = 6), data = Windmill)
anova(reg_sp, cubic)
## Analysis of Variance Table
##
## Model 1: Y ~ bs(X, df = 4)
## Model 2: Y ~ ns(X, df = 6)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 20 0.18724
## 2 18 0.18604 2 0.0011954 0.0578 0.944
The anova shoes very little difference between the two models.
a) Calculate 85% confidence intervals for all data points
#The below code can be performed for prediction intervals as well as confidence intervals
conf_interval2 <- predict(cubic, interval = "confidence", level = 0.85)
conf_interval2
## fit lwr upr
## 1 0.1813098 0.06231097 0.3003086
## 2 0.4105225 0.33124873 0.4897963
## 3 0.5870989 0.52131612 0.6528817
## 4 0.7121712 0.64428747 0.7800550
## 5 0.9662633 0.88576389 1.0467628
## 6 1.0809850 1.00264919 1.1593208
## 7 1.2350628 1.16583929 1.3042863
## 8 1.2874445 1.21642980 1.3584592
## 9 1.4309811 1.33798159 1.5239806
## 10 1.5378558 1.45190860 1.6238029
## 11 1.6544810 1.58615440 1.7228076
## 12 1.7389925 1.66827392 1.8097112
## 13 1.7836864 1.71003464 1.8573381
## 14 1.8252945 1.75235358 1.8982354
## 15 1.8546607 1.78385263 1.9254689
## 16 1.9689915 1.89076504 2.0472179
## 17 2.0340475 1.94481569 2.1232793
## 18 2.1063603 2.02573978 2.1869809
## 19 2.1521916 2.07838469 2.2259985
## 20 2.2358039 2.14908546 2.3225223
## 21 2.2633765 2.17576106 2.3509919
## 22 2.2890786 2.22021602 2.3579413
## 23 2.2944607 2.22821502 2.3607063
## 24 2.3023578 2.21711962 2.3875961
## 25 2.3065215 2.19394116 2.4191019
plot(X, Y, xlab = "Velocity", main = "Spline", ylab = "Output", cex = .6)
lines(X, cubic$fitted.values, col = "blue", lwd = 1)
lines(X, conf_interval2[,2], col = "red", lty = 2)
lines(X, conf_interval2[,3], col = "red", lty = 2)