Question- 1 Load the data_Windmill csv file into R

#windmill data set loaded below
windmill <- read.csv("G:/Other computers/My Laptop/Documents/Richard 622 last semester/Week 3/data_Windmill.csv")

#what does the data look like-see below
str(windmill)
## 'data.frame':    25 obs. of  2 variables:
##  $ Velocity: num  2.45 2.7 2.9 3.05 3.4 3.6 3.95 4.1 4.6 5 ...
##  $ Output  : num  0.123 0.5 0.653 0.558 1.057 ...
#What does the data look like plotted? 
#scatterplot
plot(windmill$Output ~ windmill$Velocity, xlab = "Velocity", ylab = "Output", main = "Scatterplot of Output against Velocity")

# not far off from linear, but has a slight curve

Question 2- Develop a quadratic polynomial regression model with Velocity as the covariate and Output as the target variable.

#Change variables to X and Y to make them easier to work with
Y <- windmill$Output
X <- windmill$Velocity

# Quadratic polynomial regression model
windmill_quad_mod <- lm(Y~ X + I(X^2), data = windmill)
summary(windmill_quad_mod)
## 
## Call:
## lm(formula = Y ~ X + I(X^2), data = windmill)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26347 -0.02537  0.01264  0.03908  0.19903 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.155898   0.174650  -6.618 1.18e-06 ***
## X            0.722936   0.061425  11.769 5.77e-11 ***
## I(X^2)      -0.038121   0.004797  -7.947 6.59e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1227 on 22 degrees of freedom
## Multiple R-squared:  0.9676, Adjusted R-squared:  0.9646 
## F-statistic: 328.3 on 2 and 22 DF,  p-value: < 2.2e-16
# Both velocity and transformed velocity are significant based on p values.  Regular velocity has a positive estimate meaning as it goes up so does output however the transformed version is the opposite due to the negative estimate value.  The multiple R-squared and and Adjusted R-squared are very good. 

#Coefficients:
               #Estimate Std. Error t value Pr(>|t|)    
#(Intercept)   -1.155898   0.174650  -6.618 1.18e-06 ***
#Velocity       0.722936   0.061425  11.769 5.77e-11 ***
#I(Velocity^2) -0.038121   0.004797  -7.947 6.59e-08 ***
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

#Residual standard error: 0.1227 on 22 degrees of freedom
#Multiple R-squared:  0.9676,   Adjusted R-squared:  0.9646 
#F-statistic: 328.3 on 2 and 22 DF,  p-value: < 2.2e-16

Question 3- Use the b spline algorithm to develop a spline model with 4 degrees of freedom, and perform the following for your spline model: a) Calculate 99% confidence intervals for all data points b) Create a scatter plot comparing Velocity and Output, and on this plot, display the confidence intervals and predicted values for all data points.

#load spline library to run spline model
library(splines)

#spline model with 4 degrees of freedom
windmill_sp = lm(Y~bs(X, df = 4), data = windmill)
summary(windmill_sp)
## 
## Call:
## lm(formula = Y ~ bs(X, df = 4), data = windmill)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16694 -0.06493  0.01365  0.06848  0.11538 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.18646    0.07063   2.640   0.0157 *  
## bs(X, df = 4)1  1.18434    0.15531   7.626 2.42e-07 ***
## bs(X, df = 4)2  1.57874    0.14060  11.229 4.36e-10 ***
## bs(X, df = 4)3  2.15893    0.15095  14.302 5.78e-12 ***
## bs(X, df = 4)4  2.10989    0.09407  22.430 1.20e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09676 on 20 degrees of freedom
## Multiple R-squared:  0.9817, Adjusted R-squared:  0.978 
## F-statistic: 267.7 on 4 and 20 DF,  p-value: < 2.2e-16
#Multiple R-Squared and Adjusted R-Squared are better than the quadratic polynomial model
#Multiple R-squared:  0.9817, Adjusted R-squared:  0.978

#a) Calculate 99% confidence intervals of all data points
wind_conf_inter <- predict(windmill_sp, interval = "confidence", level = 0.99)
wind_conf_inter
##          fit         lwr       upr
## 1  0.1864577 -0.01449874 0.3874141
## 2  0.4221021  0.28329480 0.5609094
## 3  0.5905826  0.47867879 0.7024865
## 4  0.7059901  0.60179775 0.8101825
## 5  0.9416193  0.83260905 1.0506296
## 6  1.0569047  0.94234145 1.1714679
## 7  1.2288118  1.11066078 1.3469628
## 8  1.2920829  1.17521044 1.4089555
## 9  1.4654803  1.35998979 1.5709708
## 10 1.5714619  1.47331772 1.6696061
## 11 1.6679435  1.56731392 1.7685730
## 12 1.7341634  1.62722791 1.8410988
## 13 1.7711014  1.66171542 1.8804874
## 14 1.8085503  1.69879563 1.9183050
## 15 1.8369613  1.72818576 1.9457368
## 16 1.9603534  1.85905310 2.0616537
## 17 2.0336677  1.93019605 2.1371394
## 18 2.1105235  1.99595708 2.2250900
## 19 2.1569932  2.03393302 2.2800535
## 20 2.2397156  2.11146810 2.3679630
## 21 2.2675948  2.14644449 2.3887452
## 22 2.2945499  2.18453279 2.4045671
## 23 2.2991143  2.18520514 2.4130234
## 24 2.3009263  2.15230667 2.4495460
## 25 2.2963479  2.10240137 2.4902944
#b) Scatter plot comparing Velocity and Output to confiedence intervals
plot(X, Y, xlab = "Velocity", main = "Regression Spline with df = 4", ylab = "Output", cex = .5)
lines(X, windmill_sp$fitted.values, col = "blue", lwd = 1)
wind_conf_inter <- predict(windmill_sp, interval = "confidence", level = 0.99)
lines(X, wind_conf_inter[,2], col = "red", lty = 2)
lines(X, wind_conf_inter[,3], col = "red", lty = 2)

Question 4- Use ANOVA to compare the two models that you developed in questions #2 and #3. Which of the models has the smallest sum of squared errors? Based on your results, do you believe there is a statistically significant difference between the predictive performance of the two models? If so, which model do you think would be better to use for predicting Output? Explain your reasoning.

#ANOVA comparing two models above
anova(windmill_sp, windmill_quad_mod)
## Analysis of Variance Table
## 
## Model 1: Y ~ bs(X, df = 4)
## Model 2: Y ~ X + I(X^2)
##   Res.Df     RSS Df Sum of Sq      F   Pr(>F)   
## 1     20 0.18724                                
## 2     22 0.33108 -2  -0.14384 7.6822 0.003347 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Analysis of Variance Table

#Model 1: Y ~ bs(X, df = 4)
#Model 2: Y ~ X + I(X^2)
  #Res.Df     RSS Df Sum of Sq      F   Pr(>F)   
#1     20 0.18724                                
#2     22 0.33108 -2  -0.14384 7.6822 0.003347 **

#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

#Model 1 as designated in the ANOVA (the b spline model) has the smallest sum of squared errors at 0.18724
# There was not a big difference in adjusted R-Squared values between the models, but the windmill b spline model (model 1 here) did have a better adjusted R-squared.  Based on the ANOVA comparison Model 1, the b spline model, is statistically significant and would be better at predicting output due to the P value being less than 0.05.  

Question 5- Use the b spline algorithm to develop a spline model with 10 degrees of freedom. Use ANOVA to compare this model with the one you developed in question #3. Do you believe there is a statistically significant difference between the predictive performance of the two models? If so, which model do you think would be better to use for predicting Output? Explain your reasoning

#spline model with 10 degrees of freedom
windmill_sp_model2 = lm(Y~bs(X, df = 10), data = windmill)
summary(windmill_sp_model2)
## 
## Call:
## lm(formula = Y ~ bs(X, df = 10), data = windmill)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16701 -0.08491  0.02911  0.05992  0.12740 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.1335     0.1117   1.196  0.25174    
## bs(X, df = 10)1    0.3156     0.2198   1.436  0.17302    
## bs(X, df = 10)2    0.6719     0.2140   3.140  0.00723 ** 
## bs(X, df = 10)3    1.1806     0.2041   5.785 4.72e-05 ***
## bs(X, df = 10)4    1.3768     0.1930   7.133 5.06e-06 ***
## bs(X, df = 10)5    1.6983     0.1688  10.064 8.63e-08 ***
## bs(X, df = 10)6    1.7851     0.2000   8.925 3.75e-07 ***
## bs(X, df = 10)7    2.1110     0.2234   9.451 1.87e-07 ***
## bs(X, df = 10)8    2.0912     0.2303   9.080 3.04e-07 ***
## bs(X, df = 10)9    2.2270     0.1960  11.363 1.88e-08 ***
## bs(X, df = 10)10   2.1483     0.1566  13.723 1.64e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1123 on 14 degrees of freedom
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9704 
## F-statistic: 79.55 on 10 and 14 DF,  p-value: 1.437e-10
#compare the spline model in question 3 to the spline model in this question
anova(windmill_sp, windmill_sp_model2)
## Analysis of Variance Table
## 
## Model 1: Y ~ bs(X, df = 4)
## Model 2: Y ~ bs(X, df = 10)
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     20 0.18724                           
## 2     14 0.17660  6  0.010637 0.1405 0.9881
#results
#Model 1: Y ~ bs(X, df = 4)
#Model 2: Y ~ bs(X, df = 10)
  #Res.Df     RSS Df Sum of Sq      F Pr(>F)
#1     20 0.18724                           
#2     14 0.17660  6  0.010637 0.1405 0.9881

#Model 2 above has the smaller sum of squared errors, however based on the p value there is not a statistically significant difference between the predictive performance of the models

Question 6 - Develop a natural cubic spline model with six degrees of freedom, and use ANOVA to compare this model with the one you developed in question #3. Then perform the following for your spline model: a) Calculate 85% confidence intervals for all data points b) Create a scatter plot comparing Velocity and Output, and on this plot, display the confidence intervals and predicted values for all data points.

#Natural Spline 6 degrees of freedom
ns_windmill = lm(Y~ns(X, df = 6), data = windmill)
summary(ns_windmill)
## 
## Call:
## lm(formula = Y ~ ns(X, df = 6), data = windmill)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16899 -0.06636  0.03831  0.06590  0.13102 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.18131    0.07914   2.291   0.0342 *  
## ns(X, df = 6)1  1.26537    0.12859   9.840 1.14e-08 ***
## ns(X, df = 6)2  1.62893    0.13782  11.819 6.44e-10 ***
## ns(X, df = 6)3  1.86721    0.14100  13.242 1.02e-10 ***
## ns(X, df = 6)4  1.87604    0.12613  14.874 1.48e-11 ***
## ns(X, df = 6)5  2.90818    0.20579  14.132 3.48e-11 ***
## ns(X, df = 6)6  1.55777    0.09277  16.792 1.92e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1017 on 18 degrees of freedom
## Multiple R-squared:  0.9818, Adjusted R-squared:  0.9757 
## F-statistic: 161.7 on 6 and 18 DF,  p-value: 1.177e-14
# Compare this model to the model in question 3
anova(windmill_sp, ns_windmill)
## Analysis of Variance Table
## 
## Model 1: Y ~ bs(X, df = 4)
## Model 2: Y ~ ns(X, df = 6)
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     20 0.18724                           
## 2     18 0.18604  2 0.0011954 0.0578  0.944
#results
#Model 1: Y ~ bs(X, df = 4)
#Model 2: Y ~ ns(X, df = 6)
  #Res.Df     RSS Df Sum of Sq      F Pr(>F)
#1     20 0.18724                           
#2     18 0.18604  2 0.0011954 0.0578  0.944

#Model 2 above has the smaller sum of squared errors, however based on the p value there is not a statistically significant difference between the predictive performance of the models

#a) Calculate 85% confidence intervals for all data points
ns_wind_conf_inter <- predict(ns_windmill, interval = "confidence", level = 0.85)
ns_wind_conf_inter
##          fit        lwr       upr
## 1  0.1813098 0.06231097 0.3003086
## 2  0.4105225 0.33124873 0.4897963
## 3  0.5870989 0.52131612 0.6528817
## 4  0.7121712 0.64428747 0.7800550
## 5  0.9662633 0.88576389 1.0467628
## 6  1.0809850 1.00264919 1.1593208
## 7  1.2350628 1.16583929 1.3042863
## 8  1.2874445 1.21642980 1.3584592
## 9  1.4309811 1.33798159 1.5239806
## 10 1.5378558 1.45190860 1.6238029
## 11 1.6544810 1.58615440 1.7228076
## 12 1.7389925 1.66827392 1.8097112
## 13 1.7836864 1.71003464 1.8573381
## 14 1.8252945 1.75235358 1.8982354
## 15 1.8546607 1.78385263 1.9254689
## 16 1.9689915 1.89076504 2.0472179
## 17 2.0340475 1.94481569 2.1232793
## 18 2.1063603 2.02573978 2.1869809
## 19 2.1521916 2.07838469 2.2259985
## 20 2.2358039 2.14908546 2.3225223
## 21 2.2633765 2.17576106 2.3509919
## 22 2.2890786 2.22021602 2.3579413
## 23 2.2944607 2.22821502 2.3607063
## 24 2.3023578 2.21711962 2.3875961
## 25 2.3065215 2.19394116 2.4191019
#b) Scatter plot comparing Velocity and Output to confidence intervals
plot(X, Y, xlab = "Velocity", main = "Natural Cubic Spline with df = 6", ylab = "Output", cex = .5)
lines(X, ns_windmill$fitted.values, col = "blue", lwd = 1)
ns_wind_conf_inter <- predict(ns_windmill, interval = "confidence", level = 0.85)
lines(X, ns_wind_conf_inter[,2], col = "red", lty = 2)
lines(X, ns_wind_conf_inter[,3], col = "red", lty = 2)