# Homework 2 - Week 3

1. Load the data_Windmill.csv file into R.

Windmill <- read.csv("/Users/jusimioni/Desktop/MSDA 2021-23/Spring 2023/Big Data/Data/data_Windmill.csv")
Windmill
##    Velocity Output
## 1      2.45  0.123
## 2      2.70  0.500
## 3      2.90  0.653
## 4      3.05  0.558
## 5      3.40  1.057
## 6      3.60  1.137
## 7      3.95  1.144
## 8      4.10  1.194
## 9      4.60  1.562
## 10     5.00  1.582
## 11     5.45  1.501
## 12     5.80  1.737
## 13     6.00  1.822
## 14     6.20  1.866
## 15     6.35  1.930
## 16     7.00  1.800
## 17     7.40  2.088
## 18     7.85  2.179
## 19     8.15  2.166
## 20     8.80  2.112
## 21     9.10  2.303
## 22     9.55  2.294
## 23     9.70  2.386
## 24    10.00  2.236
## 25    10.20  2.310
2. Develop a quadratic polynomial regression model with Velocity as the covariate and Output as the target variable (i.e., Velocity is like X and Output is like Y in the class lecture on Monday, January 30).

# Rename the variables for ease of use
Y <- Windmill$Output
X <- Windmill$Velocity

Scatterplot to understand the data

plot(Y~X, xlab = "Velocity", ylab = "Output")

Simple Linear Regression

lm_mod <- lm(Y~X, data = Windmill)
summary(lm_mod)
## 
## Call:
## lm(formula = Y ~ X, data = Windmill)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59869 -0.14099  0.06059  0.17262  0.32184 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.13088    0.12599   1.039     0.31    
## X            0.24115    0.01905  12.659 7.55e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2361 on 23 degrees of freedom
## Multiple R-squared:  0.8745, Adjusted R-squared:  0.869 
## F-statistic: 160.3 on 1 and 23 DF,  p-value: 7.546e-12
plot(X, Y, xlab = "Velocity", ylab = "Output", main = "Simple Linear Regression")
abline(lm_mod, col = "blue", lwd = 1)
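
Although not required by the assignment, a quick residual plot is a useful check here: a systematic curved pattern in the residuals suggests the straight-line fit is inadequate, which motivates the quadratic term added next.

# Residuals vs. fitted values; a curved pattern indicates a missing quadratic term
plot(lm_mod$fitted.values, lm_mod$residuals,
     xlab = "Fitted values", ylab = "Residuals", main = "SLR Residual Plot")
abline(h = 0, lty = 2)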

Quadratic Polynomial Regression

quad_mod <- lm(Y~X + I(X^2), data = Windmill) 
summary(quad_mod)
## 
## Call:
## lm(formula = Y ~ X + I(X^2), data = Windmill)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26347 -0.02537  0.01264  0.03908  0.19903 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.155898   0.174650  -6.618 1.18e-06 ***
## X            0.722936   0.061425  11.769 5.77e-11 ***
## I(X^2)      -0.038121   0.004797  -7.947 6.59e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1227 on 22 degrees of freedom
## Multiple R-squared:  0.9676, Adjusted R-squared:  0.9646 
## F-statistic: 328.3 on 2 and 22 DF,  p-value: < 2.2e-16
# Note that cex in the plot below controls how large the points appear in the graph
plot(X, Y, xlab = "Velocity", main = "Quadratic", ylab = "Output", cex = 0.6)
lines(X, quad_mod$fitted.values, col="blue", lwd = 1)
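
As a side note, the same quadratic can be fit with poly(): with raw = TRUE it reproduces the X and I(X^2) coefficients exactly, while the default orthogonal polynomials give the same fitted values but different coefficient estimates. (quad_mod2 is just an illustrative name.)

# Equivalent fit to quad_mod; raw = TRUE matches the X and I(X^2) parameterization
quad_mod2 <- lm(Y ~ poly(X, 2, raw = TRUE), data = Windmill)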

anova(lm_mod, quad_mod) 
## Analysis of Variance Table
## 
## Model 1: Y ~ X
## Model 2: Y ~ X + I(X^2)
##   Res.Df     RSS Df Sum of Sq     F    Pr(>F)    
## 1     23 1.28157                                 
## 2     22 0.33108  1    0.9505 63.16 6.586e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
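
Adding the quadratic term reduces the RSS from 1.282 to 0.331, and the F-test (F = 63.16, Pr(>F) = 6.586e-08) confirms that the quadratic model fits significantly better than the simple linear model.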
3. Use the B-spline algorithm to develop a spline model with 4 degrees of freedom, and perform the following for your spline model:

Regression Splines

library(splines)
reg_sp <- lm(Y~bs(X, df= 4), data = Windmill)
summary(reg_sp)
## 
## Call:
## lm(formula = Y ~ bs(X, df = 4), data = Windmill)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16694 -0.06493  0.01365  0.06848  0.11538 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.18646    0.07063   2.640   0.0157 *  
## bs(X, df = 4)1  1.18434    0.15531   7.626 2.42e-07 ***
## bs(X, df = 4)2  1.57874    0.14060  11.229 4.36e-10 ***
## bs(X, df = 4)3  2.15893    0.15095  14.302 5.78e-12 ***
## bs(X, df = 4)4  2.10989    0.09407  22.430 1.20e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09676 on 20 degrees of freedom
## Multiple R-squared:  0.9817, Adjusted R-squared:  0.978 
## F-statistic: 267.7 on 4 and 20 DF,  p-value: < 2.2e-16
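
With df = 4 and the default cubic degree, bs() uses df - degree = 1 interior knot, placed by default at the median of X; the attribute below confirms its location.

# Interior knot chosen by bs() (df = 4, cubic degree -> one knot at median(X))
attr(bs(X, df = 4), "knots")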
a) Calculate 99% confidence intervals for all data points
# The same predict() call can also produce prediction intervals (set interval = "prediction")
conf_interval <- predict(reg_sp, interval = "confidence", level = 0.99)
conf_interval
##          fit         lwr       upr
## 1  0.1864577 -0.01449874 0.3874141
## 2  0.4221021  0.28329480 0.5609094
## 3  0.5905826  0.47867879 0.7024865
## 4  0.7059901  0.60179775 0.8101825
## 5  0.9416193  0.83260905 1.0506296
## 6  1.0569047  0.94234145 1.1714679
## 7  1.2288118  1.11066078 1.3469628
## 8  1.2920829  1.17521044 1.4089555
## 9  1.4654803  1.35998979 1.5709708
## 10 1.5714619  1.47331772 1.6696061
## 11 1.6679435  1.56731392 1.7685730
## 12 1.7341634  1.62722791 1.8410988
## 13 1.7711014  1.66171542 1.8804874
## 14 1.8085503  1.69879563 1.9183050
## 15 1.8369613  1.72818576 1.9457368
## 16 1.9603534  1.85905310 2.0616537
## 17 2.0336677  1.93019605 2.1371394
## 18 2.1105235  1.99595708 2.2250900
## 19 2.1569932  2.03393302 2.2800535
## 20 2.2397156  2.11146810 2.3679630
## 21 2.2675948  2.14644449 2.3887452
## 22 2.2945499  2.18453279 2.4045671
## 23 2.2991143  2.18520514 2.4130234
## 24 2.3009263  2.15230667 2.4495460
## 25 2.2963479  2.10240137 2.4902944
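
For comparison, the same predict() call with interval = "prediction" yields 99% prediction intervals, which are wider because they account for the noise in a new observation rather than only the uncertainty in the fitted mean. (pred_interval is just an illustrative name.)

# Prediction intervals for new observations (wider than the confidence intervals above)
pred_interval <- predict(reg_sp, interval = "prediction", level = 0.99)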
b) Create a scatter plot comparing Velocity and Output, and on this plot, display the confidence intervals and predicted values for all data points.
plot(X, Y, xlab = "Velocity", main = "Regression Spline", ylab = "Output", cex = .6)
lines(X, reg_sp$fitted.values, col = "blue", lwd = 1)
lines(X, conf_interval[,2], col = "red", lty = 2)
lines(X, conf_interval[,3], col = "red", lty = 2)
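
The two dashed interval bounds can equivalently be drawn in a single call:

# One call draws both interval bounds (columns 2 and 3 of conf_interval)
matlines(X, conf_interval[, 2:3], col = "red", lty = 2)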

4. Use ANOVA to compare the two models that you developed in questions #2 and #3. Which of the models has the smallest sum of squared errors? Based on your results, do you believe there is a statistically significant difference between the predictive performance of the two models? If so, which model do you think would be better to use for predicting Output? Explain your reasoning.
anova(quad_mod, reg_sp)
## Analysis of Variance Table
## 
## Model 1: Y ~ X + I(X^2)
## Model 2: Y ~ bs(X, df = 4)
##   Res.Df     RSS Df Sum of Sq      F   Pr(>F)   
## 1     22 0.33108                                
## 2     20 0.18724  2   0.14384 7.6822 0.003347 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The spline model has the smaller sum of squared errors (RSS = 0.187 vs. 0.331 for the quadratic model), and the difference is statistically significant since Pr(>F) = 0.003347 is below 0.05. The spline model is therefore the better choice for predicting Output: it captures the shape of the data better than the quadratic model, as reflected in its considerably lower sum of squared errors.
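
As an additional, informal check (not part of the assignment), information criteria should point the same way: given the RSS gap above, the spline model comes out with the lower AIC.

# Lower AIC indicates the preferred model; here it agrees with the ANOVA result
AIC(quad_mod, reg_sp)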

5. Use the B-spline algorithm to develop a spline model with 10 degrees of freedom. Use ANOVA to compare this model with the one you developed in question #3. Do you believe there is a statistically significant difference between the predictive performance of the two models? If so, which model do you think would be better to use for predicting Output? Explain your reasoning.
library(splines)
reg_sp10 <- lm(Y~bs(X, df= 10), data = Windmill)
summary(reg_sp10)
## 
## Call:
## lm(formula = Y ~ bs(X, df = 10), data = Windmill)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16701 -0.08491  0.02911  0.05992  0.12740 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.1335     0.1117   1.196  0.25174    
## bs(X, df = 10)1    0.3156     0.2198   1.436  0.17302    
## bs(X, df = 10)2    0.6719     0.2140   3.140  0.00723 ** 
## bs(X, df = 10)3    1.1806     0.2041   5.785 4.72e-05 ***
## bs(X, df = 10)4    1.3768     0.1930   7.133 5.06e-06 ***
## bs(X, df = 10)5    1.6983     0.1688  10.064 8.63e-08 ***
## bs(X, df = 10)6    1.7851     0.2000   8.925 3.75e-07 ***
## bs(X, df = 10)7    2.1110     0.2234   9.451 1.87e-07 ***
## bs(X, df = 10)8    2.0912     0.2303   9.080 3.04e-07 ***
## bs(X, df = 10)9    2.2270     0.1960  11.363 1.88e-08 ***
## bs(X, df = 10)10   2.1483     0.1566  13.723 1.64e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1123 on 14 degrees of freedom
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9704 
## F-statistic: 79.55 on 10 and 14 DF,  p-value: 1.437e-10
anova(reg_sp, reg_sp10)
## Analysis of Variance Table
## 
## Model 1: Y ~ bs(X, df = 4)
## Model 2: Y ~ bs(X, df = 10)
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     20 0.18724                           
## 2     14 0.17660  6  0.010637 0.1405 0.9881

There is no statistically significant difference between the two models (Pr(>F) = 0.9881). The df = 4 model from question #3 is likely the better choice for prediction: it is simpler, and the extra flexibility of the df = 10 model does not meaningfully reduce the residual sum of squares, so the larger model mainly risks overfitting the training data.
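
If an objective way to choose df is wanted, leave-one-out cross-validation is one option. The sketch below is an illustration rather than part of the assignment (loocv_mse is just an illustrative name); note that when a held-out point lies at an extreme of Velocity, bs() will warn about predicting beyond the boundary knots of the training data.

# Leave-one-out CV error for df = 4 through 10 (expect boundary-knot warnings)
loocv_mse <- sapply(4:10, function(d) {
  errs <- sapply(1:nrow(Windmill), function(i) {
    fit <- lm(Output ~ bs(Velocity, df = d), data = Windmill[-i, ])
    Windmill$Output[i] - predict(fit, newdata = Windmill[i, ])
  })
  mean(errs^2)
})
names(loocv_mse) <- paste0("df", 4:10)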

6. Develop a natural cubic spline model with six degrees of freedom, and use ANOVA to compare this model with the one you developed in question #3. Then perform the following for your spline model:
cubic <- lm(Y~ns(X, df = 6), data = Windmill)

anova(reg_sp, cubic)
## Analysis of Variance Table
## 
## Model 1: Y ~ bs(X, df = 4)
## Model 2: Y ~ ns(X, df = 6)
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     20 0.18724                           
## 2     18 0.18604  2 0.0011954 0.0578  0.944

The ANOVA shows very little difference between the two models (Pr(>F) = 0.944): the natural cubic spline fits the data about as well as the df = 4 B-spline.
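Unlike bs(), ns() constrains the fit to be linear beyond the boundary knots, which tends to stabilize behavior at the edges of the data; with df = 6 (and no intercept term in the basis) it places df - 1 = 5 interior knots at quantiles of X.

# Interior knots chosen by ns(); the fit is linear outside range(X)
attr(ns(X, df = 6), "knots")
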
a) Calculate 85% confidence intervals for all data points

# The same predict() call can also produce prediction intervals (set interval = "prediction")
conf_interval2 <- predict(cubic, interval = "confidence", level = 0.85)
conf_interval2
##          fit        lwr       upr
## 1  0.1813098 0.06231097 0.3003086
## 2  0.4105225 0.33124873 0.4897963
## 3  0.5870989 0.52131612 0.6528817
## 4  0.7121712 0.64428747 0.7800550
## 5  0.9662633 0.88576389 1.0467628
## 6  1.0809850 1.00264919 1.1593208
## 7  1.2350628 1.16583929 1.3042863
## 8  1.2874445 1.21642980 1.3584592
## 9  1.4309811 1.33798159 1.5239806
## 10 1.5378558 1.45190860 1.6238029
## 11 1.6544810 1.58615440 1.7228076
## 12 1.7389925 1.66827392 1.8097112
## 13 1.7836864 1.71003464 1.8573381
## 14 1.8252945 1.75235358 1.8982354
## 15 1.8546607 1.78385263 1.9254689
## 16 1.9689915 1.89076504 2.0472179
## 17 2.0340475 1.94481569 2.1232793
## 18 2.1063603 2.02573978 2.1869809
## 19 2.1521916 2.07838469 2.2259985
## 20 2.2358039 2.14908546 2.3225223
## 21 2.2633765 2.17576106 2.3509919
## 22 2.2890786 2.22021602 2.3579413
## 23 2.2944607 2.22821502 2.3607063
## 24 2.3023578 2.21711962 2.3875961
## 25 2.3065215 2.19394116 2.4191019
b) Create a scatter plot comparing Velocity and Output, and on this plot, display the confidence intervals and predicted values for all data points.
plot(X, Y, xlab = "Velocity", main = "Natural Cubic Spline", ylab = "Output", cex = .6)
lines(X, cubic$fitted.values, col = "blue", lwd = 1)
lines(X, conf_interval2[,2], col = "red", lty = 2)
lines(X, conf_interval2[,3], col = "red", lty = 2)
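
A purely cosmetic alternative: the confidence band can be shaded instead of drawn as dashed lines, ideally right after the plot() call so the fitted line stays on top (the semi-transparent fill keeps it visible either way).

# Shade the 85% confidence band using the lower and upper bounds
polygon(c(X, rev(X)), c(conf_interval2[, 2], rev(conf_interval2[, 3])),
        col = adjustcolor("red", alpha.f = 0.15), border = NA)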