library(lattice)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(bindr)
library(caret)
library(ISLR)

1) Create Simulated Data

set.seed(1)
n <- 100
t <- runif(n, 0, 5)
y <- 2*t + 3*cos(2*t) + rnorm(n,0,4)
train_data <- data.frame(y,t)
y2 <- 2*t + 3*cos(2*t) + rnorm(n,0,4)
t2 <- runif(n, 0, 5)
test_data <- data.frame(y = y2,t = t2)
ggplot() +
  geom_point(data = train_data, aes(t, y)) + 
  labs(title = "Training Data")

trainingSet <- train_data$y
testSet <- test_data$y

2) Create 12 different models with varying degrees and Plot the MSE

model1 <- lm(trainingSet~poly(t,1))
model2 <- glm(trainingSet~poly(t,2))
model3 <- glm(trainingSet~poly(t,3))
model4 <- glm(trainingSet~poly(t,4))
model5 <- glm(trainingSet~poly(t,5))
model6 <- glm(trainingSet~poly(t,6))
model7 <- glm(trainingSet~poly(t,7))
model8 <- glm(trainingSet~poly(t,8))
model9 <- glm(trainingSet~poly(t,9))
model10 <- glm(trainingSet~poly(t,10))
model11 <- glm(trainingSet~poly(t,11))
model12 <- glm(trainingSet~poly(t,12))
MSE1 <- (sum(model1$residuals * model1$residuals))/n
MSE2 <- (sum(model2$residuals * model2$residuals))/n
MSE3 <- (sum(model3$residuals * model3$residuals))/n
MSE4 <- (sum(model4$residuals * model4$residuals))/n
MSE5 <- (sum(model5$residuals * model5$residuals))/n
MSE6 <- (sum(model6$residuals * model6$residuals))/n
MSE7 <- (sum(model7$residuals * model7$residuals))/n
MSE8 <- (sum(model8$residuals * model8$residuals))/n
MSE9 <- (sum(model9$residuals * model9$residuals))/n
MSE10 <- (sum(model10$residuals * model10$residuals))/n
MSE11 <- (sum(model11$residuals * model11$residuals))/n
MSE12 <- (sum(model12$residuals * model12$residuals))/n
Model_Train_MSE <- c(MSE1, MSE2, MSE3, MSE4, MSE5, MSE6, MSE7, MSE8, MSE9, MSE10, MSE11, MSE12)
Train_MSE <- data.frame(x = 1:12, MSE = Model_Train_MSE)
ggplot()+
  geom_line(data= Train_MSE, aes(x, MSE)) +
  scale_x_continuous(breaks = c(2,4,6,8,10,12)) +
  labs(title = "Training Models")
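
For reference, the same twelve fits and training MSEs can also be produced with a short loop; this sketch is equivalent to the code above (the object name train_mse_loop is just for illustration).

# A compact equivalent of the twelve separate fits above: fit a degree-d
# polynomial for d = 1, ..., 12 and record each model's training MSE.
train_mse_loop <- sapply(1:12, function(d) {
  fit <- glm(y ~ poly(t, d), data = train_data)
  mean(fit$residuals^2)   # same as sum(residuals^2)/n
})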

Based on this graph I would use the 12th-degree polynomial because it has the lowest training MSE.

3) Why is your answer to the prior question a very bad idea to do in practice?

This plot shows that the lowest MSE comes from the 12th-degree polynomial model. However, this is a very bad idea in practice because the MSE plotted is the training MSE. In practice we want to minimize the test MSE, which measures how well the model predicts new data rather than how well it fits the "old" data it was trained on.
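
One standard way to make this precise is the bias-variance decomposition of the expected test MSE at a new point t0 (quoted here for context; it is not derived in this lab):

$$
E\left[(y_0 - \hat{f}(t_0))^2\right] = \mathrm{Var}\!\left(\hat{f}(t_0)\right) + \left[\mathrm{Bias}\!\left(\hat{f}(t_0)\right)\right]^2 + \mathrm{Var}(\varepsilon)
$$

The last term is the irreducible error, which no choice of model can remove; driving the training MSE toward zero only inflates the variance term when we predict new data.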

4) What is the theoretical lowest MSE that could be achieved, and how does that compare to your graph from 2)?

The theoretical lowest expected test MSE is the irreducible error of the process that generated the data. In this case that floor is 16, the variance of the Normal(0, sd = 4) noise added to the signal. Our graph from 2), however, shows training MSEs below 16, because the higher-degree polynomials are flexible enough to chase the random noise in addition to the underlying 2t + 3cos(2t) signal.
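
As a rough check, using the data-generating code from part 1), the MSE of the true signal on our training sample should be close to that floor:

# MSE of the true regression function 2t + 3cos(2t) on the training data;
# this estimates the irreducible error, which should be near 4^2 = 16.
true_signal <- 2*train_data$t + 3*cos(2*train_data$t)
mean((train_data$y - true_signal)^2)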

5) Plot the Test MSE and compare the Test and Training MSE

Test_Predict1 <- predict(model1, newdata = data.frame(t = test_data$t))
Test_Predict2 <- predict(model2, newdata = data.frame(t = test_data$t))
Test_Predict3 <- predict(model3, newdata = data.frame(t = test_data$t))
Test_Predict4 <- predict(model4, newdata = data.frame(t = test_data$t))
Test_Predict5 <- predict(model5, newdata = data.frame(t = test_data$t))
Test_Predict6 <- predict(model6, newdata = data.frame(t = test_data$t))
Test_Predict7 <- predict(model7, newdata = data.frame(t = test_data$t))
Test_Predict8 <- predict(model8, newdata = data.frame(t = test_data$t))
Test_Predict9 <- predict(model9, newdata = data.frame(t = test_data$t))
Test_Predict10 <- predict(model10, newdata = data.frame(t = test_data$t))
Test_Predict11 <- predict(model11, newdata = data.frame(t = test_data$t))
Test_Predict12 <- predict(model12, newdata = data.frame(t = test_data$t))

Test_MSE1 <- sum((testSet- Test_Predict1 )^2)/n
Test_MSE2 <- sum((testSet- Test_Predict2 )^2)/n
Test_MSE3 <- sum((testSet- Test_Predict3 )^2)/n
Test_MSE4 <- sum((testSet- Test_Predict4 )^2)/n
Test_MSE5 <- sum((testSet- Test_Predict5 )^2)/n
Test_MSE6 <- sum((testSet- Test_Predict6 )^2)/n
Test_MSE7 <- sum((testSet- Test_Predict7 )^2)/n
Test_MSE8 <- sum((testSet- Test_Predict8 )^2)/n
Test_MSE9 <- sum((testSet- Test_Predict9 )^2)/n
Test_MSE10 <- sum((testSet- Test_Predict10 )^2)/n
Test_MSE11 <- sum((testSet- Test_Predict11 )^2)/n
Test_MSE12 <- sum((testSet- Test_Predict12 )^2)/n

All_Test_MSE <- c(Test_MSE1, Test_MSE2, Test_MSE3, Test_MSE4, Test_MSE5, Test_MSE6, Test_MSE7, Test_MSE8, Test_MSE9, Test_MSE10, Test_MSE11, Test_MSE12)
Test_MSE <- data.frame(x = 1:12, MSE = All_Test_MSE)
ggplot() +
  geom_line(data = Train_MSE,aes(x,MSE, color = "Training Model")) +
  geom_line(data = Test_MSE, aes(x,MSE, color = "Test Model")) +
  scale_x_continuous(breaks = c(2,4,6,8,10,12)) +
  labs(x = "Degree Polynomial", y= "MSE", title = "Training Vs. Test MSE") +
  scale_color_manual(values = c("red", "blue"))

This graph shows that the training and test MSE both improve at first, but after roughly a 4th-degree model the training MSE continues to decrease while the test MSE starts to increase, meaning the higher-degree models are losing predictive power for unseen data. We can now see why the answer from question 2 is a bad idea in practice: the model with the lowest training MSE is not a good predictor of unseen data.
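
The degree that minimizes each curve can also be read off programmatically from the vectors computed above:

# Degree with the lowest training MSE vs. degree with the lowest test MSE.
which.min(Model_Train_MSE)   # the training MSE keeps falling, so this is 12
which.min(All_Test_MSE)      # the degree that actually predicts new data best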

6) Cross-Validation

lm_model <- train(y~t, train_data, method = "lm")
lm_model$results
##   intercept     RMSE  Rsquared    RMSESD RsquaredSD
## 1      TRUE 4.406824 0.3264482 0.3879084  0.1002237
regressControl <- trainControl(method = "cv", number = 10)
cv_lm_model <- train(y~t, train_data, method = "lm", trControl = regressControl)
cv_lm_model$results
##   intercept     RMSE  Rsquared    RMSESD RsquaredSD
## 1      TRUE 4.226657 0.3389637 0.8824554  0.2239866

In this run, using CV gives a slightly lower RMSE than the default fit. The two estimates differ because CV splits the data into 10 folds, trains the model on all but one fold, computes the error on the held-out fold, and then averages those errors across the folds. When we do not specify CV, caret's default is bootstrap resampling: the model is repeatedly refit on bootstrap samples and scored on the observations left out of each sample, and those errors are averaged instead. We would generally trust the CV estimate more here because every observation is used exactly once for validation, so the result depends less on any single random resample.
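
To make the mechanics concrete, here is a minimal sketch of 10-fold CV done by hand for the simple linear model. It illustrates what trainControl(method = "cv", number = 10) does for us rather than reproducing caret's exact numbers; the seed and fold assignment below are my own.

# Hand-rolled 10-fold CV for y ~ t: assign each row to a fold, train on the
# other nine folds, score on the held-out fold, then average the fold MSEs.
set.seed(2)   # illustrative seed, not from the lab
folds <- sample(rep(1:10, length.out = nrow(train_data)))
fold_mse <- sapply(1:10, function(k) {
  fit  <- lm(y ~ t, data = train_data[folds != k, ])
  pred <- predict(fit, newdata = train_data[folds == k, ])
  mean((train_data$y[folds == k] - pred)^2)
})
sqrt(mean(fold_mse))   # comparable to the RMSE caret reports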

regressControl <- trainControl(method = "repeatedcv", number = 10, repeats= 5)
repeat_lm <- train(y~t, train_data, method = "lm", trControl = regressControl)
repeat_lm$results
##   intercept     RMSE  Rsquared    RMSESD RsquaredSD
## 1      TRUE 4.261726 0.3516865 0.7657623  0.1780799
cv_MSE <- numeric(12)
cv_Test_MSE <- numeric(12)
for(i in 1:12){
  regressControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
  f <- bquote(y ~ poly(t, .(i)))
  models <- train(as.formula(f), train_data, trControl = regressControl, method = "glm")
  cv_MSE[i] <- (models$results$RMSE)^2
  Test_Predict <- predict(models, newdata = test_data)
  cv_Test_MSE[i] <- sum((test_data$y - Test_Predict)^2)/n
}
CV_MSE_df <- data.frame(x=1:12, MSE = cv_MSE)
CV_Test_MSE_df <- data.frame(x= 1:12, MSE = cv_Test_MSE)
ggplot() +
  geom_line(data= CV_MSE_df, aes(x, MSE)) +
  scale_x_continuous(breaks= c(2,4,6,8,10,12)) +
  labs(x = "Degree of Model", title = "CV Models")

Based on this graph, the model that predicts our data best is the 5th-degree polynomial.

7) Make a graph like 5) that shows the Cross-Validation MSE and the Test MSE on the same graph

ggplot() +
  geom_line(data = CV_MSE_df,aes(x,MSE, color = "CV Model")) +
  geom_line(data = CV_Test_MSE_df, aes(x,MSE, color = "Test Model")) +
  scale_x_continuous(breaks = c(2,4,6,8,10,12)) +
  labs(x = "Degree Polynomial", y= "MSE", title = "CV Vs. Test MSE") +
  scale_color_manual(values = c("red", "blue"))

8) Why is the test MSE the same in questions 5) and 7)?

These two test MSEs are the same because the final fitted model is identical in both cases: cross-validation only changes how we estimate the error, while the final model caret keeps is still the polynomial fit to the entire training set. Since the fitted coefficients and the test data are the same, the test predictions, and therefore the test MSEs, are the same.
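
A quick sanity check of this claim, fitting one extra degree-2 model just for illustration (the object name cv_fit_2 is mine):

# caret's final model is a glm fit to the full training set, so its test
# predictions should match the directly fitted degree-2 model from part 5.
cv_fit_2 <- train(y ~ poly(t, 2), train_data, method = "glm",
                  trControl = trainControl(method = "cv", number = 10))
all.equal(unname(predict(cv_fit_2, newdata = test_data)),
          unname(predict(model2, newdata = data.frame(t = test_data$t))))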

9) Repeat question 6), going up to a 20th-degree polynomial. Describe what happens to the resulting cross-validation error. Why do you think this is happening?

cv_MSE2 <- numeric(20)
for(i in 1:20){
  regressControl <- trainControl(method='repeatedcv', number = 10, repeats = 5)
  f <- bquote(y ~ poly(t, .(i)))
  models <- train(as.formula(f), train_data, trControl=regressControl, method='glm')
  cv_MSE2[i] <- (models$results$RMSE)^2
}
Large_CV_df <- data.frame(x= 1:20, MSE = cv_MSE2)
ggplot() +
  geom_line(data = Large_CV_df, aes(x, MSE)) +
  labs(x = "Degree of Model", title = "CV Models")

This shows that once the polynomial degree gets large enough, the cross-validation error blows up: the models become so flexible that they fit the noise in the training folds, so they overfit and predict the held-out folds very poorly, and the CV MSE grows extremely fast.
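
Because the largest degrees dominate the vertical scale, it can help to cap the y-axis to see the behavior of the lower-degree models (the ylim value here is an arbitrary choice of mine):

# Same CV curve, with the y-axis capped so the low-degree region is visible.
ggplot() +
  geom_line(data = Large_CV_df, aes(x, MSE)) +
  coord_cartesian(ylim = c(0, 50)) +
  labs(x = "Degree of Model", title = "CV Models (zoomed)")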

10) Explore the effect of choice k in a k-fold cross validation

K_fold_MSE <- numeric(98)
K_fold_test_MSE <- numeric(98)
for(k in 3:100){
  regressControl <- trainControl(method = "repeatedcv", number = k, repeats = 1)
  models <- train(y~poly(t,5), train_data, trControl = regressControl, method = "glm")
  Test_Predict <- predict(models, newdata = test_data)
  K_fold_MSE[k-2] <- (models$results$RMSE)^2
  K_fold_test_MSE[k-2] <- sum((test_data$y - Test_Predict)^2)/n
}
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## (This warning repeats for each of the larger values of k, where some folds
## are too small for all performance measures to be computed.)
K_fold_df <- data.frame(x= 3:100, Training= K_fold_MSE, Test = K_fold_test_MSE)

ggplot() +
  geom_line(data = K_fold_df,aes(x,Training, color = "CV Training Model")) +
  geom_line(data = K_fold_df, aes(x,Test, color = "Test Model")) +
  scale_x_continuous(breaks = c(10,20,30,40,50,60,70,80,90,100)) +
  labs(x = "K of K-Fold Cross Validation", y= "MSE", title = "K-Fold MSE Vs. Test MSE") +
  scale_color_manual(values = c("red", "blue"))

We can see that as K increases the observed CV MSE drifts away from the test MSE and becomes more erratic. With very large K the training folds overlap almost completely, so the fold-level error estimates are highly correlated; averaging highly correlated estimates does little to stabilize the result, and the CV MSE we observe becomes a less reliable estimate of the true test error.
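
For reference, the stored values at a few common choices of k can be pulled out of the data frame built above:

# CV estimate vs. test MSE at k = 5, k = 10, and the largest k we tried.
K_fold_df[K_fold_df$x %in% c(5, 10, 100), ]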

11) What are the key takeaways of this lab?

This lab reinforced the ideas behind training MSE and test MSE, as well as why we use cross-validation. We saw that we could build models whose training MSE dropped below the level of the random noise. However, those models would not be preferred, because while the higher-degree polynomials had the lowest training MSEs, they also exhibited extremely high test MSEs; in other words, they did not predict the new data points accurately.

We also looked into cross-validation and repeated cross-validation. Cross-validation gave a more trustworthy error estimate than a single resampling run because every observation is used for validation exactly once, and repeating the cross-validation several times averages over many different random partitions, which further steadies the estimate. Even so, the CV estimate is not a perfect stand-in for the test MSE: when we plotted the CV MSE and the test MSE on the same graph, the test MSE was still higher, and where repeated CV recommended a 5th-degree polynomial, the test MSE pointed to a 4th-degree model as the best predictor.

Finally, we looked at how the value of K in K-fold cross-validation affects the results. As K grew toward the number of observations, the observed CV MSE moved farther and farther from the test MSE: the training folds become nearly identical, the fold-level errors become highly correlated, and the averaged estimate becomes unreliable. This is why, when doing K-fold cross-validation, we tend to keep the number of folds at either 5 or 10, which also happen to lie near two of the peaks in our graph of the different values of k and their corresponding MSEs. In conclusion, cross-validation is a really important and powerful tool for finding an accurate predictive model. We just have to remember that while a low training MSE is necessary, the test MSE is the measurement we really care about when choosing a predictive model.