My methodology was as follows:

First, I split the data into three parts: a training subset, a validation subset, and a test set. To start, I divided the 1000 observations into a training set of 900 and a test set of 100; the test set contained the 100 observations with the missing y-values.

Next, I split the 900 training observations in half, putting 450 in the training subset and 450 in the validation subset (called testsubset in the code below).

After that, I used PCR to get a sense of how many predictors I needed. The cross-validated MSEP was already fairly low at around 15 components, so I moved on to forward selection to figure out which 15 of the 25 predictors I should keep in my model.
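
As an aside, the CV errors can be read off numerically instead of eyeballed from the validation plot. This is just a minimal sketch using the fit.pcr object defined further down (selectNcomp needs a reasonably recent version of pls); it is not how I made my actual choice:

cv_msep <- MSEP(fit.pcr)$val["adjCV", 1, ]   # adjusted CV MSEP for each component count
which.min(cv_msep) - 1    # minus one because the first entry is the intercept-only model
selectNcomp(fit.pcr, method = "onesigma")    # one-standard-error rule for a more parsimonious pick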

Finally, I moved on to GAMs, using the 15 variables that forward selection told me to use. I recomputed the validation MSE for this model about 200 times with different term types (smoothing splines, polynomials, natural splines, linear terms), guided by the plots of each fitted gam with its residuals, and brought the MSE down from 3788.148 for PCR to 3183.268 for the gam. For each predictor, I used my best guess from how its residuals looked to decide what kind of term to use for that part of the gam.

For example, the residuals for predictors x6 and x24 looked like a linear term would fit them best. Smoothing splines were generally the best call for the rest of the variables, though for a few I chose a polynomial because their residuals looked fairly quadratic.
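
A slightly more formal version of that eyeball test is to fit the gam with the term linear and again with a smoothing spline, then compare the two nested fits with an approximate F-test. A minimal sketch for x6 only, using the trainsubset defined below and a stripped-down formula rather than my full 15-variable model:

gam_lin <- gam(y ~ x6 + s(x22, df=2), data=trainsubset)           # x6 entered linearly
gam_sm  <- gam(y ~ s(x6, df=4) + s(x22, df=2), data=trainsubset)  # x6 as a smoothing spline
anova(gam_lin, gam_sm, test = "F")   # a small p-value would argue for the spline over the linear term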

Overall, I am pleased with my results because I think this is a good enough model to predict the y-values. It could be better, but I am really tired of guessing and checking using “Donner’s Method,” which I’m sure he will describe in his write-up. I would love to know if there are better ways to figure out what to use in the gam model.
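
One partial answer to my own question: the guessing and checking can be automated. The sketch below (again relying on objects defined further down) loops over a few df values for one term while holding the rest of the formula fixed, and keeps whichever gives the lowest validation MSE; repeating this predictor by predictor is basically a mechanized version of what I did by hand. The stripped-down formula is illustrative, not my actual model:

candidate_df <- 2:6
val_mse <- sapply(candidate_df, function(d) {
  fit <- gam(y ~ s(x5, df=d) + x6 + s(x22, df=2), data=trainsubset)
  mean((predict(fit, testsubset) - testsubset$y)^2)  # validation MSE at this df
})
candidate_df[which.min(val_mse)]   # df for x5 with the lowest validation MSE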

require(leaps)
## Loading required package: leaps
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
kajal_set <- read.csv("C:/Users/Kajal/Downloads/PredictiveHW4Dataset.csv")

trains <- kajal_set[1:900,]      # rows with observed y
tests  <- kajal_set[901:1000,]   # the 100 rows with missing y to predict

set.seed(1234)  # make the 450/450 split reproducible
rows_for_testing <- sort(sample(1:nrow(trains), 450))
testsubset  <- trains[rows_for_testing,]   # validation subset (used for the MSEs below)
trainsubset <- trains[-rows_for_testing,]  # training subset

matrix_kajal <- model.matrix(y ~ . - id, data=kajal_set)  # design matrix without the id column (not used further below)

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
fit.pcr <- pcr(y ~ ., data = trainsubset, scale = TRUE, validation = "CV")  # cross-validated PCR on standardized predictors
validationplot(fit.pcr, val.type = "MSEP")  # CV MSEP against number of components

predicted_pcr <- predict(fit.pcr, testsubset, ncomp = 25)
mean((predicted_pcr - testsubset$y)^2)  # validation MSE for PCR
## [1] 3788.148
fwd_kajal <- regsubsets(y ~ ., data = trainsubset, method="forward", nvmax=15)  # forward stepwise, up to 15 variables
fwd_kajal
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = trainsubset, method = "forward", 
##     nvmax = 15)
## 26 Variables  (and intercept)
##     Forced in Forced out
## x1      FALSE      FALSE
## x2      FALSE      FALSE
## x3      FALSE      FALSE
## x4      FALSE      FALSE
## x5      FALSE      FALSE
## x6      FALSE      FALSE
## x7      FALSE      FALSE
## x8      FALSE      FALSE
## x9      FALSE      FALSE
## x10     FALSE      FALSE
## x11     FALSE      FALSE
## x12     FALSE      FALSE
## x13     FALSE      FALSE
## x14     FALSE      FALSE
## x15     FALSE      FALSE
## x16     FALSE      FALSE
## x17     FALSE      FALSE
## x18     FALSE      FALSE
## x19     FALSE      FALSE
## x20     FALSE      FALSE
## x21     FALSE      FALSE
## x22     FALSE      FALSE
## x23     FALSE      FALSE
## x24     FALSE      FALSE
## x25     FALSE      FALSE
## id      FALSE      FALSE
## 1 subsets of each size up to 15
## Selection Algorithm: forward
reg.summary <- summary(fwd_kajal)
reg.summary$adjr2
##  [1] 0.3187719 0.3219716 0.3245668 0.3275092 0.3296693 0.3303949 0.3312120
##  [8] 0.3317983 0.3323932 0.3324629 0.3320556 0.3315629 0.3309030 0.3339864
## [15] 0.3333805
as.data.frame(reg.summary$outmat)
##           x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15 x16 x17 x18
## 1  ( 1 )               *                                                
## 2  ( 1 )               *  *                                             
## 3  ( 1 )               *  *                                             
## 4  ( 1 )               *  *                                             
## 5  ( 1 )   *           *  *                                             
## 6  ( 1 )   *           *  *                                             
## 7  ( 1 )   *           *  *                *                            
## 8  ( 1 )   *           *  *     *          *                            
## 9  ( 1 )   *        *  *  *     *          *                            
## 10  ( 1 )  *        *  *  *     *          *       *                    
## 11  ( 1 )  *        *  *  *     *          *       *                    
## 12  ( 1 )  *        *  *  *  *  *          *       *                    
## 13  ( 1 )  *        *  *  *  *  *          *       *                    
## 14  ( 1 )  *        *  *  *  *  *          *       *                   *
## 15  ( 1 )  *        *  *  *  *  *          *       *       *           *
##           x19 x20 x21 x22 x23 x24 x25 id
## 1  ( 1 )                                
## 2  ( 1 )                                
## 3  ( 1 )                *               
## 4  ( 1 )                *           *   
## 5  ( 1 )                *           *   
## 6  ( 1 )                *           *  *
## 7  ( 1 )                *           *  *
## 8  ( 1 )                *           *  *
## 9  ( 1 )                *           *  *
## 10  ( 1 )               *           *  *
## 11  ( 1 )               *       *   *  *
## 12  ( 1 )               *       *   *  *
## 13  ( 1 )       *       *       *   *  *
## 14  ( 1 )       *       *       *   *  *
## 15  ( 1 )       *       *       *   *  *
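
For the record, the model size that maximizes adjusted R-squared can be found programmatically rather than by scanning the printed vector, and leaps will report the variables chosen at that size. A quick sketch:

best_size <- which.max(reg.summary$adjr2)  # size with the highest adjusted R^2
coef(fwd_kajal, id = best_size)            # the variables (and coefficients) selected at that size
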
require(gam)
## Loading required package: gam
## Loading required package: splines
## Loaded gam 1.14
fit_gam_kajal <- gam(y ~ s(x1, df=2) + s(x2, df=2) + s(x5, df=3) + x6 +
                       s(x7, df=2) + s(x10, df=2) + s(x12, df=2) +
                       s(x13, df=2) + s(x15, df=2) + s(x18, df=2) +
                       s(x20, df=2) + poly(x21, 3) + s(x22, df=2) +
                       x24 + poly(x25, 2), data=trainsubset)

plot.gam(fit_gam_kajal, col = "purple", residuals = TRUE)  # each fitted term with partial residuals overlaid

predicted_kajal <- predict(fit_gam_kajal, testsubset)
mean((predicted_kajal - testsubset$y)^2)  # validation MSE for the gam
## [1] 3183.268
tests$ybesthat <- predict(fit_gam_kajal, newdata=tests)  # predictions for the 100 held-out rows
tests[,27:28]  # id alongside the predicted y-values
##        id    ybesthat
## 901   901 157.7554468
## 902   902  29.0310804
## 903   903  85.2905347
## 904   904 120.0669007
## 905   905 125.0080772
## 906   906  36.5961580
## 907   907   9.9416879
## 908   908   2.1841726
## 909   909 169.7772681
## 910   910  37.7330149
## 911   911  11.0400637
## 912   912  12.3404989
## 913   913 100.7250681
## 914   914  -0.2029631
## 915   915  28.3425736
## 916   916  28.0677444
## 917   917 156.8868301
## 918   918 125.3968542
## 919   919  60.7996615
## 920   920  70.6343462
## 921   921   9.2018495
## 922   922  33.5925091
## 923   923 119.6965338
## 924   924  25.4269978
## 925   925  97.5741147
## 926   926  66.7077177
## 927   927  52.7203080
## 928   928  61.9105838
## 929   929 101.0669689
## 930   930  -4.9312213
## 931   931  24.8623804
## 932   932   8.6942319
## 933   933   8.0792854
## 934   934   0.5980871
## 935   935 123.4390864
## 936   936  17.8061823
## 937   937 143.7279092
## 938   938  10.4307811
## 939   939  82.9485527
## 940   940 161.5055386
## 941   941  38.3857491
## 942   942  60.5957745
## 943   943 107.8182289
## 944   944   4.0028810
## 945   945  23.8627711
## 946   946  10.4475688
## 947   947  72.6796197
## 948   948  12.9489183
## 949   949  54.6965672
## 950   950  58.3674279
## 951   951 103.7064057
## 952   952  -7.9172085
## 953   953  47.0069146
## 954   954 132.9326442
## 955   955 -16.7400787
## 956   956  86.4440691
## 957   957  32.0175866
## 958   958  22.8345037
## 959   959  24.9122462
## 960   960 131.5021016
## 961   961  94.1074107
## 962   962  83.5476531
## 963   963  57.3713890
## 964   964 161.2861372
## 965   965  46.8058469
## 966   966  11.8443191
## 967   967  41.3778488
## 968   968  68.4663745
## 969   969  59.1751442
## 970   970  45.1218587
## 971   971  38.7895430
## 972   972  38.4612650
## 973   973  55.8887357
## 974   974  55.9576343
## 975   975  10.3209141
## 976   976  52.4905167
## 977   977  80.8226219
## 978   978  94.8533463
## 979   979  23.7282455
## 980   980 108.8401719
## 981   981  48.9033723
## 982   982   8.2052534
## 983   983 133.0917452
## 984   984 139.2347098
## 985   985  27.0839758
## 986   986 142.0511805
## 987   987  19.7772451
## 988   988 104.0070335
## 989   989  32.7555957
## 990   990  15.5405432
## 991   991  36.4898371
## 992   992  57.3755626
## 993   993  18.3899493
## 994   994  73.3592913
## 995   995  42.0450591
## 996   996  36.3537572
## 997   997  18.0695274
## 998   998   5.0699615
## 999   999  55.5373158
## 1000 1000 142.4204916
write.csv(tests$ybesthat, file="PredictiveHW4", fileEncoding = "macroman")  # export the predictions for submission