Course: MDS 503 (Statistical Computing with R)
Student: Your name (roll number)
Teacher: Shital Bhandary (Associate Professor)
School: School of Mathematical Sciences, IOST, TU

Task:
1. Download the Individual recode file from: https://dhsprogram.com/data/Download-Model-Datasets.cfm
2. Read it into RStudio and split it into training (80%) and testing (20%) datasets, using your class roll number as the set.seed value
3. Fit a supervised regression model on the training data with total children ever born (V201) as the dependent variable and age group (V013), region (V024), type of place of residence (V025), highest education level (V106) and wealth index (V190) as independent variables; interpret the results carefully, check the VIFs, and take any further statistical steps if required (a minimal sketch of this step follows the list)
4. Get the R-square and RMSE of the fitted model on the training data using the caret package
5. Predict the dependent variable on the test data and get the R-square and RMSE using the caret package
6. Tune the model using LOOCV, k-fold cross validation and repeated k-fold cross validation with the caret package, and report the resulting R-square and RMSE values
7. Compare the R-square and RMSE of all the models and choose one for the final prediction
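
Task 3 asks for a linear model with a VIF check, which the SVM workflow below does not show explicitly. A minimal sketch of that step, assuming the car package is installed and using the train_data object created in the split step below (vif() reports generalized VIFs if any predictor is coded as a factor):

library(car)  # provides vif()

# Task 3: linear regression on the training data
model_lm <- lm(V201 ~ V013 + V024 + V025 + V106 + V190, data = train_data)
summary(model_lm)

# VIF check: values above roughly 5-10 flag problematic multicollinearity;
# if found, drop or combine the offending predictors and refit
vif(model_lm)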

library(haven)
sav_data <- read_sav("D:/R programming runs/Assisgnments/Projects/Presentation/ZZIR62FL.SAV")
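
haven imports SPSS columns as labelled numeric vectors, which caret treats as plain numbers. A quick optional check of the variables used below:

# inspect the outcome and the five predictors before modelling
str(sav_data[, c("V201", "V013", "V024", "V025", "V106", "V190")])

# optionally strip the SPSS value labels and keep only the numeric codes
# sav_data <- haven::zap_labels(sav_data)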

Splitting the data into 80% training and 20% testing sets

set.seed(16)  # class roll number, for reproducibility
ind <- sample(2, nrow(sav_data), replace = TRUE, prob = c(0.8, 0.2))

train_data <- sav_data[ind == 1, ]  # ~80% of rows
test_data <- sav_data[ind == 2, ]   # ~20% of rows
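
Because sample() assigns each row to a group independently, the realised split is only approximately 80/20; a quick sanity check:

# share of rows in each group; should be close to 0.8 and 0.2
prop.table(table(ind))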

Support Vector Machine

library(caret)
model_svm <- train(
  V201 ~ V013 + V024 + V025 + V106 + V190,
  data = train_data,
  method = "svmRadial",
  preProcess = c("center", "scale")  # standardize predictors for the kernel
)
# with no trControl supplied, caret uses its default resampling:
# 25 bootstrap repetitions, as the output below shows
model_svm
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 6713 samples
##    5 predictor
## 
## Pre-processing: centered (5), scaled (5) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 6713, 6713, 6713, 6713, 6713, 6713, ... 
## Resampling results across tuning parameters:
## 
##   C     RMSE      Rsquared   MAE     
##   0.25  1.634559  0.6370052  1.163003
##   0.50  1.637227  0.6358248  1.164933
##   1.00  1.641904  0.6338846  1.167872
## 
## Tuning parameter 'sigma' was held constant at a value of 0.2161798
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.2161798 and C = 0.25.
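
By default caret tries only three cost values here, with sigma held fixed at its internal estimate. A wider search can be requested explicitly; in the sketch below the grid values are illustrative, and 5-fold CV replaces the default bootstrap to keep it fast:

# optional wider tuning grid for the radial SVM (illustrative values)
model_svm_tuned <- train(
  V201 ~ V013 + V024 + V025 + V106 + V190,
  data = train_data,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(sigma = 0.2161798, C = c(0.25, 0.5, 1, 2, 4)),
  trControl = trainControl(method = "cv", number = 5)
)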

Finding R-square and RMSE for the training data

pred_tr_svm <- predict(model_svm, train_data)
fit_ind_tr_svm <- data.frame(
  R2 = R2(pred_tr_svm, train_data$V201),
  RMSE = RMSE(pred_tr_svm, train_data$V201)
)

Again, the R-square and RMSE for the test data

pred_tst_svm <- predict(model_svm, test_data)

fit_ind_tst_svm <- data.frame(
  R2 = R2(pred_tst_svm, test_data$V201),
  RMSE = RMSE(pred_tst_svm, test_data$V201)
)
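
For reference, caret's R2() defaults to the squared Pearson correlation between predictions and observations, and RMSE() is the root mean squared error; written out by hand:

# what the caret helpers compute, written out by hand
cor(pred_tst_svm, test_data$V201)^2            # R2
sqrt(mean((pred_tst_svm - test_data$V201)^2))  # RMSE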

Comparison table of training and test performance

data.frame(
  Model = c("SVM Train", "SVM Test"),
  R2 = c(fit_ind_tr_svm$R2, fit_ind_tst_svm$R2),
  RMSE = c(fit_ind_tr_svm$RMSE, fit_ind_tst_svm$RMSE)
)
##       Model        R2     RMSE
## 1 SVM Train 0.6417504 1.619539
## 2  SVM Test 0.6254930 1.646716

The R-square of the SVM model on the test data is 0.6255, which means the independent variables explain about 62.55% of the variance in the dependent variable on unseen data.
The training and test values of both R-square and RMSE differ by well under 5 percent, so there is no sign of over-fitting or under-fitting in the model.
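
Concretely, the relative train-to-test gap can be computed directly from the table above:

(1.646716 - 1.619539) / 1.619539     # RMSE gap: about 1.7%
(0.6417504 - 0.6254930) / 0.6417504  # R-square gap: about 2.5%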

Cross Validation

Leave-One-Out Cross Validation (LOOCV)

tcr_loocv <- trainControl(method = "LOOCV")
model_loocv <- train(
  V201 ~ V013 + V024 + V025 + V106 + V190,
  data = train_data,
  method = "lm",
  trControl = tcr_loocv
)

pred_tst_loocv <- predict(model_loocv, test_data)

fit_ind_tst_loocv <- data.frame(
  R2 = R2(pred_tst_loocv, test_data$V201),
  RMSE = RMSE(pred_tst_loocv, test_data$V201)
)

Training resampling results and test-data fit indices

print(model_loocv)
## Linear Regression 
## 
## 6713 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 6712, 6712, 6712, 6712, 6712, 6712, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1.665501  0.6210293  1.191331
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
fit_ind_tst_loocv
##          R2     RMSE
## 1 0.6087274 1.683227
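
As an aside, LOOCV for a linear model needs no refitting at all: the leave-one-out residuals follow in closed form from the hat values (the PRESS statistic), so the sketch below should reproduce caret's LOOCV RMSE:

lm_fit <- lm(V201 ~ V013 + V024 + V025 + V106 + V190, data = train_data)
loo_resid <- residuals(lm_fit) / (1 - hatvalues(lm_fit))  # leave-one-out residuals
sqrt(mean(loo_resid^2))  # should match the LOOCV RMSE of 1.665501 above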

K-fold Cross Validation

tcr_cv <- trainControl(method = "cv", number = 10)
model_cv <- train(
  V201 ~ V013 + V024 + V025 + V106 + V190,
  data = train_data,
  method = "lm",
  trControl = tcr_cv
)

pred_tst_cv <- predict(model_cv, test_data)

fit_ind_tst_cv <- data.frame(
  R2 = R2(pred_tst_cv, test_data$V201),
  RMSE = RMSE(pred_tst_cv, test_data$V201)
)

Training resampling results and test-data fit indices

print(model_cv)
## Linear Regression 
## 
## 6713 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 6042, 6042, 6042, 6041, 6041, 6041, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1.664859  0.6217407  1.191679
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
fit_ind_tst_cv
##          R2     RMSE
## 1 0.6087274 1.683227

Repeated K-fold Cross Validation

tcr_rep_cv <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
model_rep_cv <- train(
  V201 ~ V013 + V024 + V025 + V106 + V190,
  data = train_data,
  method = "lm",
  trControl = tcr_rep_cv
)

pred_tst_rep_cv <- predict(model_rep_cv, test_data)

fit_ind_tst_rep_cv <- data.frame(
  R2 = R2(pred_tst_rep_cv, test_data$V201),
  RMSE = RMSE(pred_tst_rep_cv, test_data$V201)
)

Training resampling results and test-data fit indices

print(model_rep_cv)
## Linear Regression 
## 
## 6713 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 6041, 6042, 6043, 6042, 6041, 6041, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE    
##   1.664728  0.6219609  1.19127
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
fit_ind_tst_rep_cv
##          R2     RMSE
## 1 0.6087274 1.683227
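
Note that all three cross-validated models report identical fit indices on the test data. This is expected: the resampling scheme only changes how performance is estimated during training, while the final model caret returns is in each case the same linear regression refit on the full training data, so the test predictions coincide. A quick check of this claim:

# the three CV variants share one final lm, hence identical coefficients
all.equal(coef(model_loocv$finalModel), coef(model_cv$finalModel))
all.equal(coef(model_cv$finalModel), coef(model_rep_cv$finalModel))

Comparison table of all the models
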
fit_indices_table <- data.frame(
  Model = c("SVM", "LOOCV", "k-fold CV", "Repeated k-fold CV"),
  R2 = c(fit_ind_tst_svm$R2, fit_ind_tst_loocv$R2, fit_ind_tst_cv$R2, fit_ind_tst_rep_cv$R2),
  RMSE = c(fit_ind_tst_svm$RMSE, fit_ind_tst_loocv$RMSE, fit_ind_tst_cv$RMSE, fit_ind_tst_rep_cv$RMSE)
)

fit_indices_table
##                Model        R2     RMSE
## 1                SVM 0.6254930 1.646716
## 2              LOOCV 0.6087274 1.683227
## 3          k-fold CV 0.6087274 1.683227
## 4 Repeated k-fold CV 0.6087274 1.683227

The best model, chosen by the highest test R-square (which here also has the lowest test RMSE), is

best_model <- fit_indices_table[which.max(fit_indices_table$R2), ]  # pick by highest test R2
print(best_model)
##   Model       R2     RMSE
## 1   SVM 0.625493 1.646716

Hence, the SVM model gives the best test-data fit indices (highest R-square, lowest RMSE) and is chosen for the final prediction.
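
For completeness, a final-prediction sketch with the chosen model; new_obs is an illustrative stand-in for genuinely new records containing the five predictor columns:

# predict total children ever born for new observations with the SVM model
new_obs <- test_data[1:5, c("V013", "V024", "V025", "V106", "V190")]
predict(model_svm, newdata = new_obs)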