| Course: MDS 503 (Statistical Computing with R) |
| Student: Your name (roll number) |
| Teacher: Shital Bhandary (Associate Professor) |
| School: School of Mathematical Sciences, IOST, TU |
Task:
1. Download the Individual recode file from: https://dhsprogram.com/data/Download-Model-Datasets.cfm
2. Read it in R Studio and split it into training (80%) and testing (20%) datasets with set.seed as your class roll number
3. Fit a supervised regression model on the training data with Total Children Ever Born (V201) as dependent variable and age group (V013), region (V024), type of place of residence (V025), highest education level (V106) and wealth index (V190) as independent variables and interpret the result carefully, check VIF too and do the needful statistically if required
4. Get the R-square and RMSE of this fitted model on training data using caret package
5. Predict the dependent variable on the test data and get the R-square and RMSE using caret package
6. Tune the R-square and RMSE values of the testing model using LOOCV, k-fold cross validation and k-fold cross-validation with repeated samples using caret package
7. Compare the R-square and RMSE of all the models and choose the one for final prediction
# haven provides read_sav() for SPSS .sav files.
library(haven)

# Load the DHS individual recode file into `sav_data`.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running on another machine.
sav_data <- read_sav(
  file = "D:/R programming runs/Assisgnments/Projects/Presentation/ZZIR62FL.SAV"
)
Splitting the data into training (80%) and testing (20%) datasets
# Fix the random seed so the train/test split is reproducible.
set.seed(16)

# Draw a group label (1 or 2) for every row of `sav_data`, with probability
# 0.8 for group 1 and 0.2 for group 2. Labels are drawn independently per row,
# so the resulting split is approximately (not exactly) 80/20.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
ind <- sample(2, nrow(sav_data), replace = TRUE, prob = c(0.8, 0.2))

# Rows labelled 1 form the training set; rows labelled 2 form the test set.
train_data <- sav_data[ind == 1, ]
test_data <- sav_data[ind == 2, ]
library(caret)

# Support vector regression (radial basis kernel) of total children ever born
# (V201) on age group (V013), region (V024), type of place of residence
# (V025), highest education level (V106) and wealth index (V190).
# Predictors are centered and scaled before fitting.
#
# The resampling control must be passed as `trControl`. A misspelled name
# such as `trCtrl` is not matched to that argument, so caret silently falls
# back to its default bootstrap resampling. The bootstrap (25 resamples) is
# stated explicitly here so the code matches the resampling reported in the
# printed summary.
model_svm <- train(
  V201 ~ V013 + V024 + V025 + V106 + V190,
  data = train_data,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "boot", number = 25)
)

# Print the resampling results and the selected tuning parameters.
model_svm
## Support Vector Machines with Radial Basis Function Kernel
##
## 6713 samples
## 5 predictor
##
## Pre-processing: centered (5), scaled (5)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 6713, 6713, 6713, 6713, 6713, 6713, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.634559 0.6370052 1.163003
## 0.50 1.637227 0.6358248 1.164933
## 1.00 1.641904 0.6338846 1.167872
##
## Tuning parameter 'sigma' was held constant at a value of 0.2161798
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.2161798 and C = 0.25.
Finding R-square and RMSE for training data
# Predictions of the SVM model for the rows it was trained on.
pred_tr_svm <- predict(model_svm, newdata = train_data)

# One-row data frame holding the training-data fit indices.
fit_ind_tr_svm <- data.frame(
  R2 = R2(pred = pred_tr_svm, obs = train_data[["V201"]]),
  RMSE = RMSE(pred = pred_tr_svm, obs = train_data[["V201"]])
)
Again, to find R square and RMSE for test data
# Predictions of the SVM model for the held-out test rows.
pred_tst_svm <- predict(model_svm, newdata = test_data)

# One-row data frame holding the test-data fit indices.
fit_ind_tst_svm <- data.frame(
  R2 = R2(pred = pred_tst_svm, obs = test_data[["V201"]]),
  RMSE = RMSE(pred = pred_tst_svm, obs = test_data[["V201"]])
)
Comparison table of test & train data
# Side-by-side view of the SVM fit indices on the training and test data
# (printed, not stored).
data.frame(
  Model = c("SVM Train", "SVM Test"),
  R2 = c(fit_ind_tr_svm[["R2"]], fit_ind_tst_svm[["R2"]]),
  RMSE = c(fit_ind_tr_svm[["RMSE"]], fit_ind_tst_svm[["RMSE"]])
)
## Model R2 RMSE
## 1 SVM Train 0.6417504 1.619539
## 2 SVM Test 0.6254930 1.646716
The R-square of the SVM model on the test data is 0.625493, which means
that the independent variables explain 62.55% of the variance in the
dependent variable on the test data.
Here, the differences in RMSE and R-square between the training and test
data are each less than 5 percent, so we can say that
there is no over-fitting or under-fitting in the model.
# Linear regression on the training data, with performance estimated by
# leave-one-out cross-validation.
tcr_loocv <- trainControl(method = "LOOCV")
model_loocv <- train(
  V201 ~ V013 + V024 + V025 + V106 + V190,
  data = train_data,
  method = "lm",
  trControl = tcr_loocv
)

# Fit indices of this model on the held-out test data.
pred_tst_loocv <- predict(model_loocv, newdata = test_data)
fit_ind_tst_loocv <- data.frame(
  R2 = R2(pred = pred_tst_loocv, obs = test_data[["V201"]]),
  RMSE = RMSE(pred = pred_tst_loocv, obs = test_data[["V201"]])
)
LOOCV resampling results on the training data & fit indices on the test data
# Print the caret summary of the linear model evaluated with LOOCV.
print(model_loocv)
## Linear Regression
##
## 6713 samples
## 5 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 6712, 6712, 6712, 6712, 6712, 6712, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.665501 0.6210293 1.191331
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Show the test-data R2 and RMSE of the LOOCV-evaluated linear model.
fit_ind_tst_loocv
## R2 RMSE
## 1 0.6087274 1.683227
# Linear regression on the training data, with performance estimated by
# 10-fold cross-validation.
tcr_cv <- trainControl(method = "cv", number = 10)
model_cv <- train(
  V201 ~ V013 + V024 + V025 + V106 + V190,
  data = train_data,
  method = "lm",
  trControl = tcr_cv
)

# Fit indices of this model on the held-out test data.
pred_tst_cv <- predict(model_cv, newdata = test_data)
fit_ind_tst_cv <- data.frame(
  R2 = R2(pred = pred_tst_cv, obs = test_data[["V201"]]),
  RMSE = RMSE(pred = pred_tst_cv, obs = test_data[["V201"]])
)
10-fold CV resampling results on the training data & fit indices on the test data
# Print the caret summary of the linear model evaluated with 10-fold CV.
print(model_cv)
## Linear Regression
##
## 6713 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 6042, 6042, 6042, 6041, 6041, 6041, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.664859 0.6217407 1.191679
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Show the test-data R2 and RMSE of the 10-fold-CV-evaluated linear model.
fit_ind_tst_cv
## R2 RMSE
## 1 0.6087274 1.683227
# Linear regression on the training data, with performance estimated by
# 10-fold cross-validation repeated 3 times.
tcr_rep_cv <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
model_rep_cv <- train(
  V201 ~ V013 + V024 + V025 + V106 + V190,
  data = train_data,
  method = "lm",
  trControl = tcr_rep_cv
)

# Fit indices of this model on the held-out test data.
pred_tst_rep_cv <- predict(model_rep_cv, newdata = test_data)
fit_ind_tst_rep_cv <- data.frame(
  R2 = R2(pred = pred_tst_rep_cv, obs = test_data[["V201"]]),
  RMSE = RMSE(pred = pred_tst_rep_cv, obs = test_data[["V201"]])
)
Repeated 10-fold CV resampling results on the training data & fit indices on the test data
# Print the caret summary of the linear model evaluated with repeated CV.
print(model_rep_cv)
## Linear Regression
##
## 6713 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 6041, 6042, 6043, 6042, 6041, 6041, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.664728 0.6219609 1.19127
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Show the test-data R2 and RMSE of the repeated-CV-evaluated linear model.
fit_ind_tst_rep_cv
## R2 RMSE
## 1 0.6087274 1.683227
# Collect the test-data fit indices of every model in one table.
# The three cross-validation rows come from the same linear regression
# formula fitted on the same training data, which is why their test-data
# values are identical in the printed table.
fit_indices_table <- data.frame(
  Model = c("SVM", "LOOCV", "k-fold CV", "Repeated k-fold CV"),
  R2 = c(
    fit_ind_tst_svm[["R2"]],
    fit_ind_tst_loocv[["R2"]],
    fit_ind_tst_cv[["R2"]],
    fit_ind_tst_rep_cv[["R2"]]
  ),
  RMSE = c(
    fit_ind_tst_svm[["RMSE"]],
    fit_ind_tst_loocv[["RMSE"]],
    fit_ind_tst_cv[["RMSE"]],
    fit_ind_tst_rep_cv[["RMSE"]]
  )
)
fit_indices_table
## Model R2 RMSE
## 1 SVM 0.6254930 1.646716
## 2 LOOCV 0.6087274 1.683227
## 3 k-fold CV 0.6087274 1.683227
## 4 Repeated k-fold CV 0.6087274 1.683227
The best model (the one with the highest R-square on the test data) is:
# Keep the row of the comparison table with the largest test-data R2
# (which.max returns the first such row if there are ties).
best_model <- fit_indices_table[which.max(fit_indices_table[["R2"]]), ]
print(best_model)
## Model R2 RMSE
## 1 SVM 0.625493 1.646716
Hence, the best model is the SVM, which has the highest R-square and the lowest RMSE on the test data, and it can be used for the final prediction.