Statistical Computing with R (Presentation)
Course: MDS 503 (Statistical Computing with R)
Student: Krishna Gaire (15)
Teacher: Shital Bhandary (Associate Professor)
School: School of Mathematical Sciences, IOST, TU
library(haven)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
#install.packages("lattice")
library(car)
## Loading required package: carData
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data <- read_sav("ZZIR62FL.SAV")
head(data)
## # A tibble: 6 × 4,275
## CASEID V000 V001 V002 V003 V004 V005 V006 V007 V008 V009 V010
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 " 1… ZZ6 1 1 2 1 1.06e6 6 2015 1386 6 1985
## 2 " 1… ZZ6 1 3 2 1 1.06e6 6 2015 1386 4 1993
## 3 " 1… ZZ6 1 4 2 1 1.06e6 6 2015 1386 4 1973
## 4 " 1… ZZ6 1 4 3 1 1.06e6 6 2015 1386 7 1989
## 5 " 1… ZZ6 1 5 1 1 1.06e6 6 2015 1386 2 1990
## 6 " 1… ZZ6 1 6 2 1 1.06e6 6 2015 1386 2 1978
## # ℹ 4,263 more variables: V011 <dbl>, V012 <dbl>, V013 <dbl+lbl>,
## # V014 <dbl+lbl>, V015 <dbl+lbl>, V016 <dbl>, V017 <dbl>, V018 <dbl+lbl>,
## # V019 <dbl+lbl>, V019A <dbl+lbl>, V020 <dbl+lbl>, V021 <dbl>, V022 <dbl>,
## # V023 <dbl+lbl>, V024 <dbl+lbl>, V025 <dbl+lbl>, V026 <dbl+lbl>, V027 <dbl>,
## # V028 <dbl>, V029 <dbl>, V030 <dbl>, V031 <dbl>, V032 <dbl>, V034 <dbl+lbl>,
## # V040 <dbl>, V042 <dbl+lbl>, V044 <dbl+lbl>, V101 <dbl+lbl>, V102 <dbl+lbl>,
## # V103 <dbl+lbl>, V104 <dbl+lbl>, V105 <dbl+lbl>, V106 <dbl+lbl>, …
# Set the seed using your class roll number
set.seed(15)
# Select the relevant columns only
df <- data[c('V201', 'V013', 'V024', 'V025', 'V106', 'V190')]
head(df)
## # A tibble: 6 × 6
## V201 V013 V024 V025 V106 V190
## <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 6 4 [30-34] 2 [Region 2] 2 [Rural] 0 [No education] 1 [Poorest]
## 2 1 2 [20-24] 2 [Region 2] 2 [Rural] 2 [Secondary] 3 [Middle]
## 3 2 6 [40-44] 2 [Region 2] 2 [Rural] 0 [No education] 3 [Middle]
## 4 3 3 [25-29] 2 [Region 2] 2 [Rural] 1 [Primary] 3 [Middle]
## 5 4 3 [25-29] 2 [Region 2] 2 [Rural] 2 [Secondary] 2 [Poorer]
## 6 2 5 [35-39] 2 [Region 2] 2 [Rural] 0 [No education] 3 [Middle]
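# Note: the labelled survey codes (V024, V025, V106, V190) are used as numeric
# predictors in the models below. As an aside (a sketch, not used in the rest of
# this analysis), haven::as_factor() would convert them to factors instead:
df_factor <- df %>% mutate(across(c(V024, V025, V106, V190), as_factor))
str(df_factor)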
# Split the data into training (80%) and testing (20%) datasets
ind <- sample(2, nrow(df), replace = TRUE, prob = c(0.8, 0.2))
train_data <- df[ind==1,]
test_data <- df[ind==2,]
qqnorm(df$V201)
qqPlot(df$V201)
## [1] 240 792
# The Q-Q plots show that the target variable V201 departs markedly from normality, so an ordinary linear regression may not be the most suitable model; it is fitted below mainly as a baseline for comparison.
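# As a quick complementary check of the target's distribution (a sketch added
# here, not part of the original output):
hist(df$V201, main = "Distribution of V201 (children ever born)", xlab = "V201")
summary(df$V201)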
# Fit the regression model on the training data
model <- lm(V201 ~ ., data = train_data)
# Print the model summary
summary(model)
##
## Call:
## lm(formula = V201 ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.0789 -0.8698 -0.0290 0.8820 9.8116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09442 0.14956 0.631 0.5278
## V013 0.96353 0.01164 82.804 < 2e-16 ***
## V024 0.01695 0.01857 0.913 0.3613
## V025 0.15440 0.05548 2.783 0.0054 **
## V106 -0.41992 0.02600 -16.151 < 2e-16 ***
## V190 -0.13680 0.01974 -6.929 4.63e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.67 on 6632 degrees of freedom
## Multiple R-squared: 0.6179, Adjusted R-squared: 0.6177
## F-statistic: 2145 on 5 and 6632 DF, p-value: < 2.2e-16
# Multiple R-squared: the model explains approximately 61.79% of the variance in the dependent variable (V201).
# F-statistic: the F-statistic is 2145 on 5 and 6632 DF with a p-value < 2.2e-16, indicating that the model as a whole is statistically significant.
# Calculate the Variance Inflation Factor VIF for each independent variable
vif <- car::vif(model)
# Print the VIF values
vif
## V013 V024 V025 V106 V190
## 1.214196 1.064079 1.771193 1.466779 1.770965
# In this case, all the VIF values are below 2, well under the usual cutoff of 5, indicating no significant multicollinearity among the independent variables.
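# A quick programmatic check against the usual rule of thumb (a VIF above 5 is
# the threshold assumed here; this check was not part of the original output):
any(vif > 5)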
# Use caret package to calculate R-squared and RMSE on the training data
train_preds <- predict(model, newdata = train_data)
train_r2 <- R2(pred = train_preds, obs = train_data$V201)
(train_r2)
## [1] 0.617947
train_rmse <- RMSE(pred = train_preds, obs = train_data$V201)
(train_rmse)
## [1] 1.668997
# On the training data, around 61.79% of the variance in the total number of children ever born (V201) is explained by the independent variables in the model, with an RMSE of about 1.67 children.
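# For transparency, the caret metrics can be reproduced by hand (a minimal
# sketch; caret's R2() defaults to the squared correlation between predictions
# and observations):
sqrt(mean((train_preds - train_data$V201)^2))   # RMSE computed directly
cor(train_preds, train_data$V201)^2             # R-squared as squared correlation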
# Predict the dependent variable on the test data
test_preds <- predict(model, newdata = test_data)
# Calculate R-squared and RMSE on the test data
test_r2 <- R2(pred = test_preds, obs = test_data$V201)
(test_r2)
## [1] 0.6243418
test_rmse <- RMSE(pred = test_preds, obs = test_data$V201)
(test_rmse)
## [1] 1.662885
# The R-squared on the test data is approximately 0.6243, indicating that around 62.43% of the variance in the total number of children ever born (V201) is explained by the independent variables in the model; the test RMSE (about 1.66) is close to the training RMSE, so the model does not appear to overfit.
# Define the training control with LOOCV, k-fold cross-validation, and repeated k-fold cross-validation
loocv_control <- trainControl(method = "LOOCV")
kfold_control <- trainControl(method = "cv", number = 10)
repeated_kfold_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Use caret package to tune the model and calculate R-squared and RMSE using cross-validation
model_loocv <- train(V201 ~., data = train_data, method = "lm", trControl = loocv_control)
model_kfold <- train(V201 ~., data = train_data, method = "lm", trControl = kfold_control)
model_repeated_kfold <- train(V201 ~., data = train_data, method = "lm", trControl = repeated_kfold_control)
# Predict on the test data
loocv_preds <- predict(model_loocv, newdata = test_data)
kfold_preds <- predict(model_kfold, newdata = test_data)
repeated_kfold_preds <- predict(model_repeated_kfold, newdata = test_data)
# Calculate R-squared for the test data
loocv_r2 <- R2(loocv_preds, test_data$V201)
kfold_r2 <- R2(kfold_preds, test_data$V201)
repeated_kfold_r2 <- R2(repeated_kfold_preds, test_data$V201)
# Calculate RMSE for the test data
loocv_rmse <- RMSE(loocv_preds, test_data$V201)
kfold_rmse <- RMSE(kfold_preds, test_data$V201)
repeated_kfold_rmse <- RMSE(repeated_kfold_preds, test_data$V201)
# Create a data frame to compare R-squared and RMSE values
results <- data.frame(Method = c("LOOCV", "k-fold CV", "Repeated k-fold CV"),
R_squared = c(loocv_r2, kfold_r2, repeated_kfold_r2),
RMSE = c(loocv_rmse, kfold_rmse, repeated_kfold_rmse))
results
## Method R_squared RMSE
## 1 LOOCV 0.6243418 1.662885
## 2 k-fold CV 0.6243418 1.662885
## 3 Repeated k-fold CV 0.6243418 1.662885
# All three cross-validation strategies report identical R-squared and RMSE on the test data because caret refits the same linear model on the full training data in every case; the resampling method only changes the internal performance estimates, not the final coefficients.
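# The resampling-based estimates themselves can be inspected directly; they
# differ slightly across methods even though the final fitted model is the same
# (a sketch using caret's standard results tables):
model_loocv$results
model_kfold$results
model_repeated_kfold$results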
library(gbm)
## Loaded gbm 2.1.8.1
gbm_model <- gbm(V201 ~ ., data = train_data, n.trees = 1000, interaction.depth = 4, shrinkage = 0.01)
## Distribution not specified, assuming gaussian ...
# Predict the dependent variable on the training data
train_preds_gbm <- predict(gbm_model, newdata = train_data, n.trees = 1000)
# Calculate R-squared and RMSE on the training data
train_r2_gbm <- R2(pred = train_preds_gbm, obs = train_data$V201)
train_rmse_gbm <- RMSE(pred = train_preds_gbm, obs = train_data$V201)
# Predict the dependent variable on the test data
test_preds_gbm <- predict(gbm_model, newdata = test_data, n.trees = 1000)
# Calculate R-squared and RMSE on the test data
test_r2_gbm <- R2(pred = test_preds_gbm, obs = test_data$V201)
test_rmse_gbm <- RMSE(pred = test_preds_gbm, obs = test_data$V201)
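# gbm() warned above because no loss was specified and it assumed a gaussian
# (squared-error) distribution. Specifying it explicitly silences the warning
# (a sketch fitted to a separate object, with the same hyperparameters):
gbm_model_explicit <- gbm(V201 ~ ., data = train_data, distribution = "gaussian",
                          n.trees = 1000, interaction.depth = 4, shrinkage = 0.01)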
library(e1071)
# Fit the Support Vector Regression (SVR) model
svm_model <- svm(V201 ~ ., data = train_data, kernel = "radial", cost = 1, epsilon = 0.1)
# Predict the dependent variable on the training data
train_preds_svm <- predict(svm_model, newdata = train_data)
# Calculate R-squared and RMSE on the training data
train_r2_svm <- R2(pred = train_preds_svm, obs = train_data$V201)
train_rmse_svm <- RMSE(pred = train_preds_svm, obs = train_data$V201)
# Predict the dependent variable on the test data
test_preds_svm <- predict(svm_model, newdata = test_data)
# Calculate R-squared and RMSE on the test data
test_r2_svm <- R2(pred = test_preds_svm, obs = test_data$V201)
test_rmse_svm <- RMSE(pred = test_preds_svm, obs = test_data$V201)
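# The SVR hyperparameters (cost, epsilon) were fixed above; e1071::tune() can
# search a small grid instead (a sketch; the grid values are illustrative and
# this search is not part of the reported comparison):
svm_tuned <- tune(svm, V201 ~ ., data = train_data,
                  ranges = list(cost = c(0.5, 1, 2), epsilon = c(0.05, 0.1, 0.2)))
summary(svm_tuned)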
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit the Random Forest regression model
rf_model <- randomForest(V201 ~ ., data = train_data, ntree = 80)
# Predict the dependent variable on the training data
train_preds_rf <- predict(rf_model, newdata = train_data)
# Calculate R-squared and RMSE on the training data
train_r2_rf <- R2(pred = train_preds_rf, obs = train_data$V201)
train_rmse_rf <- RMSE(pred = train_preds_rf, obs = train_data$V201)
# Predict the dependent variable on the test data
test_preds_rf <- predict(rf_model, newdata = test_data)
# Calculate R-squared and RMSE on the test data
test_r2_rf <- R2(pred = test_preds_rf, obs = test_data$V201)
test_rmse_rf <- RMSE(pred = test_preds_rf, obs = test_data$V201)
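# Variable importance from the fitted forest (a quick diagnostic sketch, not
# part of the original comparison):
importance(rf_model)
varImpPlot(rf_model)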
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Convert the data to DMatrix format
dtrain <- xgb.DMatrix(data = as.matrix(train_data[, -1]), label = train_data$V201)
dtest <- xgb.DMatrix(data = as.matrix(test_data[, -1]), label = test_data$V201)
# Set the parameters for XGBoost regression
params <- list(objective = "reg:squarederror",
eval_metric = "rmse",
max_depth = 6,
eta = 0.1,
nrounds = 15)
# Train the XGBoost regression model
xgb_model <- xgb.train(params = params, data = dtrain,nrounds = params$nrounds)
## [00:06:14] WARNING: src/learner.cc:767:
## Parameters: { "nrounds" } are not used.
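# The warning appears because xgb.train() ignores "nrounds" inside params; the
# number of boosting rounds is already passed explicitly. An equivalent,
# warning-free call would be (a sketch, fitted to a separate object):
xgb_model_clean <- xgb.train(params = params[c("objective", "eval_metric", "max_depth", "eta")],
                             data = dtrain, nrounds = 15)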
# Predict the dependent variable on the training data
train_preds_xgb <- predict(xgb_model, newdata = dtrain)
# Calculate R-squared and RMSE on the training data
train_r2_xgb <- R2(pred = train_preds_xgb, obs = train_data$V201)
train_rmse_xgb <- RMSE(pred = train_preds_xgb, obs = train_data$V201)
# Predict the dependent variable on the test data
test_preds_xgb <- predict(xgb_model, newdata = dtest)
# Calculate R-squared and RMSE on the test data
test_r2_xgb <- R2(pred = test_preds_xgb, obs = test_data$V201)
test_rmse_xgb <- RMSE(pred = test_preds_xgb, obs = test_data$V201)
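# Gain-based feature importance from the boosted model (a sketch, not part of
# the original output):
xgb.importance(model = xgb_model)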
# Create a summary table
summary_table <- data.frame(Method = c("Linear Regression", "Gradient Boosting", "Support Vector Regression", "Random Forest", "XGBoost"),
R_squared_train = c(train_r2, train_r2_gbm, train_r2_svm, train_r2_rf, train_r2_xgb),
RMSE_train = c(train_rmse, train_rmse_gbm, train_rmse_svm, train_rmse_rf, train_rmse_xgb),
R_squared_test = c(test_r2, test_r2_gbm, test_r2_svm, test_r2_rf, test_r2_xgb),
RMSE_test = c(test_rmse, test_rmse_gbm, test_rmse_svm, test_rmse_rf, test_rmse_xgb))
# Print the summary table
print(summary_table)
## Method R_squared_train RMSE_train R_squared_test RMSE_test
## 1 Linear Regression 0.6179470 1.668997 0.6243418 1.662885
## 2 Gradient Boosting 0.6427652 1.613920 0.6400054 1.627839
## 3 Support Vector Regression 0.6414821 1.617525 0.6330116 1.644692
## 4 Random Forest 0.6005021 1.803495 0.5966535 1.818191
## 5 XGBoost 0.6462093 1.740785 0.6388200 1.765611
# Among the regression models evaluated, the gradient boosting model achieved the best test-set performance (highest test R-squared and lowest test RMSE), with XGBoost close behind on R-squared, indicating that the boosting approaches are well suited for predicting the total number of children ever born (V201) from the selected independent variables.