Statistical Computing with R (Presentation)
Course: MDS 503 (Statistical Computing with R)
Student: Krishna Gaire (15)
Teacher: Shital Bhandary (Associate Professor)
School: School of Mathematical Sciences, IOST, TU

library(haven)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
#install.packages("lattice")
library(car)
## Loading required package: carData
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data <- read_sav("ZZIR62FL.SAV")
head(data)
## # A tibble: 6 × 4,275
##   CASEID      V000   V001  V002  V003  V004   V005  V006  V007  V008  V009  V010
##   <chr>       <chr> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "        1… ZZ6       1     1     2     1 1.06e6     6  2015  1386     6  1985
## 2 "        1… ZZ6       1     3     2     1 1.06e6     6  2015  1386     4  1993
## 3 "        1… ZZ6       1     4     2     1 1.06e6     6  2015  1386     4  1973
## 4 "        1… ZZ6       1     4     3     1 1.06e6     6  2015  1386     7  1989
## 5 "        1… ZZ6       1     5     1     1 1.06e6     6  2015  1386     2  1990
## 6 "        1… ZZ6       1     6     2     1 1.06e6     6  2015  1386     2  1978
## # ℹ 4,263 more variables: V011 <dbl>, V012 <dbl>, V013 <dbl+lbl>,
## #   V014 <dbl+lbl>, V015 <dbl+lbl>, V016 <dbl>, V017 <dbl>, V018 <dbl+lbl>,
## #   V019 <dbl+lbl>, V019A <dbl+lbl>, V020 <dbl+lbl>, V021 <dbl>, V022 <dbl>,
## #   V023 <dbl+lbl>, V024 <dbl+lbl>, V025 <dbl+lbl>, V026 <dbl+lbl>, V027 <dbl>,
## #   V028 <dbl>, V029 <dbl>, V030 <dbl>, V031 <dbl>, V032 <dbl>, V034 <dbl+lbl>,
## #   V040 <dbl>, V042 <dbl+lbl>, V044 <dbl+lbl>, V101 <dbl+lbl>, V102 <dbl+lbl>,
## #   V103 <dbl+lbl>, V104 <dbl+lbl>, V105 <dbl+lbl>, V106 <dbl+lbl>, …
# Set the seed using your class roll number
set.seed(15)

# Select the relevant columns only:
# V201 = total children ever born (target), V013 = age group, V024 = region,
# V025 = type of place of residence, V106 = education level, V190 = wealth index

df <- data[c('V201', 'V013', 'V024', 'V025', 'V106', 'V190')]
head(df)
## # A tibble: 6 × 6
##    V201 V013      V024         V025      V106             V190       
##   <dbl> <dbl+lbl> <dbl+lbl>    <dbl+lbl> <dbl+lbl>        <dbl+lbl>  
## 1     6 4 [30-34] 2 [Region 2] 2 [Rural] 0 [No education] 1 [Poorest]
## 2     1 2 [20-24] 2 [Region 2] 2 [Rural] 2 [Secondary]    3 [Middle] 
## 3     2 6 [40-44] 2 [Region 2] 2 [Rural] 0 [No education] 3 [Middle] 
## 4     3 3 [25-29] 2 [Region 2] 2 [Rural] 1 [Primary]      3 [Middle] 
## 5     4 3 [25-29] 2 [Region 2] 2 [Rural] 2 [Secondary]    2 [Poorer] 
## 6     2 5 [35-39] 2 [Region 2] 2 [Rural] 0 [No education] 3 [Middle]
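# A quick structure and missingness check of the selected columns before modelling
# (a sketch added for illustration; output not shown). The haven-labelled columns
# are kept as numeric codes in the regressions below.
str(df)
colSums(is.na(df))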
# Split the data into training (80%) and testing (20%) datasets
ind <- sample(2, nrow(df), replace = TRUE, prob = c(0.8, 0.2))

train_data <- df[ind==1,]
test_data <- df[ind==2,]
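
# An equivalent split could also be done with caret's createDataPartition(),
# which stratifies on the outcome; shown only as a sketch and not used below.
idx <- createDataPartition(df$V201, p = 0.8, list = FALSE)
train_alt <- df[idx, ]
test_alt  <- df[-idx, ]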

qqnorm(df$V201)

qqPlot(df$V201)  

## [1] 240 792
# (The indices printed by qqPlot mark the two most extreme observations.)
# The Q-Q plots show that the target variable (V201) departs clearly from a normal
# distribution, so an ordinary linear regression is unlikely to satisfy its normality
# assumption; it is fitted below mainly as a baseline for comparison.
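# A more formal check can complement the Q-Q plots (a sketch; output not shown).
# Shapiro-Wilk accepts at most 5000 observations, so a random subsample is tested.
hist(df$V201, main = "Distribution of V201 (children ever born)", xlab = "V201")
shapiro.test(sample(df$V201, 5000))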

# Fit the regression model on the training data
model <- lm(V201 ~ ., data = train_data)

# Print the model summary
summary(model)
## 
## Call:
## lm(formula = V201 ~ ., data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0789 -0.8698 -0.0290  0.8820  9.8116 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.09442    0.14956   0.631   0.5278    
## V013         0.96353    0.01164  82.804  < 2e-16 ***
## V024         0.01695    0.01857   0.913   0.3613    
## V025         0.15440    0.05548   2.783   0.0054 ** 
## V106        -0.41992    0.02600 -16.151  < 2e-16 ***
## V190        -0.13680    0.01974  -6.929 4.63e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.67 on 6632 degrees of freedom
## Multiple R-squared:  0.6179, Adjusted R-squared:  0.6177 
## F-statistic:  2145 on 5 and 6632 DF,  p-value: < 2.2e-16
# Multiple R-squared: the model explains approximately 61.79% of the variance in the dependent variable on the training data.
# F-statistic: the F-statistic is 2145 on 5 and 6632 degrees of freedom with a very small p-value, indicating that the model as a whole is statistically significant.
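
# The reported R-squared can be reproduced by hand from the fitted model
# (a sketch; output not shown): R-squared = 1 - RSS/TSS on the training data.
rss <- sum(residuals(model)^2)
tss <- sum((train_data$V201 - mean(train_data$V201))^2)
1 - rss / tss   # approximately 0.618, matching summary(model)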


# Calculate the Variance Inflation Factor (VIF) for each independent variable
vif <- car::vif(model)

# Print the VIF values
vif
##     V013     V024     V025     V106     V190 
## 1.214196 1.064079 1.771193 1.466779 1.770965
# In this case, all the VIF values are well below the usual cut-off of 5 (the largest is about 1.77), indicating no serious multicollinearity among the independent variables.
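
# Each VIF equals 1 / (1 - R_j^2), where R_j^2 comes from regressing predictor j on
# the remaining predictors; a sketch for V013 (output not shown).
aux <- lm(V013 ~ V024 + V025 + V106 + V190, data = train_data)
1 / (1 - summary(aux)$r.squared)   # should match vif["V013"]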

# Use caret package to calculate R-squared and RMSE on the training data
train_preds <- predict(model, newdata = train_data)
train_r2 <- R2(pred = train_preds, obs = train_data$V201)
(train_r2)
## [1] 0.617947
train_rmse <- RMSE(pred = train_preds, obs = train_data$V201)
(train_rmse)
## [1] 1.668997
# On the training data, about 61.79% of the variance in the total number of children ever born (V201) is explained by the independent variables, with an RMSE of about 1.67.
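
# RMSE is the square root of the mean squared prediction error; caret's RMSE()
# can be verified by hand (a sketch; output not shown).
sqrt(mean((train_data$V201 - train_preds)^2))   # approximately 1.67, matching train_rmse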


# Predict the dependent variable on the test data
test_preds <- predict(model, newdata = test_data)

# Calculate R-squared and RMSE on the test data
test_r2 <- R2(pred = test_preds, obs = test_data$V201)
(test_r2)
## [1] 0.6243418
test_rmse <- RMSE(pred = test_preds, obs = test_data$V201)
(test_rmse)
## [1] 1.662885
# The R-squared on the test data is approximately 0.6243, indicating that around 62.43% of the variance in the total number of children ever born (V201) can be explained by the independent variables; the test RMSE (about 1.66) is close to the training RMSE, so there is no sign of overfitting.

# Define the training control with LOOCV, k-fold cross-validation, and repeated k-fold cross-validation
loocv_control <- trainControl(method = "LOOCV")
kfold_control <- trainControl(method = "cv", number = 10)
repeated_kfold_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Use caret package to tune the model and calculate R-squared and RMSE using cross-validation
model_loocv <- train(V201 ~., data = train_data, method = "lm", trControl = loocv_control)
model_kfold <- train(V201 ~., data = train_data, method = "lm", trControl = kfold_control)
model_repeated_kfold <- train(V201 ~., data = train_data, method = "lm", trControl = repeated_kfold_control)

# Predict on the test data
loocv_preds <- predict(model_loocv, newdata = test_data)
kfold_preds <- predict(model_kfold, newdata = test_data)
repeated_kfold_preds <- predict(model_repeated_kfold, newdata = test_data)


# Calculate R-squared for the test data
loocv_r2 <- R2(loocv_preds, test_data$V201)
kfold_r2 <- R2(kfold_preds, test_data$V201)
repeated_kfold_r2 <- R2(repeated_kfold_preds, test_data$V201)


# Calculate RMSE for the test data
loocv_rmse <- RMSE(loocv_preds, test_data$V201)
kfold_rmse <- RMSE(kfold_preds, test_data$V201)
repeated_kfold_rmse <- RMSE(repeated_kfold_preds, test_data$V201)

# Create a data frame to compare R-squared and RMSE values
results <- data.frame(Method = c("LOOCV", "k-fold CV", "Repeated k-fold CV"),
                      R_squared = c(loocv_r2, kfold_r2, repeated_kfold_r2),
                      RMSE = c(loocv_rmse, kfold_rmse, repeated_kfold_rmse))


results
##               Method R_squared     RMSE
## 1              LOOCV 0.6243418 1.662885
## 2          k-fold CV 0.6243418 1.662885
## 3 Repeated k-fold CV 0.6243418 1.662885
# All three cross-validation schemes yield the same test-set R-squared and RMSE because caret refits the same linear model on the full training data after resampling, so the final model, and hence its test predictions, are identical in each case.
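
# The resampling-based estimates themselves differ slightly between the three
# schemes and can be inspected in each train object (a sketch; output not shown).
model_loocv$results
model_kfold$results
model_repeated_kfold$results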

library(gbm)
## Loaded gbm 2.1.8.1
gbm_model <- gbm(V201 ~ ., data = train_data, n.trees = 1000, interaction.depth = 4, shrinkage = 0.01)
## Distribution not specified, assuming gaussian ...
# Predict the dependent variable on the training data
train_preds_gbm <- predict(gbm_model, newdata = train_data, n.trees = 1000)

# Calculate R-squared and RMSE on the training data
train_r2_gbm <- R2(pred = train_preds_gbm, obs = train_data$V201)
train_rmse_gbm <- RMSE(pred = train_preds_gbm, obs = train_data$V201)

# Predict the dependent variable on the test data
test_preds_gbm <- predict(gbm_model, newdata = test_data, n.trees = 1000)

# Calculate R-squared and RMSE on the test data
test_r2_gbm <- R2(pred = test_preds_gbm, obs = test_data$V201)
test_rmse_gbm <- RMSE(pred = test_preds_gbm, obs = test_data$V201)
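
# Rather than fixing 1000 trees, the optimal number of boosting iterations could be
# estimated from the out-of-bag improvement with gbm.perf(), and the relative
# influence of each predictor inspected (a sketch; output and plot not shown).
best_iter <- gbm.perf(gbm_model, method = "OOB")
summary(gbm_model, n.trees = best_iter, plotit = FALSE)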

library(e1071)

# Fit the Support Vector Regression (SVR) model
svm_model <- svm(V201 ~ ., data = train_data, kernel = "radial", cost = 1, epsilon = 0.1)

# Predict the dependent variable on the training data
train_preds_svm <- predict(svm_model, newdata = train_data)

# Calculate R-squared and RMSE on the training data
train_r2_svm <- R2(pred = train_preds_svm, obs = train_data$V201)
train_rmse_svm <- RMSE(pred = train_preds_svm, obs = train_data$V201)

# Predict the dependent variable on the test data
test_preds_svm <- predict(svm_model, newdata = test_data)

# Calculate R-squared and RMSE on the test data
test_r2_svm <- R2(pred = test_preds_svm, obs = test_data$V201)
test_rmse_svm <- RMSE(pred = test_preds_svm, obs = test_data$V201)
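
# The SVR hyper-parameters (cost, epsilon) could be tuned by grid search with
# e1071::tune(); a minimal sketch, left commented out because it is slow to run.
# svm_tune <- tune(svm, V201 ~ ., data = train_data,
#                  ranges = list(cost = c(0.1, 1, 10), epsilon = c(0.05, 0.1, 0.2)))
# svm_tune$best.parameters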

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Fit the Random Forest regression model
rf_model <- randomForest(V201 ~ ., data = train_data, ntree = 80)

# Predict the dependent variable on the training data
train_preds_rf <- predict(rf_model, newdata = train_data)

# Calculate R-squared and RMSE on the training data
train_r2_rf <- R2(pred = train_preds_rf, obs = train_data$V201)
train_rmse_rf <- RMSE(pred = train_preds_rf, obs = train_data$V201)

# Predict the dependent variable on the test data
test_preds_rf <- predict(rf_model, newdata = test_data)

# Calculate R-squared and RMSE on the test data
test_r2_rf <- R2(pred = test_preds_rf, obs = test_data$V201)
test_rmse_rf <- RMSE(pred = test_preds_rf, obs = test_data$V201)
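
# randomForest also reports how much each predictor contributes to the fit;
# a sketch of the variable-importance measures (output and plot not shown).
importance(rf_model)
varImpPlot(rf_model)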

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# Convert the data to DMatrix format
dtrain <- xgb.DMatrix(data = as.matrix(train_data[, -1]), label = train_data$V201)
dtest <- xgb.DMatrix(data = as.matrix(test_data[, -1]), label = test_data$V201)

# Set the parameters for XGBoost regression
# (note: nrounds is not a booster parameter, which is why xgb.train warns below
#  that it is unused; the number of rounds is instead passed to xgb.train directly)
params <- list(objective = "reg:squarederror",
               eval_metric = "rmse",
               max_depth = 6,
               eta = 0.1,
               nrounds = 15)

# Train the XGBoost regression model
xgb_model <- xgb.train(params = params, data = dtrain, nrounds = params$nrounds)
## [00:06:14] WARNING: src/learner.cc:767: 
## Parameters: { "nrounds" } are not used.
# Predict the dependent variable on the training data
train_preds_xgb <- predict(xgb_model, newdata = dtrain)

# Calculate R-squared and RMSE on the training data
train_r2_xgb <- R2(pred = train_preds_xgb, obs = train_data$V201)
train_rmse_xgb <- RMSE(pred = train_preds_xgb, obs = train_data$V201)

# Predict the dependent variable on the test data
test_preds_xgb <- predict(xgb_model, newdata = dtest)

# Calculate R-squared and RMSE on the test data
test_r2_xgb <- R2(pred = test_preds_xgb, obs = test_data$V201)
test_rmse_xgb <- RMSE(pred = test_preds_xgb, obs = test_data$V201)
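
# Feature importance can also be extracted from the fitted booster
# (a sketch; output not shown).
xgb.importance(feature_names = colnames(train_data[, -1]), model = xgb_model)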


# Create a summary table
summary_table <- data.frame(Method = c("Linear Regression", "Gradient Boosting", "Support Vector Regression", "Random Forest", "XGBoost"),
                            R_squared_train = c(train_r2, train_r2_gbm, train_r2_svm, train_r2_rf, train_r2_xgb),
                            RMSE_train = c(train_rmse, train_rmse_gbm, train_rmse_svm, train_rmse_rf, train_rmse_xgb),
                            R_squared_test = c(test_r2, test_r2_gbm, test_r2_svm, test_r2_rf, test_r2_xgb),
                            RMSE_test = c(test_rmse, test_rmse_gbm, test_rmse_svm, test_rmse_rf, test_rmse_xgb))

# Print the summary table
print(summary_table)
##                      Method R_squared_train RMSE_train R_squared_test RMSE_test
## 1         Linear Regression       0.6179470   1.668997      0.6243418  1.662885
## 2         Gradient Boosting       0.6427652   1.613920      0.6400054  1.627839
## 3 Support Vector Regression       0.6414821   1.617525      0.6330116  1.644692
## 4             Random Forest       0.6005021   1.803495      0.5966535  1.818191
## 5                   XGBoost       0.6462093   1.740785      0.6388200  1.765611
# Among the regression models evaluated, gradient boosting achieved the best test-set performance (highest R-squared of about 0.640 and lowest RMSE of about 1.63), with XGBoost and SVR close behind on R-squared, while random forest performed worst. The boosting models therefore appear the most suitable for predicting the total number of children ever born (V201) from the given independent variables.