Course: Statistical Computing with R(MDS503)
Student: Bhanu Shrestha (09)
Teacher: Shital Bhandary (Associate Professor),School of Mathematical Sciences, IOST, TU
library(foreign)
library(dplyr)
library(polycor)
library(doParallel)
library(caret)
library(gbm)
library(e1071)
library(randomForest)
library(kernlab)
#Loading Data
data.spss <- read.spss("C:/Users/BS5922/OneDrive - Hitachi Energy/Desktop/BA Bhanu Shrestha/Bhanu Shrestha - Personal/MDS I/R/Presentaion/zzir62sv/ZZIR62FL.sav")
# Know your data before analysis:
data <- data.frame(data.spss$V013,
data.spss$V024,
data.spss$V025,
data.spss$V106,
data.spss$V190,
data.spss$V201)
colnames(data) <- c("Age_Group",
"Region",
"Residence",
"Education",
"Wealth_Index",
"Children_Born")
str(data)
## 'data.frame': 8348 obs. of 6 variables:
## $ Age_Group : Factor w/ 7 levels "15-19","20-24",..: 4 2 6 3 3 5 1 2 7 5 ...
## $ Region : Factor w/ 4 levels "Region 1","Region 2",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Residence : Factor w/ 2 levels "Urban","Rural": 2 2 2 2 2 2 2 2 2 2 ...
## $ Education : Factor w/ 4 levels "No education",..: 1 3 1 2 3 1 3 1 1 2 ...
## $ Wealth_Index : Factor w/ 5 levels "Poorest","Poorer",..: 1 3 3 3 2 3 3 1 4 3 ...
## $ Children_Born: num 6 1 2 3 4 2 0 1 2 5 ...
head(data)
## Age_Group Region Residence Education Wealth_Index Children_Born
## 1 30-34 Region 2 Rural No education Poorest 6
## 2 20-24 Region 2 Rural Secondary Middle 1
## 3 40-44 Region 2 Rural No education Middle 2
## 4 25-29 Region 2 Rural Primary Middle 3
## 5 25-29 Region 2 Rural Secondary Poorer 4
## 6 35-39 Region 2 Rural No education Middle 2
tail(data)
## Age_Group Region Residence Education Wealth_Index Children_Born
## 8343 30-34 Region 2 Urban Higher Richer 2
## 8344 30-34 Region 2 Urban No education Richer 7
## 8345 15-19 Region 2 Urban No education Richer 1
## 8346 35-39 Region 2 Urban No education Middle 7
## 8347 15-19 Region 2 Urban Secondary Middle 0
## 8348 25-29 Region 2 Urban Secondary Richer 1
unique(data$Age_Group)
## [1] 30-34 20-24 40-44 25-29 35-39 15-19 45-49
## Levels: 15-19 20-24 25-29 30-34 35-39 40-44 45-49
unique(data$Region)
## [1] Region 2 Region 4 Region 1 Region 3
## Levels: Region 1 Region 2 Region 3 Region 4
unique(data$Residence)
## [1] Rural Urban
## Levels: Urban Rural
unique(data$Education)
## [1] No education Secondary Primary Higher
## Levels: No education Primary Secondary Higher
unique(data$Wealth_Index)
## [1] Poorest Middle Poorer Richer Richest
## Levels: Poorest Poorer Middle Richer Richest
unique(data$Children_Born)
## [1] 6 1 2 3 4 0 5 9 7 12 8 11 10 14 13
summary(data)
## Age_Group Region Residence Education Wealth_Index
## 15-19:2041 Region 1:3186 Urban:3424 No education:4584 Poorest:1725
## 20-24:1371 Region 2:2126 Rural:4924 Primary :1117 Poorer :1606
## 25-29:1357 Region 3:1433 Secondary :2385 Middle :1697
## 30-34:1110 Region 4:1603 Higher : 262 Richer :1863
## 35-39:1108 Richest:1457
## 40-44: 644
## 45-49: 717
## Children_Born
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 2.000
## Mean : 2.835
## 3rd Qu.: 5.000
## Max. :14.000
##
plot(data)
This plot shows that each independent variable doesn’t show a clear indication (of linear relation, quadratic relation) with children born. This can been from different independent variables relation to dependent variable separately.
polycor::hetcor(data)
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## Age_Group Region Residence Education Wealth_Index
## Age_Group 1 Polychoric Polychoric Polychoric Polychoric
## Region -0.009392 1 Polychoric Polychoric Polychoric
## Residence 0.1379 -0.3212 1 Polychoric Polychoric
## Education -0.4585 0.1235 -0.5162 1 Polychoric
## Wealth_Index -0.09031 0.144 -0.7866 0.4901 1
## Children_Born 0.7681 -0.03245 0.258 -0.5653 -0.2173
## Children_Born
## Age_Group Polyserial
## Region Polyserial
## Residence Polyserial
## Education Polyserial
## Wealth_Index Polyserial
## Children_Born 1
##
## Standard Errors:
## Age_Group Region Residence Education Wealth_Index
## Age_Group
## Region 0.01254
## Residence 0.01421 0.01391
## Education 0.01005 0.01374 0.01267
## Wealth_Index 0.01191 0.01214 0.006434 0.01024
## Children_Born 0.003968 0.01204 0.01317 0.008584 0.01082
##
## n = 8348
##
## P-values for Tests of Bivariate Normality:
## Age_Group Region Residence Education
## Age_Group
## Region 3.424e-06
## Residence 0.0009226 5.094e-203
## Education 4.322e-222 2.344e-95 4.669e-12
## Wealth_Index 4.213e-06 0 1.055e-43 1.863e-33
## Children_Born 0 0 6.89000000000015e-311 0
## Wealth_Index
## Age_Group
## Region
## Residence
## Education
## Wealth_Index
## Children_Born 4.92299999999066e-313
plot(data$Age_Group,
data$Children_Born,
main = "Age Group Vs Total Chidren Ever Born",
ylab = "Total Children Ever Born",
xlab = "Age Group")
plot(data$Region,
data$Children_Born,
main = "Region Vs Total Chidren Ever Born",
ylab = "Total Children Ever Born",
xlab = "Region")
plot(data$Residence,
data$Children_Born,
main = "Place of Residence Vs Total Chidren Ever Born",
ylab = "Total Children Ever Born",
xlab = "Place of Residence")
plot(data$Education,
data$Children_Born,
main = "Education Level Vs Total Chidren Ever Born",
ylab = "Total Children Ever Born",
xlab = "Education")
plot(data$Wealth_Index,
data$Children_Born,
main = "Wealth Index Vs Total Chidren Ever Born",
ylab = "Total Children Ever Born",
xlab = "Wealth Index")
set.seed(9)
ind <- sample(2, nrow(data), replace = T, prob = c(0.8,0.2))
train <- data[ind==1,]
test <- data[ind==2,]
Checking the Normality of dependent variable to apply Linear Model Performing Suggestive Check:
qqnorm(data$Children_Born)
qqline(data$Children_Born,
col = "red",
lwd = 2)
Look likes the data is not in normal distribution, so checking using another approach.
hist(data$Children_Born , probability = T,
main = "Histogram of Total Children Ever Born",
xlab = "Total Children Ever Born")
lines(density(data$Children_Born),
col = "red")
The histogram is the right skewed and the data is not normally distributed. But this is not enough we need to confirm it with statistical test for that we use shapiro test but it has the limitations as it can only used in the small data set with samples less than 100. So we will be sub setting the data and check the normality of the dependent variable.
Preforming Confirmation Test:
#shapiro.test(data$V201)
Gives error because sample size must be between 3 and 5000.
Now, creating subset of data and checking normality of dependent variable.
subset_data <- data %>% sample_n(5000)
shapiro.test(subset_data$Children_Born)
##
## Shapiro-Wilk normality test
##
## data: subset_data$Children_Born
## W = 0.8921, p-value < 2.2e-16
Here the p value is less than 0.05 significance label and the dependent variable is not normally distributed so we could not apply the linear regression on this so before that we need to transform the dependent variable using log to make it normal.
subset_data$Children_Born <- log(subset_data$Children_Born)
shapiro.test(subset_data$Children_Born)
##
## Shapiro-Wilk normality test
##
## data: subset_data$Children_Born
## W = NaN, p-value = NA
plot(subset_data$Children_Born)
Here by applying the log also we do not get any result for the test and from the above plot we figure out that the data is non linear and its worthless to use the linear model for this data. So we will be applying other Supervised Regression models.
Now, lets check which regression model is best.
knn.model<- knnreg(Children_Born~., data = train)
gbm.model <- gbm(Children_Born ~ .,
data = train,
n.trees = 1000,
interaction.depth = 4,
shrinkage = 0.01)
## Distribution not specified, assuming gaussian ...
svm.model <- svm(Children_Born ~ .,
data = train,
kernel = "radial",
cost = 1,
epsilon = 0.1)
rf.model <- randomForest(Children_Born ~ .,
data = train,
ntree = 80)
cores <- detectCores()
registerDoParallel(cores = cores)
set.seed(9)
knn.pred.train <- predict(knn.model,train)
knn.train.notuning <- postResample(knn.pred.train,train$Children_Born)
knn.train.notuning
## RMSE Rsquared MAE
## 1.6385601 0.6323371 1.1400761
svm.pred.train <- predict(svm.model,
newdata = train)
svm.train.notuning <- postResample(svm.pred.train,train$Children_Born)
svm.train.notuning
## RMSE Rsquared MAE
## 1.6201808 0.6413616 1.1116349
rfm.pred.train <- predict(rf.model,
newdata = train,
n.trees = 80)
rfm.train.notuning <- postResample(rfm.pred.train,train$Children_Born)
rfm.train.notuning
## RMSE Rsquared MAE
## 1.7916611 0.6068157 1.3560414
set.seed(9)
knn.pred.test <- predict(knn.model,test)
knn.test.notuning <- postResample(knn.pred.test,test$Children_Born)
knn.test.notuning
## RMSE Rsquared MAE
## 1.7762675 0.5741265 1.2593450
svm.pred.test <- predict(svm.model,
newdata = test)
svm.test.notuning <- postResample(svm.pred.test,test$Children_Born)
svm.test.notuning
## RMSE Rsquared MAE
## 1.6779906 0.6194369 1.1702743
rfm.pred.test <- predict(rf.model,
newdata = test,
n.trees = 80)
rfm.test.notuning <- postResample(rfm.pred.test,test$Children_Born)
rfm.test.notuning
## RMSE Rsquared MAE
## 1.8395882 0.5810902 1.4018941
For tuning purpose, I choose SVM.
a. LOOCV
train.control.loocv <- trainControl(method="LOOCV", allowParallel = TRUE)
Support Vector Regression
registerDoParallel()
set.seed(9)
svm.model.LOOCV <- svm(Children_Born ~ .,
data = train,
kernel = "radial",
cost = 1,
epsilon = 0.1,
trControl = train.control.loocv)
svm.pred.loocv <- predict(svm.model.LOOCV,
newdata = test)
svm.loocv.tuning <- postResample(svm.pred.loocv,test$Children_Born)
train.control.kfold <- trainControl(method="cv", number = 10, allowParallel = TRUE)
Support Vector Regression
registerDoParallel()
set.seed(9)
svm.model.kfold <- svm(Children_Born ~ .,
data = train,
kernel = "radial",
cost = 1,
epsilon = 0.1,
trControl = train.control.kfold)
svm.pred.kfold <- predict(svm.model.kfold,
newdata = test)
svm.kfold.tuning <- postResample(svm.pred.kfold,test$Children_Born)
train.control.rkfold <- trainControl(method="repeatedcv",
number = 10,
repeats = 5,
allowParallel = TRUE)
Support Vector Regression
registerDoParallel()
set.seed(9)
svm.model.rkfold <- svm(Children_Born ~ .,
data = train,
kernel = "radial",
cost = 1,
epsilon = 0.1,
trControl = train.control.rkfold)
svm.pred.rkfold <- predict(svm.model.rkfold,
newdata = test)
svm.rkfold.tuning <- postResample(svm.pred.rkfold,test$Children_Born)
Support Vector Results:
svm_compare_result <- data.frame(svm.train.notuning,
svm.test.notuning,
svm.loocv.tuning,
svm.kfold.tuning ,
svm.rkfold.tuning)
names(svm_compare_result) <- c("SVM Train",
"SVM No tuning",
"SVM LOOCV",
"SVM K Fold",
"SVM Repated K Fold")
svm_df_t <- t(svm_compare_result)
rownames(svm_df_t) <- colnames(svm_compare_result)
colnames(svm_df_t) <- rownames(svm_compare_result)
svm_df_t
## RMSE Rsquared MAE
## SVM Train 1.620181 0.6413616 1.111635
## SVM No tuning 1.677991 0.6194369 1.170274
## SVM LOOCV 1.677991 0.6194369 1.170274
## SVM K Fold 1.677991 0.6194369 1.170274
## SVM Repated K Fold 1.677991 0.6194369 1.170274
These results provide an evaluation of different regression models based on their predictive performance.
The R2 value ranges from 0 to 1 and represents the proportion of the variance in the dependent variable that is predictable from the independent variables. A higher R2 value indicates better predictive performance.
The RMSE represents the average magnitude of the residuals (prediction errors) and provides a measure of the model’s accuracy. Lower RMSE values indicate better accuracy.
Based on the above information, the Support Vector Regression achieved the highest R2 value (0.6194369) and the lowest RMSE (1.6779906) among the listed models.