# ----------------------------------------------------------------------
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(car)
## Loading required package: carData
library(foreign)
# 1. Download the Individual Recode file from:
#    https://dhsprogram.com/data/Download-Model-Datasets.cfm
# The relative file name below is resolved against this working directory.
setwd('E:/projects/R/presentation')
# Import the SPSS file as a data frame; warnings raised by the import are
# silenced.
data <- suppressWarnings(
  read.spss("ZZIR62FL.SAV", to.data.frame = TRUE)
)
## re-encoding from CP1252
# Keep only V201 (the response modelled below) and the five other columns
# used as predictors.
data <- data[, c("V201", "V013", "V024", "V025", "V106", "V190")]
# 2. Split the data into training (80%) and testing (20%) sets. The random
#    number generator is seeded with the class roll number so that the split
#    is reproducible.
set.seed(32)
# Draw one label per row: 1 (training) with probability 0.8, 2 (testing)
# with probability 0.2. Labels are drawn independently, so the realised
# split is only approximately 80/20.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
idx <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
train.data <- data[idx == 1, ]
test.data <- data[idx == 2, ]
# Before fitting a linear regression, its assumptions should be checked.
# Here the distribution of the dependent variable V201 is inspected.
# NOTE(review): the classical normality assumption concerns the model
# residuals rather than the raw response -- confirm which check is intended.
# First, visualise the response with a histogram.
hist(data$V201, col = "red")

# From the histogram above we conclude that the distribution of the
# dependent variable is not normal; it is skewed to the right.
# Next, visualise it with a normal Q-Q plot and a reference line.
qqnorm(data$V201)
# `lwd` is the documented name of the line-width graphical parameter (the
# original abbreviated it to `lw`).
qqline(data$V201, col = "red", lwd = 2)

# The Q-Q plot and its reference line again suggest that the dependent
# variable is not normally distributed.
# To confirm, run a formal test for normality.
# Called with only "pnorm", ks.test() compares the sample with the standard
# normal N(0, 1), so it rejects for any variable whose mean or spread
# differs from 0 and 1. The sample mean and standard deviation are passed
# so that the test addresses the shape of the distribution instead.
# NOTE(review): estimating the parameters from the same data makes the K-S
# p-value approximate, and the recorded run warned about ties in V201 --
# treat the result as indicative.
ks.test(
  data$V201, "pnorm",
  mean = mean(data$V201, na.rm = TRUE),
  sd = sd(data$V201, na.rm = TRUE)
)
## NOTE: the output below was recorded from the earlier call that compared
## against N(0, 1); re-run the script to refresh it.
## Warning in ks.test.default(data$V201, "pnorm"): ties should not be present for
## the Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: data$V201
## D = 0.57937, p-value < 2.2e-16
## alternative hypothesis: two-sided
# A p-value far below the conventional threshold of 0.05 means we reject
# the null hypothesis and conclude that the data do not follow a normal
# distribution.
# On that basis linear regression is not used to fit the model here,
# because the dependent variable is not normally distributed.
# Therefore an alternative supervised regression method is used instead.
# Decision tree regression (caret method "rpart2"): V201 is the response
# and every other column of the training data is a predictor.
dtr.model <- train(
  V201 ~ .,
  data = train.data,
  method = "rpart2"
)
# Support vector machine (caret method "svmRadial"), fitted with the same
# formula and training data.
svm.model <- train(
  V201 ~ .,
  data = train.data,
  method = "svmRadial"
)
# ---- Performance on the training data ----
# Decision tree regression: predict the training rows, then score the
# predictions against the observed V201.
dtr.predict <- predict(dtr.model, newdata = train.data)
dtr.R2 <- R2(pred = dtr.predict, obs = train.data$V201)
dtr.RMSE <- RMSE(pred = dtr.predict, obs = train.data$V201)
# SVM: same procedure on the training rows.
svm.predict <- predict(svm.model, newdata = train.data)
svm.R2 <- R2(pred = svm.predict, obs = train.data$V201)
svm.RMSE <- RMSE(pred = svm.predict, obs = train.data$V201)

# ---- Performance on the test data ----
# Decision tree regression on the held-out rows.
dtr.predict.test <- predict(dtr.model, newdata = test.data)
dtr.test.R2 <- R2(pred = dtr.predict.test, obs = test.data$V201)
dtr.test.RMSE <- RMSE(pred = dtr.predict.test, obs = test.data$V201)
# SVM on the held-out rows.
svm.predict.test <- predict(svm.model, newdata = test.data)
svm.test.R2 <- R2(pred = svm.predict.test, obs = test.data$V201)
svm.test.RMSE <- RMSE(pred = svm.predict.test, obs = test.data$V201)
# Tune the decision tree regression under different resampling schemes.
# Leave-one-out cross-validation.
# trainControl() documents the leave-one-out method as "LOOCV" (upper
# case). With the lowercase spelling "loocv" used previously, the recorded
# run warned from nominalTrainWorkflow that there were missing values in
# the resampled performance measures.
dtr.model.loocv <- train(
  V201 ~ .,
  data = train.data,
  method = "rpart2",
  trControl = trainControl(method = "LOOCV")
)
# Score the LOOCV-tuned tree on the held-out test data.
dtr.loocv.predict <- predict(dtr.model.loocv, test.data)
dtr.loocv.R2 <- R2(dtr.loocv.predict, test.data$V201)
dtr.loocv.RMSE <- RMSE(dtr.loocv.predict, test.data$V201)
# k-fold cross-validation with 3 folds.
dtr.model.cv <- train(
  V201 ~ .,
  data = train.data,
  method = "rpart2",
  trControl = trainControl(method = "cv", number = 3)
)
# Score the tuned tree on the test data.
dtr.cv.predict <- predict(dtr.model.cv, newdata = test.data)
dtr.cv.R2 <- R2(pred = dtr.cv.predict, obs = test.data$V201)
dtr.cv.RMSE <- RMSE(pred = dtr.cv.predict, obs = test.data$V201)
# Repeated k-fold cross-validation: 5 folds, repeated 5 times.
dtr.model.rcv <- train(
  V201 ~ .,
  data = train.data,
  method = "rpart2",
  trControl = trainControl(
    method = "repeatedcv",
    number = 5,
    repeats = 5
  )
)
# Score the tuned tree on the test data.
dtr.rcv.predict <- predict(dtr.model.rcv, newdata = test.data)
dtr.rcv.R2 <- R2(pred = dtr.rcv.predict, obs = test.data$V201)
dtr.rcv.RMSE <- RMSE(pred = dtr.rcv.predict, obs = test.data$V201)
# 7. Compare the R-squared and RMSE of all the models and choose one for
#    the final prediction.
# One row per fitted model; the metrics in each row are the ones computed
# on the test data.
models <- data.frame(
  model = c(
    'SVM',
    'decision tree reg validation Set Approach',
    'decision tree reg loocv',
    'decision tree reg k-fold cross validation',
    'decision tree reg repeated k-fold'
  ),
  R2 = c(svm.test.R2, dtr.test.R2, dtr.loocv.R2, dtr.cv.R2, dtr.rcv.R2),
  RMSE = c(
    svm.test.RMSE, dtr.test.RMSE, dtr.loocv.RMSE, dtr.cv.RMSE, dtr.rcv.RMSE
  )
)
# Display the comparison table.
models
## model R2 RMSE
## 1 SVM 0.5861869 1.713083
## 2 decision tree reg validation Set Approach 0.4518540 1.970351
## 3 decision tree reg loocv 0.4518540 1.970351
## 4 decision tree reg k-fold cross validation 0.4518540 1.970351
## 5 decision tree reg repeated k-fold 0.4518540 1.970351
#These results provide an evaluation of different regression models based on their predictive performance.
#The R2 value ranges from 0 to 1 and represents the proportion of the variance in the dependent variable
#that is predictable from the independent variables. A higher R2 value indicates better predictive performance.
#The RMSE is the square root of the mean squared residual (prediction error), expressed in
#the same units as the dependent variable. Lower RMSE values indicate better accuracy.
#Based on the above information, the SVM model achieved the highest R2 value (0.5861869) and
#the lowest RMSE (1.713083) among the listed models.