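# ----------------------------------------------------------------------
#
# Course: MDS 503 (Statistical Computing with R)
#
# Student: Santosh Kumar Pandit (32)
#
# Teacher: Shital Bhandary (Associate Professor)
#
# School: School of Mathematical Sciences, IOST, TU
#
# ----------------------------------------------------------------------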

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(car)
## Loading required package: carData
library(foreign)
# Work from the folder that holds the downloaded data file.
setwd("E:/projects/R/presentation")

# 1. The individual recode file (ZZIR62FL.SAV) is downloaded from:
#    https://dhsprogram.com/data/Download-Model-Datasets.cfm

# Import the SPSS file as a data frame; warnings raised while reading are
# muted for this one call only.
data <- suppressWarnings(
  read.spss("ZZIR62FL.SAV", to.data.frame = TRUE)
)
## re-encoding from CP1252

# Keep only the six columns used in the rest of the analysis: V201 is the
# response, the remaining five enter the models as predictors.
data <- data[, c("V201", "V013", "V024", "V025", "V106", "V190")]
# Fix the RNG state with the class roll number (32) so the split below is
# reproducible.
set.seed(32)

# 2. Split the data into training (~80%) and testing (~20%) sets.
# Every row is independently labelled 1 (train, probability 0.8) or
# 2 (test, probability 0.2), so the realised proportions are approximate.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned; `<-` replaces `=` for assignment.
idx <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
train.data <- data[idx == 1, ]
test.data <- data[idx == 2, ]


# Before fitting a linear regression its assumptions should be checked.
# The normality assumption formally concerns the model errors; as a first
# screen, the distribution of the dependent variable V201 is inspected here.

# Histogram of the dependent variable.
hist(data$V201, col = "red")

# From the histogram above, the distribution of the dependent variable is not
# normal: it is skewed to the right.

# Second visual check: normal Q-Q plot with a reference line.
qqnorm(data$V201)
# The line-width parameter is `lwd`. The previous spelling `lw` is not a
# graphical parameter (names passed through `...` are not partially matched),
# so the requested width was never applied.
qqline(data$V201, col = "red", lwd = 2)

# The Q-Q plot and its reference line again suggest that the dependent
# variable is not normally distributed.


# To confirm the visual impression, run a formal test of normality.
#
# ks.test(x, "pnorm") without further arguments compares x with the *standard*
# normal N(0, 1), so the statistic mostly measures how far the data are from
# mean 0 / sd 1 rather than whether their shape is normal. The sample mean and
# standard deviation are passed so that the reference distribution matches the
# location and scale of the data.
# Caveats: V201 contains tied values (the test warns about them), and because
# the parameters are estimated from the same data the p-value is only
# approximate.
ks.test(
  data$V201, "pnorm",
  mean = mean(data$V201, na.rm = TRUE),
  sd = sd(data$V201, na.rm = TRUE)
)
## NOTE: the output previously recorded here (D = 0.57937,
## p-value < 2.2e-16, with a warning about ties) was produced by the
## comparison against N(0, 1) and does not correspond to the call above;
## re-run the script to regenerate it.

# Decision rule: a p-value below the conventional threshold of 0.05 rejects
# the null hypothesis that the data follow a normal distribution.

# Together with the right-skewed histogram and the Q-Q plot, the analysis
# proceeds on the basis that the dependent variable is not normally
# distributed, so linear regression is set aside here. (Strictly, the
# normality assumption of linear regression concerns the residuals rather
# than the raw response.)

# Therefore alternative supervised regression methods are fitted instead.



# Decision tree regression (caret method "rpart2"): V201 is modelled on every
# other column of the training set; no trainControl is supplied, so caret's
# default resampling is used.
dtr.model <- train(
  V201 ~ .,
  data = train.data,
  method = "rpart2"
)

# Support vector machine (caret method "svmRadial") with the same formula
# and training data.
svm.model <- train(
  V201 ~ .,
  data = train.data,
  method = "svmRadial"
)

# In-sample check for the decision tree: predict the training rows and score
# the predictions against the observed V201 (R-squared and RMSE).
dtr.predict <- predict(dtr.model, newdata = train.data)
dtr.R2 <- R2(pred = dtr.predict, obs = train.data$V201)
dtr.RMSE <- RMSE(pred = dtr.predict, obs = train.data$V201)

# Same in-sample check for the SVM.
svm.predict <- predict(svm.model, newdata = train.data)
svm.R2 <- R2(pred = svm.predict, obs = train.data$V201)
svm.RMSE <- RMSE(pred = svm.predict, obs = train.data$V201)

# Out-of-sample performance of the decision tree on the held-out test rows.
dtr.predict.test <- predict(dtr.model, newdata = test.data)
dtr.test.R2 <- R2(pred = dtr.predict.test, obs = test.data$V201)
dtr.test.RMSE <- RMSE(pred = dtr.predict.test, obs = test.data$V201)

# Out-of-sample performance of the SVM on the same test rows.
svm.predict.test <- predict(svm.model, newdata = test.data)
svm.test.R2 <- R2(pred = svm.predict.test, obs = test.data$V201)
svm.test.RMSE <- RMSE(pred = svm.predict.test, obs = test.data$V201)




# Re-fit the decision tree under different resampling schemes.
# Leave-one-out cross-validation (LOOCV).
# caret selects its leave-one-out workflow only when the method is spelled
# exactly "LOOCV". The previous lowercase "loocv" was handled by the generic
# resampling workflow instead: the warning recorded for that run was raised
# in nominalTrainWorkflow and reported missing values in the resampled
# performance measures, which is what happens when a metric is computed on
# single-row hold-out sets.
dtr.model.loocv <- train(
  V201 ~ .,
  data = train.data,
  method = "rpart2",
  trControl = trainControl(method = "LOOCV")
)

# Score the LOOCV-selected tree on the held-out test rows.
dtr.loocv.predict <- predict(dtr.model.loocv, test.data)
dtr.loocv.R2 <- R2(dtr.loocv.predict, test.data$V201)
dtr.loocv.RMSE <- RMSE(dtr.loocv.predict, test.data$V201)

# Decision tree selected with 3-fold cross-validation, then scored on the
# held-out test rows.
dtr.model.cv <- train(
  V201 ~ .,
  data = train.data,
  method = "rpart2",
  trControl = trainControl(method = "cv", number = 3)
)
dtr.cv.predict <- predict(dtr.model.cv, newdata = test.data)
dtr.cv.R2 <- R2(pred = dtr.cv.predict, obs = test.data$V201)
dtr.cv.RMSE <- RMSE(pred = dtr.cv.predict, obs = test.data$V201)

# Decision tree selected with 5-fold cross-validation repeated 5 times, then
# scored on the held-out test rows.
dtr.model.rcv <- train(
  V201 ~ .,
  data = train.data,
  method = "rpart2",
  trControl = trainControl(method = "repeatedcv", number = 5, repeats = 5)
)
dtr.rcv.predict <- predict(dtr.model.rcv, newdata = test.data)
dtr.rcv.R2 <- R2(pred = dtr.rcv.predict, obs = test.data$V201)
dtr.rcv.RMSE <- RMSE(pred = dtr.rcv.predict, obs = test.data$V201)






# 7. Compare the test-set R-squared and RMSE of all models and choose the one
#    to use for the final prediction.
# One row per model; the R2 and RMSE columns are listed in the same order as
# the model labels.
models <- data.frame(
  model = c(
    "SVM",
    "decision tree reg validation Set Approach",
    "decision tree reg loocv",
    "decision tree reg k-fold cross validation",
    "decision tree reg repeated k-fold"
  ),
  R2 = c(svm.test.R2, dtr.test.R2, dtr.loocv.R2, dtr.cv.R2, dtr.rcv.R2),
  RMSE = c(
    svm.test.RMSE, dtr.test.RMSE, dtr.loocv.RMSE, dtr.cv.RMSE, dtr.rcv.RMSE
  )
)
models
##                                       model        R2     RMSE
## 1                                       SVM 0.5861869 1.713083
## 2 decision tree reg validation Set Approach 0.4518540 1.970351
## 3                   decision tree reg loocv 0.4518540 1.970351
## 4 decision tree reg k-fold cross validation 0.4518540 1.970351
## 5         decision tree reg repeated k-fold 0.4518540 1.970351
#These results provide an evaluation of different regression models based on their predictive performance.
#The R2 value ranges from 0 to 1 and represents the proportion of the variance in the dependent variable
#that is predictable from the independent variables. A higher R2 value indicates better predictive performance.

#The RMSE is the square root of the mean squared residual (prediction error), expressed in the
#units of the dependent variable; it summarises the typical size of the errors.
#Lower RMSE values indicate better accuracy.

#Based on the above information, the SVM model achieved the highest R2 value (0.5861869) and
#the lowest RMSE (1.713083) among the listed models, so the SVM model is the one chosen
#for the final prediction.