library(mlbench) # for PimaIndians dataset
library(dplyr) # for data manipulation (dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom) # for making model summary tidy
library(margins) # to calculate Average Marginal Effects
library(rcompanion) # to calculate pseudo R2
library(ROCR) # to compute and plot Reciever Opering Curve
##
## Attaching package: 'ROCR'
## The following object is masked from 'package:margins':
##
## prediction
dataset<-read.csv("D:/Rencana Tugas Akhir/DLI.csv",sep=";")
dataset
## X.Y. X.X1. X.X2. X.X3. X.X4. X.X5. X.X6. X.X7.
## 1 1 0 0 2 1 1 1 0
## 2 0 1 0 2 1 0 1 0
## 3 0 1 0 2 0 1 0 0
## 4 0 1 0 2 0 0 1 0
## 5 1 0 0 2 1 1 1 0
## 6 0 1 1 1 1 1 1 2
## 7 0 0 1 0 0 0 1 0
## 8 1 0 0 1 0 1 0 0
## 9 0 1 0 2 1 0 1 0
## 10 0 1 0 2 0 0 1 2
## 11 1 0 1 2 1 1 1 1
## 12 1 0 0 2 1 1 0 1
## 13 0 1 1 1 1 0 1 1
## 14 0 1 0 2 0 0 1 0
## 15 0 0 0 2 0 0 1 0
## 16 0 0 0 0 0 1 1 2
## 17 0 0 1 2 0 0 0 0
## 18 0 1 0 2 0 0 1 0
## 19 1 1 1 2 1 0 1 0
## 20 1 1 0 0 1 0 1 0
## 21 0 1 1 0 0 0 1 0
## 22 0 1 0 0 0 0 0 1
## 23 0 1 1 2 0 1 1 0
## 24 0 0 1 2 1 0 1 0
## 25 0 1 1 2 1 0 1 0
## 26 0 1 0 2 1 1 1 1
## 27 1 1 1 2 1 1 1 1
## 28 1 0 1 1 1 1 1 2
## 29 0 1 0 0 0 1 1 2
## 30 0 0 1 2 1 1 0 0
## 31 0 1 0 2 0 1 1 0
## 32 0 0 1 2 0 0 1 1
## 33 1 0 0 2 1 1 1 2
## 34 1 1 0 2 1 1 1 2
## 35 0 1 1 0 0 1 1 0
## 36 0 0 1 2 1 0 1 1
## 37 0 1 0 2 0 0 1 1
## 38 1 1 1 2 0 0 1 1
## 39 0 0 1 2 0 0 1 0
## 40 0 1 1 2 0 0 1 1
## 41 0 1 0 2 0 0 1 0
## 42 0 1 1 2 0 0 1 1
## 43 0 0 0 2 0 0 1 0
## 44 0 1 0 2 1 1 1 0
## 45 1 0 1 1 0 0 1 2
## 46 0 1 0 2 0 0 1 0
## 47 0 0 0 2 1 1 1 0
## 48 0 1 1 2 1 0 1 2
## 49 0 0 0 0 0 0 0 0
## 50 0 0 0 2 0 0 1 0
## 51 0 0 1 2 0 0 0 1
## 52 1 1 1 2 0 0 1 0
## 53 0 0 0 0 0 0 1 0
## 54 0 1 0 2 1 1 1 0
## 55 0 0 1 2 0 0 1 0
## 56 0 0 0 2 0 1 1 0
## 57 0 1 1 0 0 0 1 1
## 58 0 1 0 2 0 1 1 0
## 59 0 1 0 2 0 0 1 0
## 60 0 0 1 0 0 0 1 0
## 61 0 0 1 2 0 0 1 2
## 62 0 1 1 2 1 0 1 0
## 63 0 0 1 2 0 0 1 2
## 64 1 0 1 0 0 0 0 2
## 65 0 1 1 0 0 0 0 1
## 66 0 0 0 2 1 1 1 0
## 67 0 1 0 0 0 1 0 0
## 68 0 0 0 2 0 0 1 2
## 69 0 0 1 2 0 0 1 0
## 70 0 1 0 1 0 0 0 0
## 71 1 0 0 1 0 0 1 2
## 72 0 0 1 2 0 1 1 0
## 73 0 0 1 2 0 0 1 0
## 74 0 1 1 2 0 0 1 2
## 75 0 0 1 1 0 1 1 0
## 76 0 1 0 2 0 1 1 0
## 77 0 1 1 2 0 0 1 0
## 78 0 0 1 2 0 0 1 1
## 79 0 0 1 0 0 0 0 1
## 80 0 0 0 2 1 0 1 2
## 81 0 1 0 2 1 0 1 0
## 82 0 1 1 0 0 0 1 0
## 83 0 1 1 2 0 0 1 0
## 84 1 0 1 1 0 0 0 2
## 85 0 1 0 0 0 0 0 0
## 86 0 0 0 2 1 0 1 2
## 87 0 1 1 0 1 0 0 0
## 88 0 1 0 2 1 1 1 1
## 89 0 0 1 2 1 0 1 0
## 90 1 1 1 0 0 1 1 2
## 91 0 1 1 2 0 0 1 2
## 92 0 0 1 2 0 1 1 1
## 93 0 1 0 2 0 0 1 2
## 94 0 1 1 2 1 0 1 0
## 95 0 1 1 2 0 0 1 0
## 96 0 0 0 2 0 0 1 0
## 97 0 0 0 2 1 0 0 2
## 98 0 0 0 1 0 0 1 0
## 99 0 0 1 1 0 1 0 0
## 100 0 0 0 2 0 0 1 0
glimpse(dataset)
## Rows: 100
## Columns: 8
## $ X.Y. <int> 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0~
## $ X.X1. <int> 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1~
## $ X.X2. <int> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0~
## $ X.X3. <int> 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 0, 0, 0~
## $ X.X4. <int> 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0~
## $ X.X5. <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0~
## $ X.X6. <int> 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0~
## $ X.X7. <int> 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 1~
# Total number of rows in the Diabetes data frame
n <- nrow(dataset)
n
## [1] 100
# Number of rows for the training set (80% of the dataset)
n_train <- round(0.80 * n)
# Create a vector of indices which is an 80% random sample
set.seed(123)
train_indices <- sample(1:n, n_train)
# Subset the Diabetes data frame to training indices only
train <- dataset[train_indices, ]
# Exclude the training indices to create the test set
test <- dataset[-train_indices, ]
test
## X.Y. X.X1. X.X2. X.X3. X.X4. X.X5. X.X6. X.X7.
## 1 1 0 0 2 1 1 1 0
## 2 0 1 0 2 1 0 1 0
## 10 0 1 0 2 0 0 1 2
## 11 1 0 1 2 1 1 1 1
## 24 0 0 1 2 1 0 1 0
## 28 1 0 1 1 1 1 1 2
## 35 0 1 1 0 0 1 1 0
## 37 0 1 0 2 0 0 1 1
## 44 0 1 0 2 1 1 1 0
## 45 1 0 1 1 0 0 1 2
## 48 0 1 1 2 1 0 1 2
## 52 1 1 1 2 0 0 1 0
## 55 0 0 1 2 0 0 1 0
## 56 0 0 0 2 0 1 1 0
## 59 0 1 0 2 0 0 1 0
## 68 0 0 0 2 0 0 1 2
## 74 0 1 1 2 0 0 1 2
## 81 0 1 0 2 1 0 1 0
## 88 0 1 0 2 1 1 1 1
## 98 0 0 0 1 0 0 1 0
paste("train sample size: ", dim(train)[1])
## [1] "train sample size: 80"
paste("test sample size: ", dim(test)[1])
## [1] "test sample size: 20"
model_biner<-glm(X.Y.~ X.X1.+ X.X2.+X.X3.+X.X4.+X.X5.+X.X6.+X.X7.,family=binomial,train,)
summary(model_biner)
##
## Call:
## glm(formula = X.Y. ~ X.X1. + X.X2. + X.X3. + X.X4. + X.X5. +
## X.X6. + X.X7., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5443 -0.5604 -0.3421 -0.2116 2.4201
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.29898 1.05315 -2.183 0.0290 *
## X.X1. -0.15206 0.73701 -0.206 0.8365
## X.X2. 0.07293 0.71531 0.102 0.9188
## X.X3. -0.39269 0.47888 -0.820 0.4122
## X.X4. 1.34505 0.78850 1.706 0.0880 .
## X.X5. 1.05194 0.71721 1.467 0.1425
## X.X6. -0.62434 0.81836 -0.763 0.4455
## X.X7. 0.91447 0.40346 2.267 0.0234 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71.007 on 79 degrees of freedom
## Residual deviance: 56.804 on 72 degrees of freedom
## AIC: 72.804
##
## Number of Fisher Scoring iterations: 5
nagelkerke(model_biner)
## $Models
##
## Model: "glm, X.Y. ~ X.X1. + X.X2. + X.X3. + X.X4. + X.X5. + X.X6. + X.X7., binomial, train"
## Null: "glm, X.Y. ~ 1, binomial, train"
##
## $Pseudo.R.squared.for.model.vs.null
## Pseudo.R.squared
## McFadden 0.200023
## Cox and Snell (ML) 0.162671
## Nagelkerke (Cragg and Uhler) 0.276486
##
## $Likelihood.ratio.test
## Df.diff LogLik.diff Chisq p.value
## -7 -7.1015 14.203 0.047686
##
## $Number.of.observations
##
## Model: 80
## Null: 80
##
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
##
## $Warnings
## [1] "None"
#plot(model_biner)
return
## .Primitive("return")
#predict the test dataset
pred <- predict(model_biner, test, type="response")
predicted <- round(pred) # round of the value; >0.5 will convert to
# 1 else 0
# Creating a contigency table
tab <- table(predicted=predicted, Reference = test$X.Y.)
tab
## Reference
## predicted 0 1
## 0 15 4
## 1 0 1
accuracy(tab)
## $Models
## Call
## 1 "Not supported"
## 2 "Not supported"
## 3 "Not supported"
## 4 "Not supported"
##
## $Fit.criteria
## Min.max.accuracy MAE MAPE MSE RMSE NRMSE.mean NRMSE.median
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA
## NRMSE.mean.accuracy NRMSE.median.accuracy Efron.r.squared CV.prcnt
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
# Creating a dataframe of observed and predicted data
act_pred <- data.frame(observed = test$X.Y., predicted =
factor(predicted))
# Calculating Accuracy
accuracy_est <- accuracy(act_pred, observed, predicted)
print(accuracy_est)
## $Models
## Call
## 1 "Not supported"
## 2 "Not supported"
##
## $Fit.criteria
## Min.max.accuracy MAE MAPE MSE RMSE NRMSE.mean NRMSE.median
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA
## NRMSE.mean.accuracy NRMSE.median.accuracy Efron.r.squared CV.prcnt
## 1 NA NA NA NA
## 2 NA NA NA NA
# Note: the accuracy( ) function gives rounded estimate i.e. 0.885
pred.rocr <- prediction(pred, test$X.Y.)
eval <- performance(pred.rocr, "acc")
plot(eval)
# Identifying the best cutoff value that maximizes accuracy
max <- which.max(slot(eval, "y.values")[[1]])
acc <- slot(eval, "y.values")[[1]][max] #y.values are accuracy
#measures
cut <- slot(eval, "x.values")[[1]][max] #x.values are cutoff
#measures
print(c(Accuracy = acc, Cutoff = cut))
## Accuracy Cutoff.11
## 0.8500000 0.4196218
perf_rocr <- performance(pred.rocr, measure = "auc",
x.measure = "cutoff")
perf_rocr@y.values[[1]] <- round(perf_rocr@y.values[[1]], digits =
4)
perf.tpr.fpr.rocr <- performance(pred.rocr, "tpr", "fpr")
plot(perf.tpr.fpr.rocr, colorize=T,
main = paste("AUC:", (perf_rocr@y.values)))
abline(a = 0, b = 1)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.