library(mlbench) # for PimaIndians dataset
library(dplyr) # for data manipulation (dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom) # for making model summary tidy
library(margins) # to calculate Average Marginal Effects
library(rcompanion) # to calculate pseudo R2
library(ROCR) # to compute and plot Reciever Opering Curve
##
## Attaching package: 'ROCR'
## The following object is masked from 'package:margins':
##
## prediction
data<-read.csv("D:/Rencana Tugas Akhir/Dataset3.csv",sep=";")
dataset<-na.omit(data) #removing NA values
dataset
## Y D1 D2 D3 D4
## 1 1 0 1 0 0
## 2 0 0 1 0 0
## 3 0 0 1 0 0
## 4 0 0 1 0 0
## 5 1 0 1 0 0
## 6 0 1 0 0 1
## 7 0 0 0 0 0
## 8 1 1 0 0 0
## 9 0 0 1 0 0
## 10 0 0 1 0 1
## 11 1 0 1 1 0
## 12 1 0 1 1 0
## 13 0 1 0 1 0
## 14 0 0 1 0 0
## 15 0 0 1 0 0
## 16 0 0 0 0 1
## 17 0 0 1 0 0
## 18 0 0 1 0 0
## 19 1 0 1 0 0
## 20 1 0 0 0 0
## 21 0 0 0 0 0
## 22 0 0 0 1 0
## 23 0 0 1 0 0
## 24 0 0 1 0 0
## 25 0 0 1 0 0
## 26 0 0 1 1 0
## 27 1 0 1 1 0
## 28 1 1 0 0 1
## 29 0 0 0 0 1
## 30 0 0 1 0 0
## 31 0 0 1 0 0
## 32 0 0 1 1 0
## 33 1 0 1 0 1
## 34 1 0 1 0 1
## 35 0 0 0 0 0
## 36 0 0 1 1 0
## 37 0 0 1 1 0
## 38 1 0 1 1 0
## 39 0 0 1 0 0
## 40 0 0 1 1 0
## 41 0 0 1 0 0
## 42 0 0 1 1 0
## 43 0 0 1 0 0
## 44 0 0 1 0 0
## 45 1 1 0 0 1
## 46 0 0 1 0 0
## 47 0 0 1 0 0
## 48 0 0 1 0 1
## 49 0 0 0 0 0
## 50 0 0 1 0 0
## 51 0 0 1 1 0
## 52 1 0 1 0 0
## 53 0 0 0 0 0
## 54 0 0 1 0 0
## 55 0 0 1 0 0
## 56 0 0 1 0 0
## 57 0 0 0 1 0
## 58 0 0 1 0 0
## 59 0 0 1 0 0
## 60 0 0 0 0 0
## 61 0 0 1 0 1
## 62 0 0 1 0 0
## 63 0 0 1 0 1
## 64 1 0 0 0 1
## 65 0 0 0 1 0
## 66 0 0 1 0 0
## 67 0 0 0 0 0
## 68 0 0 1 0 1
## 69 0 0 1 0 0
## 70 0 1 0 0 0
## 71 1 1 0 0 1
## 72 0 0 1 0 0
## 73 0 0 1 0 0
## 74 0 0 1 0 1
## 75 0 1 0 0 0
## 76 0 0 1 0 0
## 77 0 0 1 0 0
## 78 0 0 1 1 0
## 79 0 0 0 1 0
## 80 0 0 1 0 1
## 81 0 0 1 0 0
## 82 0 0 0 0 0
## 83 0 0 1 0 0
## 84 1 1 0 0 1
## 85 0 0 0 0 0
## 86 0 0 1 0 1
## 87 0 0 0 0 0
## 88 0 0 1 1 0
## 89 0 0 1 0 0
## 90 1 0 0 0 1
## 91 0 0 1 0 1
## 92 0 0 1 1 0
## 93 0 0 1 0 1
## 94 0 0 1 0 0
## 95 0 0 1 0 0
## 96 0 0 1 0 0
## 97 0 0 1 0 1
## 98 0 1 0 0 0
## 99 0 1 0 0 0
## 100 0 0 1 0 0
glimpse(dataset)
## Rows: 100
## Columns: 5
## $ Y <int> 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0~
## $ D1 <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ D2 <int> 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1~
## $ D3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0~
## $ D4 <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0~
# Total number of rows in the Diabetes data frame
n <- nrow(dataset)
n
## [1] 100
# Number of rows for the training set (80% of the dataset)
n_train <- round(0.80 * n)
# Create a vector of indices which is an 80% random sample
set.seed(123)
train_indices <- sample(1:n, n_train)
# Subset the Diabetes data frame to training indices only
train <- dataset[train_indices, ]
# Exclude the training indices to create the test set
test <- dataset[-train_indices, ]
test
## Y D1 D2 D3 D4
## 1 1 0 1 0 0
## 2 0 0 1 0 0
## 10 0 0 1 0 1
## 11 1 0 1 1 0
## 24 0 0 1 0 0
## 28 1 1 0 0 1
## 35 0 0 0 0 0
## 37 0 0 1 1 0
## 44 0 0 1 0 0
## 45 1 1 0 0 1
## 48 0 0 1 0 1
## 52 1 0 1 0 0
## 55 0 0 1 0 0
## 56 0 0 1 0 0
## 59 0 0 1 0 0
## 68 0 0 1 0 1
## 74 0 0 1 0 1
## 81 0 0 1 0 0
## 88 0 0 1 1 0
## 98 0 1 0 0 0
paste("train sample size: ", dim(train)[1])
## [1] "train sample size: 80"
paste("test sample size: ", dim(test)[1])
## [1] "test sample size: 20"
model_biner<-glm(Y~ D1+D2+D3+D4,family=binomial,train,)
summary(model_biner)
##
## Call:
## glm(formula = Y ~ D1 + D2 + D3 + D4, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3541 -0.6031 -0.3805 -0.3805 2.3072
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.4083 0.7980 -3.018 0.00254 **
## D1 1.0222 1.0344 0.988 0.32304
## D2 -0.1810 0.7843 -0.231 0.81752
## D3 0.9771 0.8373 1.167 0.24326
## D4 1.7924 0.7487 2.394 0.01666 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71.007 on 79 degrees of freedom
## Residual deviance: 62.463 on 75 degrees of freedom
## AIC: 72.463
##
## Number of Fisher Scoring iterations: 5
nagelkerke(model_biner)
## $Models
##
## Model: "glm, Y ~ D1 + D2 + D3 + D4, binomial, train"
## Null: "glm, Y ~ 1, binomial, train"
##
## $Pseudo.R.squared.for.model.vs.null
## Pseudo.R.squared
## McFadden 0.120319
## Cox and Snell (ML) 0.101288
## Nagelkerke (Cragg and Uhler) 0.172156
##
## $Likelihood.ratio.test
## Df.diff LogLik.diff Chisq p.value
## -4 -4.2717 8.5434 0.073581
##
## $Number.of.observations
##
## Model: 80
## Null: 80
##
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
##
## $Warnings
## [1] "None"
#plot(model_biner)
return
## .Primitive("return")
#predict the test dataset
pred <- predict(model_biner, test, type="response")
predicted <- round(pred) # round of the value; >0.5 will convert to
# 1 else 0
# Creating a contigency table
tab <- table(predicted=predicted, Reference = test$Y)
tab
## Reference
## predicted 0 1
## 0 15 3
## 1 0 2
accuracy(tab)
## $Models
## Call
## 1 "Not supported"
## 2 "Not supported"
## 3 "Not supported"
## 4 "Not supported"
##
## $Fit.criteria
## Min.max.accuracy MAE MAPE MSE RMSE NRMSE.mean NRMSE.median
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA
## NRMSE.mean.accuracy NRMSE.median.accuracy Efron.r.squared CV.prcnt
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
# Creating a dataframe of observed and predicted data
act_pred <- data.frame(observed = test$Y, predicted =
factor(predicted))
# Calculating Accuracy
accuracy_est <- accuracy(act_pred, observed, predicted)
print(accuracy_est)
## $Models
## Call
## 1 "Not supported"
## 2 "Not supported"
##
## $Fit.criteria
## Min.max.accuracy MAE MAPE MSE RMSE NRMSE.mean NRMSE.median
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA
## NRMSE.mean.accuracy NRMSE.median.accuracy Efron.r.squared CV.prcnt
## 1 NA NA NA NA
## 2 NA NA NA NA
# Note: the accuracy( ) function gives rounded estimate i.e. 0.885
pred.rocr <- prediction(pred, test$Y)
eval <- performance(pred.rocr, "acc")
plot(eval)
# Identifying the best cutoff value that maximizes accuracy
max <- which.max(slot(eval, "y.values")[[1]])
acc <- slot(eval, "y.values")[[1]][max] #y.values are accuracy
#measures
cut <- slot(eval, "x.values")[[1]][max] #x.values are cutoff
#measures
print(c(Accuracy = acc, Cutoff = cut))
## Accuracy Cutoff.45
## 0.8500000 0.6002243
perf_rocr <- performance(pred.rocr, measure = "auc",
x.measure = "cutoff")
perf_rocr@y.values[[1]] <- round(perf_rocr@y.values[[1]], digits =
4)
perf.tpr.fpr.rocr <- performance(pred.rocr, "tpr", "fpr")
plot(perf.tpr.fpr.rocr, colorize=T,
main = paste("AUC:", (perf_rocr@y.values)))
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.