Dataset me

library(mlbench)     # for PimaIndians dataset
library(dplyr)       # for data manipulation (dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(broom)       # for making model summary tidy
library(margins)     # to calculate Average Marginal Effects
library(rcompanion)  # to calculate pseudo R2
library(ROCR)        # to compute and plot Reciever Opering Curve

## 
## Attaching package: 'ROCR'

## The following object is masked from 'package:margins':
## 
##     prediction

data<-read.csv("D:/Rencana Tugas Akhir/Dataset3.csv",sep=";")
dataset<-na.omit(data) #removing NA values
dataset

##     Y D1 D2 D3 D4
## 1   1  0  1  0  0
## 2   0  0  1  0  0
## 3   0  0  1  0  0
## 4   0  0  1  0  0
## 5   1  0  1  0  0
## 6   0  1  0  0  1
## 7   0  0  0  0  0
## 8   1  1  0  0  0
## 9   0  0  1  0  0
## 10  0  0  1  0  1
## 11  1  0  1  1  0
## 12  1  0  1  1  0
## 13  0  1  0  1  0
## 14  0  0  1  0  0
## 15  0  0  1  0  0
## 16  0  0  0  0  1
## 17  0  0  1  0  0
## 18  0  0  1  0  0
## 19  1  0  1  0  0
## 20  1  0  0  0  0
## 21  0  0  0  0  0
## 22  0  0  0  1  0
## 23  0  0  1  0  0
## 24  0  0  1  0  0
## 25  0  0  1  0  0
## 26  0  0  1  1  0
## 27  1  0  1  1  0
## 28  1  1  0  0  1
## 29  0  0  0  0  1
## 30  0  0  1  0  0
## 31  0  0  1  0  0
## 32  0  0  1  1  0
## 33  1  0  1  0  1
## 34  1  0  1  0  1
## 35  0  0  0  0  0
## 36  0  0  1  1  0
## 37  0  0  1  1  0
## 38  1  0  1  1  0
## 39  0  0  1  0  0
## 40  0  0  1  1  0
## 41  0  0  1  0  0
## 42  0  0  1  1  0
## 43  0  0  1  0  0
## 44  0  0  1  0  0
## 45  1  1  0  0  1
## 46  0  0  1  0  0
## 47  0  0  1  0  0
## 48  0  0  1  0  1
## 49  0  0  0  0  0
## 50  0  0  1  0  0
## 51  0  0  1  1  0
## 52  1  0  1  0  0
## 53  0  0  0  0  0
## 54  0  0  1  0  0
## 55  0  0  1  0  0
## 56  0  0  1  0  0
## 57  0  0  0  1  0
## 58  0  0  1  0  0
## 59  0  0  1  0  0
## 60  0  0  0  0  0
## 61  0  0  1  0  1
## 62  0  0  1  0  0
## 63  0  0  1  0  1
## 64  1  0  0  0  1
## 65  0  0  0  1  0
## 66  0  0  1  0  0
## 67  0  0  0  0  0
## 68  0  0  1  0  1
## 69  0  0  1  0  0
## 70  0  1  0  0  0
## 71  1  1  0  0  1
## 72  0  0  1  0  0
## 73  0  0  1  0  0
## 74  0  0  1  0  1
## 75  0  1  0  0  0
## 76  0  0  1  0  0
## 77  0  0  1  0  0
## 78  0  0  1  1  0
## 79  0  0  0  1  0
## 80  0  0  1  0  1
## 81  0  0  1  0  0
## 82  0  0  0  0  0
## 83  0  0  1  0  0
## 84  1  1  0  0  1
## 85  0  0  0  0  0
## 86  0  0  1  0  1
## 87  0  0  0  0  0
## 88  0  0  1  1  0
## 89  0  0  1  0  0
## 90  1  0  0  0  1
## 91  0  0  1  0  1
## 92  0  0  1  1  0
## 93  0  0  1  0  1
## 94  0  0  1  0  0
## 95  0  0  1  0  0
## 96  0  0  1  0  0
## 97  0  0  1  0  1
## 98  0  1  0  0  0
## 99  0  1  0  0  0
## 100 0  0  1  0  0

glimpse(dataset)

## Rows: 100
## Columns: 5
## $ Y  <int> 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0~
## $ D1 <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ D2 <int> 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1~
## $ D3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0~
## $ D4 <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0~

# Total number of rows in the Diabetes data frame
n <- nrow(dataset)
n

## [1] 100

# Number of rows for the training set (80% of the dataset)
n_train <- round(0.80 * n)
# Create a vector of indices which is an 80% random sample
set.seed(123)
train_indices <- sample(1:n, n_train)
# Subset the Diabetes data frame to training indices only
train <- dataset[train_indices, ]
# Exclude the training indices to create the test set
test <- dataset[-train_indices, ]
test

##    Y D1 D2 D3 D4
## 1  1  0  1  0  0
## 2  0  0  1  0  0
## 10 0  0  1  0  1
## 11 1  0  1  1  0
## 24 0  0  1  0  0
## 28 1  1  0  0  1
## 35 0  0  0  0  0
## 37 0  0  1  1  0
## 44 0  0  1  0  0
## 45 1  1  0  0  1
## 48 0  0  1  0  1
## 52 1  0  1  0  0
## 55 0  0  1  0  0
## 56 0  0  1  0  0
## 59 0  0  1  0  0
## 68 0  0  1  0  1
## 74 0  0  1  0  1
## 81 0  0  1  0  0
## 88 0  0  1  1  0
## 98 0  1  0  0  0

paste("train sample size: ", dim(train)[1])

## [1] "train sample size:  80"

paste("test sample size: ", dim(test)[1])

## [1] "test sample size:  20"

model_biner<-glm(Y~ D1+D2+D3+D4,family=binomial,train,)
summary(model_biner)

## 
## Call:
## glm(formula = Y ~ D1 + D2 + D3 + D4, family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3541  -0.6031  -0.3805  -0.3805   2.3072  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -2.4083     0.7980  -3.018  0.00254 **
## D1            1.0222     1.0344   0.988  0.32304   
## D2           -0.1810     0.7843  -0.231  0.81752   
## D3            0.9771     0.8373   1.167  0.24326   
## D4            1.7924     0.7487   2.394  0.01666 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71.007  on 79  degrees of freedom
## Residual deviance: 62.463  on 75  degrees of freedom
## AIC: 72.463
## 
## Number of Fisher Scoring iterations: 5

nagelkerke(model_biner)

## $Models
##                                                     
## Model: "glm, Y ~ D1 + D2 + D3 + D4, binomial, train"
## Null:  "glm, Y ~ 1, binomial, train"                
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                             0.120319
## Cox and Snell (ML)                   0.101288
## Nagelkerke (Cragg and Uhler)         0.172156
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq  p.value
##       -4     -4.2717 8.5434 0.073581
## 
## $Number.of.observations
##          
## Model: 80
## Null:  80
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"

#plot(model_biner)
return

## .Primitive("return")

#predict the test dataset
pred <- predict(model_biner, test, type="response")
predicted <- round(pred) # round of the value; >0.5 will convert to 
# 1 else 0
# Creating a contigency table
tab <- table(predicted=predicted, Reference = test$Y)
tab

##          Reference
## predicted  0  1
##         0 15  3
##         1  0  2

accuracy(tab)

## $Models
##   Call           
## 1 "Not supported"
## 2 "Not supported"
## 3 "Not supported"
## 4 "Not supported"
## 
## $Fit.criteria
##   Min.max.accuracy MAE MAPE MSE RMSE NRMSE.mean NRMSE.median
## 1               NA  NA   NA  NA   NA         NA           NA
## 2               NA  NA   NA  NA   NA         NA           NA
## 3               NA  NA   NA  NA   NA         NA           NA
## 4               NA  NA   NA  NA   NA         NA           NA
##   NRMSE.mean.accuracy NRMSE.median.accuracy Efron.r.squared CV.prcnt
## 1                  NA                    NA              NA       NA
## 2                  NA                    NA              NA       NA
## 3                  NA                    NA              NA       NA
## 4                  NA                    NA              NA       NA

# Creating a dataframe of observed and predicted data
act_pred <- data.frame(observed = test$Y, predicted = 
                         factor(predicted))
# Calculating Accuracy
accuracy_est <- accuracy(act_pred, observed, predicted)
print(accuracy_est)

## $Models
##   Call           
## 1 "Not supported"
## 2 "Not supported"
## 
## $Fit.criteria
##   Min.max.accuracy MAE MAPE MSE RMSE NRMSE.mean NRMSE.median
## 1               NA  NA   NA  NA   NA         NA           NA
## 2               NA  NA   NA  NA   NA         NA           NA
##   NRMSE.mean.accuracy NRMSE.median.accuracy Efron.r.squared CV.prcnt
## 1                  NA                    NA              NA       NA
## 2                  NA                    NA              NA       NA

# Note: the accuracy( ) function gives rounded estimate i.e. 0.885
pred.rocr <- prediction(pred, test$Y)
eval <- performance(pred.rocr, "acc")
plot(eval)

# Identifying the best cutoff value that maximizes accuracy
max <- which.max(slot(eval, "y.values")[[1]])
acc <- slot(eval, "y.values")[[1]][max] #y.values are accuracy 
#measures
cut <- slot(eval, "x.values")[[1]][max] #x.values are cutoff 
#measures
print(c(Accuracy = acc, Cutoff = cut))

##  Accuracy Cutoff.45 
## 0.8500000 0.6002243

perf_rocr <- performance(pred.rocr, measure = "auc",
                         x.measure = "cutoff")
perf_rocr@y.values[[1]] <- round(perf_rocr@y.values[[1]], digits = 
                                   4)
perf.tpr.fpr.rocr <- performance(pred.rocr, "tpr", "fpr")
plot(perf.tpr.fpr.rocr, colorize=T, 
     main = paste("AUC:", (perf_rocr@y.values)))

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Dataset me

f1A1

8/25/2022

R Markdown

Including Plots