# Point the session's CRAN repository entry at the US mirror, keeping any
# other configured repositories, then install readr from it.
r <- replace(getOption("repos"), "CRAN", "http://cran.us.r-project.org")
options(repos = r)
install.packages(pkgs = "readr")
## package 'readr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Owner\AppData\Local\Temp\Rtmp2hnsWR\downloaded_packages
library(readr)
## Warning: package 'readr' was built under R version 3.6.1
install.packages("ResourceSelection")
## package 'ResourceSelection' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Owner\AppData\Local\Temp\Rtmp2hnsWR\downloaded_packages
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 3.6.1
## ResourceSelection 0.3-4   2019-01-08

R Markdown

# Read the binary-classifier CSV from its local path into `binclass`.
csv_path <- "C:/Users/Owner/Desktop/Lenin Files/Data Sciences/binary-classifier-data.csv"
binclass <- read_csv(file = csv_path)
## Parsed with column specification:
## cols(
##   label = col_double(),
##   x = col_double(),
##   y = col_double()
## )
# Inspect the loaded data: column names, column types and the first few values.
str(binclass)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1498 obs. of  3 variables:
##  $ label: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ x    : num  70.9 75 73.8 66.4 69.1 ...
##  $ y    : num  83.2 87.9 92.2 81.1 84.5 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   label = col_double(),
##   ..   x = col_double(),
##   ..   y = col_double()
##   .. )
# Fit two logistic regression models (binomial family) for `label`:
#   modelfit1 uses both x and y as predictors,
#   modelfit2 uses x alone.
modelfit1 <- glm(label ~ x + y, data = binclass, family = "binomial")
modelfit2 <- glm(label ~ x, data = binclass, family = "binomial")
# Print the coefficient table and deviance summary of the two-predictor model.
summary(modelfit1)
## 
## Call:
## glm(formula = label ~ x + y, family = "binomial", data = binclass)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3728  -1.1697  -0.9575   1.1646   1.3989  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.424809   0.117224   3.624  0.00029 ***
## x           -0.002571   0.001823  -1.411  0.15836    
## y           -0.007956   0.001869  -4.257 2.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2075.8  on 1497  degrees of freedom
## Residual deviance: 2052.1  on 1495  degrees of freedom
## AIC: 2058.1
## 
## Number of Fisher Scoring iterations: 4
# Same summary for the x-only model, for comparison with modelfit1.
summary(modelfit2)
## 
## Call:
## glm(formula = label ~ x, family = "binomial", data = binclass)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.246  -1.159  -1.064   1.184   1.293  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.137369   0.095119   1.444   0.1487  
## x           -0.004119   0.001775  -2.321   0.0203 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2075.8  on 1497  degrees of freedom
## Residual deviance: 2070.4  on 1496  degrees of freedom
## AIC: 2074.4
## 
## Number of Fisher Scoring iterations: 3
# Likelihood ratio (chi-square) test between the two nested models: does the
# model with x and y have a significantly lower residual deviance than the
# model with x alone?
anova(modelfit1, modelfit2, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: label ~ x + y
## Model 2: label ~ x
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      1495     2052.1                          
## 2      1496     2070.4 -1  -18.329 1.858e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The small p-value (1.858e-05) from the likelihood ratio test indicates that we can reject the null hypothesis that the simpler model (x only) fits as well as the full model. Having both x and y as predictor variables improves the fit, so modelfit1 is the better-fitting logistic regression model.

## ROC curve
#The goodness of fit can be determined using ROC curve
install.packages("ROCR")
## package 'ROCR' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Owner\AppData\Local\Temp\Rtmp2hnsWR\downloaded_packages
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.6.1
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Area under the ROC curve for modelfit1, scored on the same data frame the
# model was fitted to.
prob <- predict(modelfit1, newdata = binclass, type = "response")
pred <- prediction(predictions = prob, labels = binclass[["label"]])
# performance() returns an S4 object; the AUC is the first entry of y.values.
auc <- performance(pred, measure = "auc")@y.values[[1]]
auc
## [1] 0.5732267

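The area under the ROC curve (AUC) is about 0.57. This summarises how well the logistic regression classifier separates the two classes (0.5 corresponds to chance); note that AUC is a ranking measure and is not the same as classification accuracy.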

#Here we are using nearest neighbors algorithm
#Install class package
install.packages('class')
## package 'class' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Owner\AppData\Local\Temp\Rtmp2hnsWR\downloaded_packages
# Load class package
library(class)
## Warning: package 'class' was built under R version 3.6.1
# Min-max scale a vector to the range [0, 1].
#
# x:     numeric vector to rescale.
# na.rm: if TRUE, ignore NAs when computing the range (NA entries stay NA in
#        the result). The default FALSE matches the original behaviour, where
#        any NA makes the whole result NA.
# Returns a double vector of the same length as x. A constant vector maps to
# zeros rather than NaN (0/0), and an empty vector returns numeric(0) without
# the min()/max() warnings.
nor <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (isTRUE(span == 0)) {
    # All values are equal: avoid dividing by zero.
    return((x - rng[1]) / 1)
  }
  (x - rng[1]) / span
}
# Min-max scale BOTH predictors (x and y) before computing distances.
# Selecting only column 2 (`binclass[, 2:2]`) would drop y from the classifier
# and make the row subsets below collapse to plain vectors (dim() == NULL).
binclass2 <- as.data.frame(lapply(binclass[, c("x", "y")], nor))
set.seed(123)
# Randomly pick 70% of the row indices for training (1498 * 0.7 -> 1048 rows).
binclass1 <- sample(
  seq_len(nrow(binclass)),
  size = floor(nrow(binclass) * 0.7),
  replace = FALSE
)
# Training predictors (70%); drop = FALSE keeps the data-frame shape.
trainds <- binclass2[binclass1, , drop = FALSE]
# Test predictors (remaining 30%)
testds <- binclass2[-binclass1, , drop = FALSE]
# Training labels: column 1 (`label`) extracted as a plain vector
trainds2 <- binclass[binclass1, 1, drop = TRUE]
# Test labels
testds2 <- binclass[-binclass1, 1, drop = TRUE]
# Check the shape of each piece of the split
dim(trainds)
## [1] 1048    2
dim(testds)
## [1] 450   2
# The labels are a plain vector, so dim() is NULL for them.
dim(trainds2)
## NULL
# Fit k-nearest-neighbour classifiers with k = 32 and k = 33; k is taken near
# the square root of the number of training observations (sqrt(1048) ~ 32.4).
model1 <- knn(train = trainds, test = testds, cl = trainds2, k = 32)
model2 <- knn(train = trainds, test = testds, cl = trainds2, k = 33)
# Percentage of test rows classified correctly for k = 32 and k = 33
ACCmodel1 <- 100 * mean(testds2 == model1)
ACCmodel2 <- 100 * mean(testds2 == model2)
ACCmodel1
ACCmodel2
## NOTE: the originally recorded output (73.77778 for both k) was produced with
## x as the only feature; re-run this chunk to obtain the accuracies for the
## two-feature (x and y) classifier.

The accuracy of the KNN model in the recorded run is 73.78% for both k = 32 and k = 33. So the KNN classifier performs much better than the logistic regression model, whose AUC was only about 0.57.