Link to the project in RPubs: http://rpubs.com/ofomicheva86/379258
#required packages
library(corrplot)
library(PerformanceAnalytics)
library(GGally)
library(RColorBrewer)
library(VIM)
library(dplyr)
library(mice)
library(pROC)
library(caret)
library(pscl)
library(ResourceSelection)
1. DATA EXPLORATION
#read training data set
#TRUE is spelled out because T is an ordinary variable that can be reassigned
data <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/crime-training-data_modified.csv",
  stringsAsFactors = TRUE, header = TRUE)
#read testing data set
data_testing <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/crime-evaluation-data_modified.csv",
  stringsAsFactors = TRUE, header = TRUE)
#display first six entries
head(data)
## zn indus chas nox rm age dis rad tax ptratio lstat medv target
## 1 0 19.58 0 0.605 7.929 96.2 2.0459 5 403 14.7 3.70 50.0 1
## 2 0 19.58 1 0.871 5.403 100.0 1.3216 5 403 14.7 26.82 13.4 1
## 3 0 18.10 0 0.740 6.485 100.0 1.9784 24 666 20.2 18.85 15.4 1
## 4 30 4.93 0 0.428 6.393 7.8 7.0355 6 300 16.6 5.19 23.7 0
## 5 0 2.46 0 0.488 7.155 92.2 2.7006 3 193 17.8 4.82 37.9 0
## 6 0 8.56 0 0.520 6.781 71.3 2.8561 5 384 20.9 7.67 26.5 0
#find dimensions
dim(data)
## [1] 466 13
#chart for missing values
#every column is charted; dropping the first column with data[-1] would hide
#the variable zn, which is a real predictor and not a row index
aggr(data, prop = TRUE, numbers = TRUE, cex.axis = .8,
  ylab = c("Proportion of missingness", "Missingness Pattern"),
  labels = names(data))
#build function that counts missing values
#
#data: a data frame
#returns a data frame with one row per column of `data` and the columns
#  variable_name_column  - name of the column
#  number_missing_column - number of NA entries in that column
#  percentage            - NA entries as a percentage of the rows, rounded to 0 digits
#rows are sorted by descending percentage
count_nas <- function(data){
  #colSums() over the NA mask counts all columns in one step; unlike a loop
  #starting at column 2 it includes the first column and also copes with
  #data frames that have fewer than two columns
  number_missing_column <- unname(colSums(is.na(data)))
  missing_table <- data.frame(
    variable_name_column = colnames(data),
    number_missing_column = number_missing_column)
  missing_table$percentage <- round(number_missing_column * 100 / nrow(data), 0)
  #order() is stable, so columns with equal percentages keep their input order
  missing_table <- missing_table[order(-missing_table$percentage), , drop = FALSE]
  rownames(missing_table) <- NULL
  missing_table
}
#count NAs in the training data; the returned summary table is printed
count_nas(data)
## variable_name_column number_missing_column percentage
## 1 indus 0 0
## 2 chas 0 0
## 3 nox 0 0
## 4 rm 0 0
## 5 age 0 0
## 6 dis 0 0
## 7 rad 0 0
## 8 tax 0 0
## 9 ptratio 0 0
## 10 lstat 0 0
## 11 medv 0 0
## 12 target 0 0
#reorder data: target first, then the binary variable chas, then everything else
data <- data %>% select(target, chas, everything())
#build boxplots for each variable from column 3 onwards (target and chas are skipped)
#par() returns the settings it replaces; they are restored after the loop so the
#2x3 grid does not leak into the plots drawn afterwards
old_par <- par(mfrow = c(2, 3))
for (i in 3:ncol(data)) {
  boxplot(data[, i], main = names(data)[i])
}
par(old_par)
#correlation between variables (all columns except the first one, target)
corrplot(cor(data[2:length(data)]), type = "upper", method = "number",
  tl.cex = 0.8, tl.col = "black", number.cex = .5)
#replacing each variable except binary variable "chas" with variable*log(variable)
#columns 1 and 2 (target, chas) are left untouched; with target moved to the
#front the predictors run from column 3 to the last column, so the loop must
#include the last column (an upper bound of length-1 skips medv)
data_linearity_test <- data
for (i in 3:ncol(data_linearity_test)) {
  values <- data_linearity_test[[i]]
  #zeros are kept as they are because 0*log(0) evaluates to NaN; missing values
  #are skipped instead of stopping the loop with an NA condition
  non_zero <- !is.na(values) & values != 0
  values[non_zero] <- values[non_zero] * log(values[non_zero])
  data_linearity_test[[i]] <- values
}
head(data)
## target chas zn indus nox rm age dis rad tax ptratio lstat medv
## 1 1 0 0 19.58 0.605 7.929 96.2 2.0459 5 403 14.7 3.70 50.0
## 2 1 1 0 19.58 0.871 5.403 100.0 1.3216 5 403 14.7 26.82 13.4
## 3 1 0 0 18.10 0.740 6.485 100.0 1.9784 24 666 20.2 18.85 15.4
## 4 0 0 30 4.93 0.428 6.393 7.8 7.0355 6 300 16.6 5.19 23.7
## 5 0 0 0 2.46 0.488 7.155 92.2 2.7006 3 193 17.8 4.82 37.9
## 6 0 0 0 8.56 0.520 6.781 71.3 2.8561 5 384 20.9 7.67 26.5
#run regression model that includes all independent variables
#the model is fitted on data_linearity_test (the x*log(x) copy), not on data
model <- glm(formula = target ~ ., family = binomial(link = "logit"),
data = data_linearity_test)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#print coefficient estimates, standard errors and deviances of the fit
summary(model)
##
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"),
## data = data_linearity_test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1110 -0.2066 -0.0005 0.0002 3.3940
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.328e+01 7.376e+00 4.513 6.40e-06 ***
## chas 9.166e-01 7.498e-01 1.222 0.22152
## zn -2.819e-02 1.240e-02 -2.273 0.02302 *
## indus -3.332e-02 1.473e-02 -2.263 0.02366 *
## nox 1.300e+02 2.205e+01 5.894 3.77e-09 ***
## rm -1.449e-01 2.440e-01 -0.594 0.55269
## age 6.050e-03 2.571e-03 2.353 0.01861 *
## dis 2.241e-01 8.459e-02 2.649 0.00807 **
## rad 2.565e-01 6.125e-02 4.188 2.82e-05 ***
## tax -1.056e-03 4.508e-04 -2.342 0.01917 *
## ptratio 9.646e-02 3.151e-02 3.062 0.00220 **
## lstat 1.685e-02 1.429e-02 1.179 0.23857
## medv 1.470e-01 6.457e-02 2.277 0.02278 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 191.66 on 453 degrees of freedom
## AIC: 217.66
##
## Number of Fisher Scoring iterations: 9
#disabled log transformation of the predictors, kept for reference
#NOTE(review): if re-enabled, ifelse(zn > 0,log(zn),"") mixes numbers with ""
#and would presumably turn zn into a character column - confirm before use
#data <- data %>% mutate(zn=ifelse(zn > 0,log(zn),""),indus=log(indus),nox = log(nox),
#age = log(age),dis =log(dis),rad = log(rad),tax = log(tax),ptratio = log(ptratio),medv = log(medv))
#show that the data is unchanged
head(data)
## target chas zn indus nox rm age dis rad tax ptratio lstat medv
## 1 1 0 0 19.58 0.605 7.929 96.2 2.0459 5 403 14.7 3.70 50.0
## 2 1 1 0 19.58 0.871 5.403 100.0 1.3216 5 403 14.7 26.82 13.4
## 3 1 0 0 18.10 0.740 6.485 100.0 1.9784 24 666 20.2 18.85 15.4
## 4 0 0 30 4.93 0.428 6.393 7.8 7.0355 6 300 16.6 5.19 23.7
## 5 0 0 0 2.46 0.488 7.155 92.2 2.7006 3 193 17.8 4.82 37.9
## 6 0 0 0 8.56 0.520 6.781 71.3 2.8561 5 384 20.9 7.67 26.5
#convert the variable 'zn' to double format
data$zn <- as.double(data$zn)
#impute missing values with method 'cart'
#a fixed seed makes the randomised imputation reproducible from run to run
imp.data <- mice(data, m = 6, method = "cart", printFlag = FALSE, seed = 621)
#complete() is called without an action argument, so only the default
#(first) of the m = 6 imputed data sets is used from here on
data <- complete(imp.data)
head(data)
## target chas zn indus nox rm age dis rad tax ptratio lstat medv
## 1 1 0 0 19.58 0.605 7.929 96.2 2.0459 5 403 14.7 3.70 50.0
## 2 1 1 0 19.58 0.871 5.403 100.0 1.3216 5 403 14.7 26.82 13.4
## 3 1 0 0 18.10 0.740 6.485 100.0 1.9784 24 666 20.2 18.85 15.4
## 4 0 0 30 4.93 0.428 6.393 7.8 7.0355 6 300 16.6 5.19 23.7
## 5 0 0 0 2.46 0.488 7.155 92.2 2.7006 3 193 17.8 4.82 37.9
## 6 0 0 0 8.56 0.520 6.781 71.3 2.8561 5 384 20.9 7.67 26.5
#build glm model using stepwise approach
#starting point of the search: intercept-only logistic model
model.null <- glm(
  target ~ 1,
  data = data,
  family = binomial(link = "logit")
)
#upper limit of the search: logistic model with every other column as predictor
model.full <- glm(
  target ~ .,
  data = data,
  family = binomial(link = "logit")
)
#terms may be added and removed at each step; every candidate is listed with
#its AIC and a chi-square likelihood-ratio test, and the selected model is printed
step(
  model.null,
  scope = list(upper = model.full),
  direction = "both",
  test = "Chisq",
  data = data
)
## Start: AIC=647.88
## target ~ 1
##
## Df Deviance AIC LRT Pr(>Chi)
## + nox 1 292.01 296.01 353.86 < 2.2e-16 ***
## + rad 1 404.16 408.16 241.71 < 2.2e-16 ***
## + dis 1 409.50 413.50 236.38 < 2.2e-16 ***
## + age 1 424.75 428.75 221.13 < 2.2e-16 ***
## + tax 1 442.38 446.38 203.50 < 2.2e-16 ***
## + indus 1 453.23 457.23 192.64 < 2.2e-16 ***
## + zn 1 518.46 522.46 127.41 < 2.2e-16 ***
## + lstat 1 528.01 532.01 117.87 < 2.2e-16 ***
## + medv 1 609.62 613.62 36.26 1.729e-09 ***
## + ptratio 1 615.64 619.64 30.24 3.823e-08 ***
## + rm 1 634.82 638.82 11.05 0.0008863 ***
## + chas 1 642.86 646.86 3.02 0.0824375 .
## <none> 645.88 647.88
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=296.01
## target ~ nox
##
## Df Deviance AIC LRT Pr(>Chi)
## + rad 1 239.51 245.51 52.50 4.3e-13 ***
## + rm 1 284.63 290.63 7.38 0.006598 **
## + medv 1 285.86 291.86 6.16 0.013103 *
## + indus 1 288.11 294.11 3.90 0.048195 *
## + zn 1 288.29 294.29 3.73 0.053593 .
## + tax 1 288.40 294.40 3.61 0.057432 .
## + chas 1 288.47 294.47 3.54 0.059824 .
## <none> 292.01 296.01
## + ptratio 1 290.14 296.14 1.88 0.170676
## + age 1 290.63 296.63 1.39 0.238898
## + dis 1 290.91 296.91 1.10 0.293997
## + lstat 1 291.93 297.93 0.09 0.770159
## - nox 1 645.88 647.88 353.86 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=245.51
## target ~ nox + rad
##
## Df Deviance AIC LRT Pr(>Chi)
## + tax 1 224.47 232.47 15.039 0.0001053 ***
## + indus 1 233.09 241.09 6.418 0.0112991 *
## + zn 1 235.19 243.19 4.325 0.0375672 *
## + rm 1 236.61 244.61 2.906 0.0882694 .
## + age 1 236.76 244.76 2.748 0.0973934 .
## + medv 1 236.86 244.86 2.651 0.1035095
## + ptratio 1 237.33 245.33 2.180 0.1398571
## <none> 239.51 245.51
## + chas 1 237.64 245.64 1.871 0.1713327
## + dis 1 237.96 245.96 1.548 0.2134708
## + lstat 1 239.47 247.47 0.037 0.8472926
## - rad 1 292.01 296.01 52.501 4.3e-13 ***
## - nox 1 404.16 408.16 164.650 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=232.47
## target ~ nox + rad + tax
##
## Df Deviance AIC LRT Pr(>Chi)
## + ptratio 1 218.70 228.70 5.770 0.0162983 *
## + zn 1 219.94 229.94 4.530 0.0333117 *
## + age 1 220.44 230.44 4.027 0.0447786 *
## <none> 224.47 232.47
## + dis 1 223.30 233.30 1.169 0.2796213
## + indus 1 223.40 233.40 1.076 0.2996421
## + chas 1 223.63 233.63 0.841 0.3592167
## + lstat 1 223.71 233.71 0.760 0.3832294
## + rm 1 223.75 233.75 0.720 0.3960720
## + medv 1 224.27 234.27 0.205 0.6508862
## - tax 1 239.51 245.51 15.039 0.0001053 ***
## - rad 1 288.40 294.40 63.931 1.289e-15 ***
## - nox 1 395.48 401.48 171.012 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=228.7
## target ~ nox + rad + tax + ptratio
##
## Df Deviance AIC LRT Pr(>Chi)
## + age 1 214.46 226.46 4.239 0.03949 *
## + medv 1 215.23 227.23 3.474 0.06233 .
## + rm 1 216.12 228.12 2.581 0.10815
## + zn 1 216.32 228.32 2.386 0.12246
## <none> 218.70 228.70
## + chas 1 216.81 228.81 1.888 0.16944
## + dis 1 217.79 229.79 0.907 0.34078
## + indus 1 217.82 229.82 0.885 0.34693
## + lstat 1 218.57 230.57 0.129 0.71931
## - ptratio 1 224.47 232.47 5.770 0.01630 *
## - tax 1 237.33 245.33 18.630 1.587e-05 ***
## - rad 1 287.59 295.59 68.885 < 2.2e-16 ***
## - nox 1 394.21 402.21 175.507 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=226.46
## target ~ nox + rad + tax + ptratio + age
##
## Df Deviance AIC LRT Pr(>Chi)
## + medv 1 209.55 223.55 4.910 0.02670 *
## + rm 1 212.31 226.31 2.154 0.14217
## + dis 1 212.40 226.40 2.061 0.15115
## <none> 214.46 226.46
## + zn 1 212.67 226.67 1.795 0.18037
## + chas 1 213.24 227.24 1.220 0.26945
## + indus 1 213.38 227.38 1.084 0.29775
## + lstat 1 214.35 228.35 0.113 0.73629
## - age 1 218.70 228.70 4.239 0.03949 *
## - ptratio 1 220.44 230.44 5.983 0.01445 *
## - tax 1 234.99 244.99 20.524 5.889e-06 ***
## - rad 1 286.00 296.00 71.540 < 2.2e-16 ***
## - nox 1 296.04 306.04 81.581 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=223.55
## target ~ nox + rad + tax + ptratio + age + medv
##
## Df Deviance AIC LRT Pr(>Chi)
## + dis 1 203.45 219.45 6.104 0.013484 *
## <none> 209.55 223.55
## + zn 1 207.64 223.64 1.909 0.167123
## + lstat 1 208.07 224.07 1.477 0.224216
## + chas 1 208.33 224.33 1.223 0.268838
## + indus 1 208.58 224.58 0.973 0.324036
## + rm 1 208.79 224.79 0.766 0.381415
## - medv 1 214.46 226.46 4.910 0.026698 *
## - age 1 215.23 227.23 5.675 0.017204 *
## - ptratio 1 219.94 231.94 10.394 0.001264 **
## - tax 1 224.71 236.71 15.159 9.885e-05 ***
## - rad 1 269.51 281.51 59.960 9.679e-15 ***
## - nox 1 294.08 306.08 84.529 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=219.45
## target ~ nox + rad + tax + ptratio + age + medv + dis
##
## Df Deviance AIC LRT Pr(>Chi)
## + zn 1 197.32 215.32 6.124 0.0133321 *
## + chas 1 201.29 219.29 2.157 0.1419100
## + rm 1 201.35 219.35 2.093 0.1480183
## <none> 203.45 219.45
## + lstat 1 202.05 220.05 1.393 0.2378583
## + indus 1 202.23 220.23 1.220 0.2693725
## - dis 1 209.55 223.55 6.104 0.0134845 *
## - medv 1 212.40 226.40 8.954 0.0027685 **
## - age 1 212.97 226.97 9.519 0.0020335 **
## - tax 1 216.21 230.21 12.760 0.0003541 ***
## - ptratio 1 216.35 230.35 12.907 0.0003274 ***
## - rad 1 259.98 273.98 56.530 5.534e-14 ***
## - nox 1 278.84 292.84 75.390 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=215.32
## target ~ nox + rad + tax + ptratio + age + medv + dis + zn
##
## Df Deviance AIC LRT Pr(>Chi)
## <none> 197.32 215.32
## + lstat 1 195.51 215.51 1.808 0.1787290
## + rm 1 195.75 215.75 1.575 0.2094316
## + chas 1 195.97 215.97 1.349 0.2454148
## + indus 1 196.33 216.33 0.995 0.3185882
## - zn 1 203.45 219.45 6.124 0.0133321 *
## - ptratio 1 206.27 222.27 8.948 0.0027770 **
## - age 1 207.13 223.13 9.810 0.0017361 **
## - tax 1 207.62 223.62 10.293 0.0013356 **
## - dis 1 207.64 223.64 10.320 0.0013157 **
## - medv 1 208.65 224.65 11.326 0.0007644 ***
## - rad 1 250.98 266.98 53.659 2.385e-13 ***
## - nox 1 273.18 289.18 75.852 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Call: glm(formula = target ~ nox + rad + tax + ptratio + age + medv +
## dis + zn, family = binomial(link = "logit"), data = data)
##
## Coefficients:
## (Intercept) nox rad tax ptratio
## -37.415922 42.807768 0.725109 -0.007756 0.323628
## age medv dis zn
## 0.032950 0.110472 0.654896 -0.068648
##
## Degrees of Freedom: 465 Total (i.e. Null); 457 Residual
## Null Deviance: 645.9
## Residual Deviance: 197.3 AIC: 215.3
#final model
final.model <- glm(formula = target ~ nox + rad + tax + zn + ptratio + rm + dis + chas,
  family = binomial(link = "logit"), data = data)
#reduced model with fewer parameters
model2 <- glm(formula = target ~ nox + rad + tax + ptratio + age + medv + dis,
  family = binomial(link = "logit"), data = data)
model3 <- glm(formula = target ~ nox + rad + tax + ptratio + age + medv,
  family = binomial(link = "logit"), data = data)
#residual deviance test: upper-tail chi-square probability of the residual
#deviance on the residual degrees of freedom
#lower.tail = FALSE computes the tail directly; 1 - pchisq() loses precision
#when the p-value is very small
p_value <- pchisq(final.model$deviance, final.model$df.residual, lower.tail = FALSE)
p_value
## [1] 1
#Likelihood Ratio Test
#NOTE(review): model2 contains age and medv, which are not in final.model, and
#final.model contains zn, rm and chas, which are not in model2 - the two models
#are not nested, so this chi-square comparison should be read with care
anova(final.model, model2, test ="Chisq")
## Analysis of Deviance Table
##
## Model 1: target ~ nox + rad + tax + zn + ptratio + rm + dis + chas
## Model 2: target ~ nox + rad + tax + ptratio + age + medv + dis
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 457 206.59
## 2 458 203.45 -1 3.1474
#same comparison of final.model against model3
#NOTE(review): model3 uses age and medv, which final.model does not contain, so
#these two models are not nested either
anova(final.model, model3, test ="Chisq")
## Analysis of Deviance Table
##
## Model 1: target ~ nox + rad + tax + zn + ptratio + rm + dis + chas
## Model 2: target ~ nox + rad + tax + ptratio + age + medv
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 457 206.59
## 2 459 209.55 -2 -2.957 0.228
#Pseudo R^2 Test
#prints the log-likelihoods and several pseudo R^2 measures for final.model
pR2(final.model)
## llh llhNull G2 McFadden r2ML
## -103.2973776 -322.9379132 439.2810712 0.6801324 0.6104111
## r2CU
## 0.8139615
#Hosmer-Lemeshow Test
#compares the observed target with the fitted values of final.model using g=10 groups
hoslem.test(data$target, fitted(final.model), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: data$target, fitted(final.model)
## X-squared = 6.754, df = 8, p-value = 0.5634
#create a new variable 'probability' in the training and the testing data
#predict() evaluates final.model itself, so the coefficients do not have to be
#copied by hand (and rounded) into the script and cannot drift from the fit
#type = "response" returns probabilities, i.e. the inverse logit of the linear
#predictor; unname() drops the row-name labels predict() attaches
data$probability <- unname(
  predict(final.model, newdata = data, type = "response"))
data_testing$probability <- unname(
  predict(final.model, newdata = data_testing, type = "response"))
head(data)
## target chas zn indus nox rm age dis rad tax ptratio lstat medv
## 1 1 0 0 19.58 0.605 7.929 96.2 2.0459 5 403 14.7 3.70 50.0
## 2 1 1 0 19.58 0.871 5.403 100.0 1.3216 5 403 14.7 26.82 13.4
## 3 1 0 0 18.10 0.740 6.485 100.0 1.9784 24 666 20.2 18.85 15.4
## 4 0 0 30 4.93 0.428 6.393 7.8 7.0355 6 300 16.6 5.19 23.7
## 5 0 0 0 2.46 0.488 7.155 92.2 2.7006 3 193 17.8 4.82 37.9
## 6 0 0 0 8.56 0.520 6.781 71.3 2.8561 5 384 20.9 7.67 26.5
## probability
## 1 0.87831932
## 2 0.99999469
## 3 0.99999999
## 4 0.01089129
## 5 0.08426746
## 6 0.33981158
#placeholder for the predicted class: assigning the empty c() does not add a
#column yet, target_pred is only filled in further below
data_testing$target_pred <-c()
head(data_testing)
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 2 0 8.14 0 0.538 6.096 84.5 4.4619 4 307 21.0 10.26 18.2
## 3 0 8.14 0 0.538 6.495 94.4 4.4547 4 307 21.0 12.80 18.4
## 4 0 8.14 0 0.538 5.950 82.0 3.9900 4 307 21.0 27.71 13.2
## 5 0 5.96 0 0.499 5.850 41.5 3.9342 5 279 19.2 8.77 21.0
## 6 25 5.13 0 0.453 5.741 66.2 7.2254 8 284 19.7 13.15 18.7
## probability
## 1 0.03834625
## 2 0.55256005
## 3 0.62459101
## 4 0.47497679
## 5 0.19765268
## 6 0.18835552
#derive the predicted class from the probability:
#0 when the probability is below 0.5, otherwise 1
data$target_pred <- ifelse(data$probability < 0.5, 0, 1)
data_testing$target_pred <- ifelse(data_testing$probability < 0.5, 0, 1)
head(data_testing)
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 2 0 8.14 0 0.538 6.096 84.5 4.4619 4 307 21.0 10.26 18.2
## 3 0 8.14 0 0.538 6.495 94.4 4.4547 4 307 21.0 12.80 18.4
## 4 0 8.14 0 0.538 5.950 82.0 3.9900 4 307 21.0 27.71 13.2
## 5 0 5.96 0 0.499 5.850 41.5 3.9342 5 279 19.2 8.77 21.0
## 6 25 5.13 0 0.453 5.741 66.2 7.2254 8 284 19.7 13.15 18.7
## probability target_pred
## 1 0.03834625 0
## 2 0.55256005 1
## 3 0.62459101 1
## 4 0.47497679 0
## 5 0.19765268 0
## 6 0.18835552 0
#export testing data file with predicted class
#write.csv() writes comma-separated values that match the .csv extension
#(write.table() separates fields with spaces by default); row names are only
#running numbers and are left out
#the folder is resolved from the current user's home directory instead of one
#hard-coded account
write.csv(data_testing,
  file = file.path(path.expand("~"), "downloads", "data_testing.csv"),
  row.names = FALSE)
#create confusion matrix (rows: predicted class, columns: actual class)
#fixed factor levels keep the table 2x2 even if one class never occurs
confusion_matrix <- table(factor(data$target_pred, levels = c(0, 1)),
  factor(data$target, levels = c(0, 1)))
confusion_matrix
##
## 0 1
## 0 216 27
## 1 21 202
#cells are addressed as [predicted, actual] by label; a single linear index
#runs down the columns, so index 2 is (predicted 1, actual 0) = FP and
#index 3 is (predicted 0, actual 1) = FN
#calculate true positive (predicted 1, actual 1)
TP <- confusion_matrix["1", "1"]
#calculate true negative (predicted 0, actual 0)
TN <- confusion_matrix["0", "0"]
#calculate false negative (predicted 0, actual 1)
FN <- confusion_matrix["0", "1"]
#calculate false positive (predicted 1, actual 0)
FP <- confusion_matrix["1", "0"]
#calculate accuracy
accuracy <- (TP + TN)/nrow(data)
accuracy
## [1] 0.8969957
#calculate classification error rate
classification_error_rate <- (FP + FN)/(TP + FP + TN + FN)
classification_error_rate
## [1] 0.1030043
#calculate precision: share of predicted positives that are actual positives
precision <- TP/(TP + FP)
precision
## [1] 0.9058296
#calculate sensitivity: share of actual positives that are predicted positive
sensitivity <- TP/(TP + FN)
sensitivity
## [1] 0.8820961
#calculate specificity: share of actual negatives that are predicted negative
specificity <- TN/(TN + FP)
specificity
## [1] 0.9113924
#calculate F1 score (harmonic mean of precision and sensitivity)
F1_score <- (2*precision*sensitivity)/(precision + sensitivity)
F1_score
## [1] 0.8938053
#ROC curve of target against the predicted probability in data (the training
#set), followed by the area under that curve
roc.val <- roc(target~probability, data)
plot(roc.val, main="pROC package ROC plot")
roc.val$auc
## Area under the curve: 0.9691