Link to the project in RPubs: http://rpubs.com/ofomicheva86/379258

#required packages
library(corrplot)
library(PerformanceAnalytics)
library(GGally)
library(RColorBrewer)
library(VIM)
library(dplyr)
library(mice)
library(pROC)
library(caret)
library(pscl)
library(ResourceSelection)

1. DATA EXPLORATION

# Load the training data set from the GitHub-hosted CSV file.
data <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/crime-training-data_modified.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

# Load the testing (evaluation) data set from the GitHub-hosted CSV file.
data_testing <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/crime-evaluation-data_modified.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

# Show the first six rows of the training data.
head(data)
##   zn indus chas   nox    rm   age    dis rad tax ptratio lstat medv target
## 1  0 19.58    0 0.605 7.929  96.2 2.0459   5 403    14.7  3.70 50.0      1
## 2  0 19.58    1 0.871 5.403 100.0 1.3216   5 403    14.7 26.82 13.4      1
## 3  0 18.10    0 0.740 6.485 100.0 1.9784  24 666    20.2 18.85 15.4      1
## 4 30  4.93    0 0.428 6.393   7.8 7.0355   6 300    16.6  5.19 23.7      0
## 5  0  2.46    0 0.488 7.155  92.2 2.7006   3 193    17.8  4.82 37.9      0
## 6  0  8.56    0 0.520 6.781  71.3 2.8561   5 384    20.9  7.67 26.5      0
# Dimensions of the training data (rows, columns).
dim(data)
## [1] 466  13
# Missingness chart for every column except the first one.
# NOTE(review): at this point the first column is 'zn' (see the head() output
# above), so 'zn' is left out of the chart - confirm that this is intended.
aggr(
  data[-1],
  prop = TRUE,
  numbers = TRUE,
  cex.axis = 0.8,
  ylab = c("Proportion of missingness", "Missingness Pattern"),
  labels = names(data)[-1]
)

# Count the missing values in the columns of a data frame.
#
# Arguments:
#   data       data frame to inspect.
#   first_col  index of the first column to inspect. The default of 2 skips
#              the first column, which is what this function has always done.
#              NOTE(review): for the training data the first column is 'zn',
#              so pass first_col = 1 if 'zn' should be counted as well.
#
# Returns a data frame with one row per inspected column and the columns
#   variable_name_column   column name
#   number_missing_column  number of NA values in that column
#   percentage             share of NA values in percent, rounded to 0 digits
# sorted by decreasing percentage (ties keep their original column order).
# When there is no column to inspect (e.g. a one-column data frame with the
# default first_col) an empty table is returned instead of raising an error.
count_nas <- function(data, first_col = 2L) {
  # first_col:ncol(data) would count backwards when first_col > ncol(data),
  # so build the index range explicitly.
  if (first_col > ncol(data)) {
    inspected_cols <- integer(0)
  } else {
    inspected_cols <- seq.int(first_col, ncol(data))
  }
  inspected <- data[inspected_cols]

  variable_name_column <- colnames(inspected)
  number_missing_column <- vapply(
    inspected,
    function(column) sum(is.na(column)),
    integer(1),
    USE.NAMES = FALSE
  )

  missing_table <- data.frame(variable_name_column, number_missing_column)
  missing_table$percentage <- round(number_missing_column * 100 / nrow(data), 0)

  # order() is stable, so tied percentages stay in column order.
  missing_table <- missing_table[order(-missing_table$percentage), , drop = FALSE]
  rownames(missing_table) <- NULL
  missing_table
}

#count NAs per column of the training data
#(count_nas() starts at column 2, so the first column, 'zn', is not listed in the output below)
count_nas(data)
##    variable_name_column number_missing_column percentage
## 1                 indus                     0          0
## 2                  chas                     0          0
## 3                   nox                     0          0
## 4                    rm                     0          0
## 5                   age                     0          0
## 6                   dis                     0          0
## 7                   rad                     0          0
## 8                   tax                     0          0
## 9               ptratio                     0          0
## 10                lstat                     0          0
## 11                 medv                     0          0
## 12               target                     0          0
# Move the response 'target' and the binary 'chas' to the front; the remaining
# columns keep their relative order.
data <- select(data, target, chas, everything())

# Draw one boxplot per variable from column 3 onwards, six plots (2 x 3) per page.
par(mfrow = c(2, 3))
for (col_idx in seq.int(3, ncol(data))) {
  boxplot(data[[col_idx]], main = colnames(data)[col_idx])
}

  2. DATA PREPARATION
  1. Verifying multicollinearity assumption.
# Correlation matrix of every column except the first one, shown as numbers in
# the upper triangle.
corrplot(
  cor(data[-1]),
  type = "upper",
  method = "number",
  tl.cex = 0.8,
  tl.col = "black",
  number.cex = 0.5
)

  1. Verifying linearity assumption.
# Build a copy of the data in which every predictor except the binary variable
# "chas" is replaced with variable*log(variable). 'data' itself is not changed.
data_linearity_test <- data

# Pick the columns by name instead of by position: the former positional range
# 3:(ncol - 1) silently skipped the last column ('medv') once 'target' had been
# moved to the front.
# NOTE(review): the model summary printed further down was presumably produced
# with 'medv' left untransformed - re-run the script to refresh that output.
transform_cols <- setdiff(names(data_linearity_test), c("target", "chas"))

for (col_name in transform_cols) {
  values <- data_linearity_test[[col_name]]

  # Zeros are left as they are because 0 * log(0) is NaN; missing values stay
  # missing instead of aborting the loop.
  nonzero <- !is.na(values) & values != 0
  values[nonzero] <- values[nonzero] * log(values[nonzero])

  data_linearity_test[[col_name]] <- values
}

head(data)
##   target chas zn indus   nox    rm   age    dis rad tax ptratio lstat medv
## 1      1    0  0 19.58 0.605 7.929  96.2 2.0459   5 403    14.7  3.70 50.0
## 2      1    1  0 19.58 0.871 5.403 100.0 1.3216   5 403    14.7 26.82 13.4
## 3      1    0  0 18.10 0.740 6.485 100.0 1.9784  24 666    20.2 18.85 15.4
## 4      0    0 30  4.93 0.428 6.393   7.8 7.0355   6 300    16.6  5.19 23.7
## 5      0    0  0  2.46 0.488 7.155  92.2 2.7006   3 193    17.8  4.82 37.9
## 6      0    0  0  8.56 0.520 6.781  71.3 2.8561   5 384    20.9  7.67 26.5
#run logistic regression model that includes all independent variables
#(fitted on the transformed copy 'data_linearity_test', not on 'data')
model <- glm(formula = target ~ ., family = binomial(link = "logit"),
             data = data_linearity_test)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#print coefficient table and deviance statistics of the fitted model
summary(model)
## 
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"), 
##     data = data_linearity_test)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1110  -0.2066  -0.0005   0.0002   3.3940  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.328e+01  7.376e+00   4.513 6.40e-06 ***
## chas         9.166e-01  7.498e-01   1.222  0.22152    
## zn          -2.819e-02  1.240e-02  -2.273  0.02302 *  
## indus       -3.332e-02  1.473e-02  -2.263  0.02366 *  
## nox          1.300e+02  2.205e+01   5.894 3.77e-09 ***
## rm          -1.449e-01  2.440e-01  -0.594  0.55269    
## age          6.050e-03  2.571e-03   2.353  0.01861 *  
## dis          2.241e-01  8.459e-02   2.649  0.00807 ** 
## rad          2.565e-01  6.125e-02   4.188 2.82e-05 ***
## tax         -1.056e-03  4.508e-04  -2.342  0.01917 *  
## ptratio      9.646e-02  3.151e-02   3.062  0.00220 ** 
## lstat        1.685e-02  1.429e-02   1.179  0.23857    
## medv         1.470e-01  6.457e-02   2.277  0.02278 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 191.66  on 453  degrees of freedom
## AIC: 217.66
## 
## Number of Fisher Scoring iterations: 9
#disabled alternative: log-transform selected predictors of 'data' in place
#(the two lines below are commented out, so they have no effect)
#data <- data %>% mutate(zn=ifelse(zn > 0,log(zn),""),indus=log(indus),nox = log(nox),
#age = log(age),dis =log(dis),rad = log(rad),tax = log(tax),ptratio = log(ptratio),medv = log(medv))

#display first six entries of 'data'
head(data)
##   target chas zn indus   nox    rm   age    dis rad tax ptratio lstat medv
## 1      1    0  0 19.58 0.605 7.929  96.2 2.0459   5 403    14.7  3.70 50.0
## 2      1    1  0 19.58 0.871 5.403 100.0 1.3216   5 403    14.7 26.82 13.4
## 3      1    0  0 18.10 0.740 6.485 100.0 1.9784  24 666    20.2 18.85 15.4
## 4      0    0 30  4.93 0.428 6.393   7.8 7.0355   6 300    16.6  5.19 23.7
## 5      0    0  0  2.46 0.488 7.155  92.2 2.7006   3 193    17.8  4.82 37.9
## 6      0    0  0  8.56 0.520 6.781  71.3 2.8561   5 384    20.9  7.67 26.5
# Convert the variable 'zn' to double format.
data$zn <- as.double(data$zn)

# Impute missing values with classification and regression trees (method
# "cart"), creating m = 6 imputed data sets. The fixed seed makes the
# imputation reproducible from run to run.
imp.data <- mice(data, m = 6, method = "cart", seed = 621, printFlag = FALSE)

# complete() with its default action returns the first imputed data set.
data <- complete(imp.data)
head(data)
##   target chas zn indus   nox    rm   age    dis rad tax ptratio lstat medv
## 1      1    0  0 19.58 0.605 7.929  96.2 2.0459   5 403    14.7  3.70 50.0
## 2      1    1  0 19.58 0.871 5.403 100.0 1.3216   5 403    14.7 26.82 13.4
## 3      1    0  0 18.10 0.740 6.485 100.0 1.9784  24 666    20.2 18.85 15.4
## 4      0    0 30  4.93 0.428 6.393   7.8 7.0355   6 300    16.6  5.19 23.7
## 5      0    0  0  2.46 0.488 7.155  92.2 2.7006   3 193    17.8  4.82 37.9
## 6      0    0  0  8.56 0.520 6.781  71.3 2.8561   5 384    20.9  7.67 26.5
  3. BUILD MODELS
# Stepwise model selection (both directions), starting from the intercept-only
# model, with the model containing all other columns of 'data' as upper scope.
model.null <- glm(
  target ~ 1,
  data = data,
  family = binomial(link = "logit")
)

model.full <- glm(
  target ~ .,
  data = data,
  family = binomial(link = "logit")
)

# Each step reports the chi-square (likelihood ratio) test for the candidate terms.
step(
  model.null,
  scope = list(upper = model.full),
  direction = "both",
  test = "Chisq",
  data = data
)
## Start:  AIC=647.88
## target ~ 1
## 
##           Df Deviance    AIC    LRT  Pr(>Chi)    
## + nox      1   292.01 296.01 353.86 < 2.2e-16 ***
## + rad      1   404.16 408.16 241.71 < 2.2e-16 ***
## + dis      1   409.50 413.50 236.38 < 2.2e-16 ***
## + age      1   424.75 428.75 221.13 < 2.2e-16 ***
## + tax      1   442.38 446.38 203.50 < 2.2e-16 ***
## + indus    1   453.23 457.23 192.64 < 2.2e-16 ***
## + zn       1   518.46 522.46 127.41 < 2.2e-16 ***
## + lstat    1   528.01 532.01 117.87 < 2.2e-16 ***
## + medv     1   609.62 613.62  36.26 1.729e-09 ***
## + ptratio  1   615.64 619.64  30.24 3.823e-08 ***
## + rm       1   634.82 638.82  11.05 0.0008863 ***
## + chas     1   642.86 646.86   3.02 0.0824375 .  
## <none>         645.88 647.88                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=296.01
## target ~ nox
## 
##           Df Deviance    AIC    LRT  Pr(>Chi)    
## + rad      1   239.51 245.51  52.50   4.3e-13 ***
## + rm       1   284.63 290.63   7.38  0.006598 ** 
## + medv     1   285.86 291.86   6.16  0.013103 *  
## + indus    1   288.11 294.11   3.90  0.048195 *  
## + zn       1   288.29 294.29   3.73  0.053593 .  
## + tax      1   288.40 294.40   3.61  0.057432 .  
## + chas     1   288.47 294.47   3.54  0.059824 .  
## <none>         292.01 296.01                     
## + ptratio  1   290.14 296.14   1.88  0.170676    
## + age      1   290.63 296.63   1.39  0.238898    
## + dis      1   290.91 296.91   1.10  0.293997    
## + lstat    1   291.93 297.93   0.09  0.770159    
## - nox      1   645.88 647.88 353.86 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=245.51
## target ~ nox + rad
## 
##           Df Deviance    AIC     LRT  Pr(>Chi)    
## + tax      1   224.47 232.47  15.039 0.0001053 ***
## + indus    1   233.09 241.09   6.418 0.0112991 *  
## + zn       1   235.19 243.19   4.325 0.0375672 *  
## + rm       1   236.61 244.61   2.906 0.0882694 .  
## + age      1   236.76 244.76   2.748 0.0973934 .  
## + medv     1   236.86 244.86   2.651 0.1035095    
## + ptratio  1   237.33 245.33   2.180 0.1398571    
## <none>         239.51 245.51                      
## + chas     1   237.64 245.64   1.871 0.1713327    
## + dis      1   237.96 245.96   1.548 0.2134708    
## + lstat    1   239.47 247.47   0.037 0.8472926    
## - rad      1   292.01 296.01  52.501   4.3e-13 ***
## - nox      1   404.16 408.16 164.650 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=232.47
## target ~ nox + rad + tax
## 
##           Df Deviance    AIC     LRT  Pr(>Chi)    
## + ptratio  1   218.70 228.70   5.770 0.0162983 *  
## + zn       1   219.94 229.94   4.530 0.0333117 *  
## + age      1   220.44 230.44   4.027 0.0447786 *  
## <none>         224.47 232.47                      
## + dis      1   223.30 233.30   1.169 0.2796213    
## + indus    1   223.40 233.40   1.076 0.2996421    
## + chas     1   223.63 233.63   0.841 0.3592167    
## + lstat    1   223.71 233.71   0.760 0.3832294    
## + rm       1   223.75 233.75   0.720 0.3960720    
## + medv     1   224.27 234.27   0.205 0.6508862    
## - tax      1   239.51 245.51  15.039 0.0001053 ***
## - rad      1   288.40 294.40  63.931 1.289e-15 ***
## - nox      1   395.48 401.48 171.012 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=228.7
## target ~ nox + rad + tax + ptratio
## 
##           Df Deviance    AIC     LRT  Pr(>Chi)    
## + age      1   214.46 226.46   4.239   0.03949 *  
## + medv     1   215.23 227.23   3.474   0.06233 .  
## + rm       1   216.12 228.12   2.581   0.10815    
## + zn       1   216.32 228.32   2.386   0.12246    
## <none>         218.70 228.70                      
## + chas     1   216.81 228.81   1.888   0.16944    
## + dis      1   217.79 229.79   0.907   0.34078    
## + indus    1   217.82 229.82   0.885   0.34693    
## + lstat    1   218.57 230.57   0.129   0.71931    
## - ptratio  1   224.47 232.47   5.770   0.01630 *  
## - tax      1   237.33 245.33  18.630 1.587e-05 ***
## - rad      1   287.59 295.59  68.885 < 2.2e-16 ***
## - nox      1   394.21 402.21 175.507 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=226.46
## target ~ nox + rad + tax + ptratio + age
## 
##           Df Deviance    AIC    LRT  Pr(>Chi)    
## + medv     1   209.55 223.55  4.910   0.02670 *  
## + rm       1   212.31 226.31  2.154   0.14217    
## + dis      1   212.40 226.40  2.061   0.15115    
## <none>         214.46 226.46                     
## + zn       1   212.67 226.67  1.795   0.18037    
## + chas     1   213.24 227.24  1.220   0.26945    
## + indus    1   213.38 227.38  1.084   0.29775    
## + lstat    1   214.35 228.35  0.113   0.73629    
## - age      1   218.70 228.70  4.239   0.03949 *  
## - ptratio  1   220.44 230.44  5.983   0.01445 *  
## - tax      1   234.99 244.99 20.524 5.889e-06 ***
## - rad      1   286.00 296.00 71.540 < 2.2e-16 ***
## - nox      1   296.04 306.04 81.581 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=223.55
## target ~ nox + rad + tax + ptratio + age + medv
## 
##           Df Deviance    AIC    LRT  Pr(>Chi)    
## + dis      1   203.45 219.45  6.104  0.013484 *  
## <none>         209.55 223.55                     
## + zn       1   207.64 223.64  1.909  0.167123    
## + lstat    1   208.07 224.07  1.477  0.224216    
## + chas     1   208.33 224.33  1.223  0.268838    
## + indus    1   208.58 224.58  0.973  0.324036    
## + rm       1   208.79 224.79  0.766  0.381415    
## - medv     1   214.46 226.46  4.910  0.026698 *  
## - age      1   215.23 227.23  5.675  0.017204 *  
## - ptratio  1   219.94 231.94 10.394  0.001264 ** 
## - tax      1   224.71 236.71 15.159 9.885e-05 ***
## - rad      1   269.51 281.51 59.960 9.679e-15 ***
## - nox      1   294.08 306.08 84.529 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=219.45
## target ~ nox + rad + tax + ptratio + age + medv + dis
## 
##           Df Deviance    AIC    LRT  Pr(>Chi)    
## + zn       1   197.32 215.32  6.124 0.0133321 *  
## + chas     1   201.29 219.29  2.157 0.1419100    
## + rm       1   201.35 219.35  2.093 0.1480183    
## <none>         203.45 219.45                     
## + lstat    1   202.05 220.05  1.393 0.2378583    
## + indus    1   202.23 220.23  1.220 0.2693725    
## - dis      1   209.55 223.55  6.104 0.0134845 *  
## - medv     1   212.40 226.40  8.954 0.0027685 ** 
## - age      1   212.97 226.97  9.519 0.0020335 ** 
## - tax      1   216.21 230.21 12.760 0.0003541 ***
## - ptratio  1   216.35 230.35 12.907 0.0003274 ***
## - rad      1   259.98 273.98 56.530 5.534e-14 ***
## - nox      1   278.84 292.84 75.390 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=215.32
## target ~ nox + rad + tax + ptratio + age + medv + dis + zn
## 
##           Df Deviance    AIC    LRT  Pr(>Chi)    
## <none>         197.32 215.32                     
## + lstat    1   195.51 215.51  1.808 0.1787290    
## + rm       1   195.75 215.75  1.575 0.2094316    
## + chas     1   195.97 215.97  1.349 0.2454148    
## + indus    1   196.33 216.33  0.995 0.3185882    
## - zn       1   203.45 219.45  6.124 0.0133321 *  
## - ptratio  1   206.27 222.27  8.948 0.0027770 ** 
## - age      1   207.13 223.13  9.810 0.0017361 ** 
## - tax      1   207.62 223.62 10.293 0.0013356 ** 
## - dis      1   207.64 223.64 10.320 0.0013157 ** 
## - medv     1   208.65 224.65 11.326 0.0007644 ***
## - rad      1   250.98 266.98 53.659 2.385e-13 ***
## - nox      1   273.18 289.18 75.852 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call:  glm(formula = target ~ nox + rad + tax + ptratio + age + medv + 
##     dis + zn, family = binomial(link = "logit"), data = data)
## 
## Coefficients:
## (Intercept)          nox          rad          tax      ptratio  
##  -37.415922    42.807768     0.725109    -0.007756     0.323628  
##         age         medv          dis           zn  
##    0.032950     0.110472     0.654896    -0.068648  
## 
## Degrees of Freedom: 465 Total (i.e. Null);  457 Residual
## Null Deviance:       645.9 
## Residual Deviance: 197.3     AIC: 215.3
  4. SELECT MODELS
  1. Test Goodness of Fit
# Final model.
final.model <- glm(
  target ~ nox + rad + tax + zn + ptratio + rm + dis + chas,
  family = binomial(link = "logit"),
  data = data
)

# Comparison models with fewer parameters.
# NOTE(review): model2 and model3 contain 'age' and 'medv', which final.model
# does not, so they are not sub-models of final.model - confirm this is intended.
model2 <- glm(
  target ~ nox + rad + tax + ptratio + age + medv + dis,
  family = binomial(link = "logit"),
  data = data
)

model3 <- glm(
  target ~ nox + rad + tax + ptratio + age + medv,
  family = binomial(link = "logit"),
  data = data
)

# Residual deviance test: upper-tail chi-square probability of the residual
# deviance on the residual degrees of freedom.
p_value <- 1 - pchisq(deviance(final.model), df.residual(final.model))
p_value
## [1] 1
#Likelihood Ratio Test: final.model vs model2
#NOTE(review): a likelihood ratio test assumes nested models; model2 includes
#'age' and 'medv', which final.model lacks - confirm this comparison is intended
anova(final.model, model2, test ="Chisq")
## Analysis of Deviance Table
## 
## Model 1: target ~ nox + rad + tax + zn + ptratio + rm + dis + chas
## Model 2: target ~ nox + rad + tax + ptratio + age + medv + dis
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       457     206.59                     
## 2       458     203.45 -1   3.1474
#Likelihood Ratio Test: final.model vs model3
#NOTE(review): model3 includes 'age' and 'medv', which final.model lacks, so
#the two models are not nested - confirm this comparison is intended
anova(final.model, model3, test ="Chisq")
## Analysis of Deviance Table
## 
## Model 1: target ~ nox + rad + tax + zn + ptratio + rm + dis + chas
## Model 2: target ~ nox + rad + tax + ptratio + age + medv
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       457     206.59                     
## 2       459     209.55 -2   -2.957    0.228
#Pseudo R^2 measures for the final model
pR2(final.model)
##          llh      llhNull           G2     McFadden         r2ML 
## -103.2973776 -322.9379132  439.2810712    0.6801324    0.6104111 
##         r2CU 
##    0.8139615
#Hosmer-Lemeshow Test: observed 'target' vs fitted probabilities of the final model, using g=10 groups
hoslem.test(data$target, fitted(final.model), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  data$target, fitted(final.model)
## X-squared = 6.754, df = 8, p-value = 0.5634
  1. Predict ‘target’ class for testing and training data sets.
# Predicted log-odds (logit) for the training and the testing data.
# They are taken from the fitted 'final.model' rather than from hand-copied,
# rounded coefficients, so the predictions always match the model that was
# actually fitted and the formula is not duplicated.
logit_p <- unname(predict(final.model, newdata = data, type = "link"))
logit_p_testing <- unname(
  predict(final.model, newdata = data_testing, type = "link")
)

# Convert the log-odds to probabilities. plogis() is the logistic function
# exp(x) / (1 + exp(x)); unlike the literal formula it does not overflow to
# NaN for large logits.
data$probability <- plogis(logit_p)
data_testing$probability <- plogis(logit_p_testing)
head(data)
##   target chas zn indus   nox    rm   age    dis rad tax ptratio lstat medv
## 1      1    0  0 19.58 0.605 7.929  96.2 2.0459   5 403    14.7  3.70 50.0
## 2      1    1  0 19.58 0.871 5.403 100.0 1.3216   5 403    14.7 26.82 13.4
## 3      1    0  0 18.10 0.740 6.485 100.0 1.9784  24 666    20.2 18.85 15.4
## 4      0    0 30  4.93 0.428 6.393   7.8 7.0355   6 300    16.6  5.19 23.7
## 5      0    0  0  2.46 0.488 7.155  92.2 2.7006   3 193    17.8  4.82 37.9
## 6      0    0  0  8.56 0.520 6.781  71.3 2.8561   5 384    20.9  7.67 26.5
##   probability
## 1  0.87831932
## 2  0.99999469
## 3  0.99999999
## 4  0.01089129
## 5  0.08426746
## 6  0.33981158
# Drop any existing 'target_pred' column from the testing data; the predicted
# class is assigned further down. Assigning NULL does nothing when the column
# does not exist yet.
data_testing$target_pred <- NULL
head(data_testing)
##   zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 2  0  8.14    0 0.538 6.096 84.5 4.4619   4 307    21.0 10.26 18.2
## 3  0  8.14    0 0.538 6.495 94.4 4.4547   4 307    21.0 12.80 18.4
## 4  0  8.14    0 0.538 5.950 82.0 3.9900   4 307    21.0 27.71 13.2
## 5  0  5.96    0 0.499 5.850 41.5 3.9342   5 279    19.2  8.77 21.0
## 6 25  5.13    0 0.453 5.741 66.2 7.2254   8 284    19.7 13.15 18.7
##   probability
## 1  0.03834625
## 2  0.55256005
## 3  0.62459101
## 4  0.47497679
## 5  0.19765268
## 6  0.18835552
# Assign the predicted class from the predicted probability:
# 0 when the probability is below 0.5, otherwise 1.
data$target_pred <- ifelse(data$probability < 0.5, 0, 1)

data_testing$target_pred <- ifelse(data_testing$probability < 0.5, 0, 1)

head(data_testing)
##   zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 2  0  8.14    0 0.538 6.096 84.5 4.4619   4 307    21.0 10.26 18.2
## 3  0  8.14    0 0.538 6.495 94.4 4.4547   4 307    21.0 12.80 18.4
## 4  0  8.14    0 0.538 5.950 82.0 3.9900   4 307    21.0 27.71 13.2
## 5  0  5.96    0 0.499 5.850 41.5 3.9342   5 279    19.2  8.77 21.0
## 6 25  5.13    0 0.453 5.741 66.2 7.2254   8 284    19.7 13.15 18.7
##   probability target_pred
## 1  0.03834625           0
## 2  0.55256005           1
## 3  0.62459101           1
## 4  0.47497679           0
## 5  0.19765268           0
## 6  0.18835552           0
# Export the testing data with the predicted probability and class as a real
# comma-separated file. write.table() with its defaults wrote space-separated
# values plus a row-name column under a .csv file name. An existing file is
# overwritten, as before.
# NOTE(review): the absolute path only exists on the author's machine -
# consider a relative path.
write.csv(data_testing, file = "/Users/olga/downloads/data_testing.csv",
          row.names = FALSE)
  1. Calculate Classification Metrics.
# Create the confusion matrix: rows = predicted class, columns = actual class.
# Fixing the factor levels keeps the table 2 x 2 even if one class never occurs.
confusion_matrix <- table(
  factor(data$target_pred, levels = c(0, 1)),
  factor(data$target, levels = c(0, 1))
)
confusion_matrix
##    
##       0   1
##   0 216  27
##   1  21 202
# The cells are addressed as [predicted, actual] by class label. Linear indices
# run down the columns, so [2] is (predicted 1, actual 0) and [3] is
# (predicted 0, actual 1); using them the other way round swapped FN and FP.

# True positives: predicted 1, actual 1.
TP <- confusion_matrix["1", "1"]

# True negatives: predicted 0, actual 0.
TN <- confusion_matrix["0", "0"]

# False negatives: predicted 0, actual 1.
FN <- confusion_matrix["0", "1"]

# False positives: predicted 1, actual 0.
FP <- confusion_matrix["1", "0"]

# Accuracy: share of correctly classified observations.
accuracy <- (TP + TN) / nrow(data)
accuracy
## [1] 0.8969957
# Classification error rate: share of misclassified observations.
classification_error_rate <- (FP + FN) / (TP + FP + TN + FN)
classification_error_rate
## [1] 0.1030043
# Precision: share of predicted positives that are actual positives.
precision <- TP / (TP + FP)
precision
## [1] 0.9058296
# Sensitivity: share of actual positives that are predicted positive.
sensitivity <- TP / (TP + FN)
sensitivity
## [1] 0.8820961
# Specificity: share of actual negatives that are predicted negative.
specificity <- TN / (TN + FP)
specificity
## [1] 0.9113924
# F1 score: harmonic mean of precision and sensitivity.
F1_score <- (2 * precision * sensitivity) / (precision + sensitivity)
F1_score
## [1] 0.8938053
# ROC curve of the predicted probability against the observed class.
roc.val <- roc(target ~ probability, data = data)
plot(roc.val, main = "pROC package ROC plot")

# Area under the ROC curve.
roc.val[["auc"]]
## Area under the curve: 0.9691