DATA 621 Homework #3

Data Exploration

Are There Missing Values?

First, we check whether any variables have missing values:
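A minimal check along these lines could look like the following (the raw data frame name crime_data is an illustrative assumption):

colSums(is.na(crime_data))   # count of NAs per column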

It looks like we have a complete data set, so there is no need to impute values.

Splitting the Data

Next, we split our data into a training set (train) and a test set (test), using a 70-30 split.
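One way such a split could be produced is sketched below (the seed and the crime_data object name are illustrative assumptions, not necessarily what was run):

set.seed(123)
train_idx <- sample(seq_len(nrow(crime_data)), size = floor(0.7 * nrow(crime_data)))
train <- crime_data[train_idx, ]    # ~70% of observations
test  <- crime_data[-train_idx, ]   # remaining ~30%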

Exploratory Data Analysis

Now, let's explore our training data. A correlation matrix shows which variables may be too highly correlated with one another to include together as predictors in the same model; this will help us later during model selection.
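A minimal way to compute and visualize the correlation matrix (the corrplot package is an assumption; any correlation heatmap would do):

library(corrplot)
corrplot(cor(train), method = "circle", type = "upper")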

Next, we look at how each potential predictor is distributed across the two levels of the target variable.

The following plots show how each predictor is distributed for a positive target (areas with crime rates above the median, shown in blue) versus a negative target (areas with crime rates below the median, shown in red). What we are looking for are variables whose distributions offer a clear way to split the data into the two groups.
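A sketch of how these per-predictor density plots could be generated (the package choices here are assumptions):

library(ggplot2)
library(tidyr)

long <- pivot_longer(train, -target, names_to = "variable", values_to = "value")
ggplot(long, aes(x = value, fill = factor(target))) +
  geom_density(alpha = 0.4) +                                  # overlapping class densities
  scale_fill_manual(values = c("0" = "red", "1" = "blue")) +   # red = below median, blue = above
  facet_wrap(~ variable, scales = "free")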

Looking at the plots above, nox appears to be the single best variable for dividing the data into the two groups.

Data Preparation

Feature Engineering

library(fuzzyjoin)
library(ggplot2)

# For a given predictor `var`, standardize it to [0, 1], estimate its density
# separately for each level of `target`, compute the difference between the two
# class densities, and attach that difference to both the training and
# evaluation data as `new_column`.
density_diff <- function(data, eval_data, var, target, new_column){
  data[[paste0(var,"temp")]] <- data[[var]]
  eval_data[[paste0(var,"temp")]] <- eval_data[[var]]
          
  #standardize variable between 0 and 1
  data[[var]] <- (data[[var]] - min(data[[var]]))/(max(data[[var]]) - min(data[[var]]))
  eval_data[[var]] <- (eval_data[[var]] - min(eval_data[[var]]))/(max(eval_data[[var]]) - min(eval_data[[var]]))
              
  ## Calculate density estimates
  g1 <- ggplot(data, aes(x=data[[var]], group=target, colour=target)) +
    geom_density(data = data) + xlim(min(data[[var]]), max(data[[var]]))
  gg1 <- ggplot_build(g1)
  # construct the dataset
  x <- gg1$data[[1]]$x[gg1$data[[1]]$group == 1]
  y1 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 1]
  y2 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 2]
  df2 <- data.frame(x = x, ymin = pmin(y1, y2), ymax = pmax(y1, y2), side=(y1<y2), ydiff = y2-y1)
  ##creating the second graph object
  g3 <- ggplot(df2) +
    geom_line(aes(x = x, y = ydiff, colour = side)) +
    geom_area(aes(x = x, y = ydiff, fill = side), alpha = 0.4) +
    guides(fill = "none")
  
  data$join <- data[[var]]
  df2$join <- df2$x
  temp <- difference_left_join(data,df2, by="join", max_dist =.001)
  means <- aggregate(ydiff ~ temp$join.x, temp, mean)
  
  colnames(means) <- c("std_var","density_diff")
  
  new_data <- merge(means,data, by.x ="std_var", by.y =var)
  new_data$std_var <- NULL
  new_data$join <- NULL
  new_data[new_column] <- new_data$density_diff
  new_data$density_diff <- NULL
  ## Repeat the same join and merge for the evaluation data
  eval_data$join <- eval_data[[var]]
  df2$join <- df2$x
  temp <- difference_left_join(eval_data,df2, by="join", max_dist =.001)
  means <- aggregate(ydiff ~ temp$join.x, temp, mean)
                
  colnames(means) <- c("std_var","density_diff")
  
  eval_data2 <- merge(means,eval_data, by.x ="std_var", by.y =var)
  eval_data2$std_var <- NULL
  eval_data2$join <- NULL
  eval_data2[new_column] <- eval_data2$density_diff
  eval_data2$density_diff <- NULL
  
  eval_data2[[var]] <- eval_data2[[paste0(var,"temp")]]
  eval_data2[[paste0(var,"temp")]] <- NULL
  
  new_data[[var]] <- new_data[[paste0(var,"temp")]]
  new_data[[paste0(var,"temp")]] <- NULL
  
  mylist <- list(g1,g3,new_data,eval_data2)
  names(mylist)<- c("dist","dist_diff","new_data_training","new_data_eval")
  return(mylist)
}

Creating Some New Variables

'data.frame':   327 obs. of  13 variables:
 $ zn     : num  0 0 0 30 0 0 80 22 0 22 ...
 $ indus  : num  19.58 19.58 18.1 4.93 2.46 ...
 $ chas   : int  0 1 0 0 0 0 0 0 0 0 ...
 $ nox    : num  0.605 0.871 0.74 0.428 0.488 0.52 0.392 0.431 0.437 0.431 ...
 $ rm     : num  7.93 5.4 6.49 6.39 7.16 ...
 $ age    : num  96.2 100 100 7.8 92.2 71.3 19.1 8.9 45 8.4 ...
 $ dis    : num  2.05 1.32 1.98 7.04 2.7 ...
 $ rad    : int  5 5 24 6 3 5 1 7 5 7 ...
 $ tax    : int  403 403 666 300 193 384 315 330 398 330 ...
 $ ptratio: num  14.7 14.7 20.2 16.6 17.8 20.9 16.4 19.1 18.7 19.1 ...
 $ lstat  : num  3.7 26.82 18.85 5.19 4.82 ...
 $ medv   : num  50 13.4 15.4 23.7 37.9 26.5 20.9 24.8 21.4 42.8 ...
 $ target : int  1 1 1 0 0 0 0 0 0 1 ...
'data.frame':   139 obs. of  13 variables:
 $ zn     : num  0 0 0 0 0 100 0 0 0 0 ...
 $ indus  : num  18.1 18.1 5.19 18.1 2.46 1.32 18.1 3.24 6.2 2.89 ...
 $ chas   : int  0 0 0 0 0 0 0 0 1 0 ...
 $ nox    : num  0.693 0.693 0.515 0.532 0.488 0.411 0.679 0.46 0.507 0.445 ...
 $ rm     : num  5.45 4.52 6.32 7.06 6.15 ...
 $ age    : num  100 100 38.1 77 68.8 40.5 95.4 32.2 66.5 62.5 ...
 $ dis    : num  1.49 1.66 6.46 3.41 3.28 ...
 $ rad    : int  24 24 5 24 3 5 24 4 8 2 ...
 $ tax    : int  666 666 224 666 193 256 666 430 307 276 ...
 $ ptratio: num  20.2 20.2 20.2 20.2 17.8 15.1 20.2 16.9 17.4 18 ...
 $ lstat  : num  30.59 36.98 5.68 7.01 13.15 ...
 $ medv   : num  5 7 22.2 25 29.6 31.6 8.3 19.8 29 33.2 ...
 $ target : int  1 1 0 1 0 0 1 0 1 0 ...
## Add the density-difference features to the training and test sets
train <- density_diff(train,test,"nox",train$target,"nox_density")$new_data_training
train <- density_diff(train,test,"age",train$target,"age_density")$new_data_training 
train <- density_diff(train,test,"indus",train$target,"indus_density")$new_data_training
train <- density_diff(train,test,"dis",train$target,"dis_density")$new_data_training
train <- density_diff(train,test,"rad",train$target,"rad_density")$new_data_training
train <- density_diff(train,test,"tax",train$target,"tax_density")$new_data_training
train <- density_diff(train,test,"ptratio",train$target,"ptratio_density")$new_data_training
train <- density_diff(train,test,"lstat",train$target,"lstat_density")$new_data_training
train <- density_diff(train,test,"medv",train$target,"medv_density")$new_data_training

# For the test set we keep the new_data_eval element returned by density_diff()
test <- density_diff(train,test,"nox",train$target,"nox_density")$new_data_eval
test <- density_diff(train,test,"age",train$target,"age_density")$new_data_eval
test <- density_diff(train,test,"indus",train$target,"indus_density")$new_data_eval
test <- density_diff(train,test,"dis",train$target,"dis_density")$new_data_eval
test <- density_diff(train,test,"rad",train$target,"rad_density")$new_data_eval
test <- density_diff(train,test,"tax",train$target,"tax_density")$new_data_eval
test <- density_diff(train,test,"ptratio",train$target,"ptratio_density")$new_data_eval
test <- density_diff(train,test,"lstat",train$target,"lstat_density")$new_data_eval
test <- density_diff(train,test,"medv",train$target,"medv_density")$new_data_eval

str(train)
'data.frame':   327 obs. of  22 variables:
 $ zn             : num  0 0 0 0 0 0 0 0 0 0 ...
 $ chas           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ rm             : num  5.68 5.99 5.28 5 6.78 ...
 $ target         : int  1 1 1 1 1 0 1 1 1 1 ...
 $ nox_density    : num  1.73 1.73 1.7 1.7 1.71 ...
 $ nox            : num  0.693 0.693 0.7 0.7 0.679 0.609 0.718 0.693 0.74 0.679 ...
 $ age_density    : num  3.15 3.15 3.65 2.04 2.41 ...
 $ age            : num  100 100 98.1 89.5 90.8 98 76.5 85.4 100 95.6 ...
 $ indus_density  : num  3.58 3.58 3.58 3.58 3.58 ...
 $ indus          : num  18.1 18.1 18.1 18.1 18.1 ...
 $ dis_density    : num  3.02 3.47 3.02 3.31 3.64 ...
 $ dis            : num  1.43 1.59 1.43 1.52 1.82 ...
 $ rad_density    : num  1.54 1.54 1.54 1.54 1.54 ...
 $ rad            : int  24 24 24 24 24 4 24 24 24 24 ...
 $ tax_density    : num  1.95 1.95 1.95 1.95 1.95 ...
 $ tax            : int  666 666 666 666 666 711 666 666 666 666 ...
 $ ptratio_density: num  3.87 3.87 3.87 3.87 3.87 ...
 $ ptratio        : num  20.2 20.2 20.2 20.2 20.2 20.1 20.2 20.2 20.2 20.2 ...
 $ lstat_density  : num  1.013 0.723 0.248 0.222 0.822 ...
 $ lstat          : num  23 26.8 30.8 32 25.8 ...
 $ medv_density   : num  0.258 0.325 0.546 0.576 0.591 ...
 $ medv           : num  5 5.6 7.2 7.4 7.5 8.1 8.4 8.5 8.7 9.5 ...
'data.frame':   327 obs. of  22 variables:
 $ zn             : num  0 0 0 0 0 0 0 0 0 0 ...
 $ chas           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ rm             : num  5.68 5.99 5.28 5 6.78 ...
 $ target         : int  1 1 1 1 1 0 1 1 1 1 ...
 $ nox_density    : num  1.73 1.73 1.7 1.7 1.71 ...
 $ nox            : num  0.693 0.693 0.7 0.7 0.679 0.609 0.718 0.693 0.74 0.679 ...
 $ age_density    : num  3.15 3.15 3.65 2.04 2.41 ...
 $ age            : num  100 100 98.1 89.5 90.8 98 76.5 85.4 100 95.6 ...
 $ indus_density  : num  3.58 3.58 3.58 3.58 3.58 ...
 $ indus          : num  18.1 18.1 18.1 18.1 18.1 ...
 $ dis_density    : num  3.02 3.47 3.02 3.31 3.64 ...
 $ dis            : num  1.43 1.59 1.43 1.52 1.82 ...
 $ rad_density    : num  1.54 1.54 1.54 1.54 1.54 ...
 $ rad            : int  24 24 24 24 24 4 24 24 24 24 ...
 $ tax_density    : num  1.95 1.95 1.95 1.95 1.95 ...
 $ tax            : int  666 666 666 666 666 711 666 666 666 666 ...
 $ ptratio_density: num  3.87 3.87 3.87 3.87 3.87 ...
 $ ptratio        : num  20.2 20.2 20.2 20.2 20.2 20.1 20.2 20.2 20.2 20.2 ...
 $ lstat_density  : num  1.013 0.723 0.248 0.222 0.822 ...
 $ lstat          : num  23 26.8 30.8 32 25.8 ...
 $ medv_density   : num  0.258 0.325 0.546 0.576 0.591 ...
 $ medv           : num  5 5.6 7.2 7.4 7.5 8.1 8.4 8.5 8.7 9.5 ...

Model Building

We start by applying Occam's razor and creating a baseline model with a single predictor. Any model we build beyond that will have to outperform this simplest model.
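A sketch of the baseline fit, mirroring the Call shown in the output below (the object name baseline is illustrative):

baseline <- glm(target ~ nox, family = binomial(link = "logit"), data = train)
summary(baseline)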


Call:
glm(formula = target ~ nox, family = binomial(link = "logit"), 
    data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1163  -0.4050  -0.1814   0.2717   2.5754  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -16.370      1.823  -8.981   <2e-16 ***
nox           30.372      3.444   8.818   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 451.70  on 326  degrees of freedom
Residual deviance: 205.67  on 325  degrees of freedom
AIC: 209.67

Number of Fisher Scoring iterations: 6
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 152  38
         1  23 114
                                          
               Accuracy : 0.8135          
                 95% CI : (0.7669, 0.8542)
    No Information Rate : 0.5352          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.6226          
                                          
 Mcnemar's Test P-Value : 0.07305         
                                          
            Sensitivity : 0.7500          
            Specificity : 0.8686          
         Pos Pred Value : 0.8321          
         Neg Pred Value : 0.8000          
             Prevalence : 0.4648          
         Detection Rate : 0.3486          
   Detection Prevalence : 0.4190          
      Balanced Accuracy : 0.8093          
                                          
       'Positive' Class : 1               
                                          

Our baseline model does reasonably well, with an F1 score of 0.7889273.
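For reference, the F1 score follows directly from the confusion matrix above, treating class 1 as the positive class:

precision <- 114 / (114 + 23)                    # Pos Pred Value: TP / (TP + FP)
recall    <- 114 / (114 + 38)                    # Sensitivity:    TP / (TP + FN)
2 * precision * recall / (precision + recall)    # ~0.789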

Next, we add all of the remaining variables to build a full model. From there we can work backward and eliminate non-significant predictors:


Call:
glm(formula = target ~ ., family = binomial(link = "logit"), 
    data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.5526  -0.0665   0.0000   0.0000   3.2421  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -62.180562  24.584518  -2.529  0.01143 *  
zn               -0.050415   0.040815  -1.235  0.21676    
chas             -0.195603   1.093905  -0.179  0.85809    
rm                0.011261   1.333754   0.008  0.99326    
nox_density      -0.179313   1.346273  -0.133  0.89404    
nox              76.295730  37.680730   2.025  0.04289 *  
age_density       0.301197   0.421311   0.715  0.47467    
age              -0.006903   0.026806  -0.258  0.79677    
indus_density     4.521690   1.592202   2.840  0.00451 ** 
indus            -0.597686   0.193076  -3.096  0.00196 ** 
dis_density       0.879526   0.541537   1.624  0.10435    
dis               0.907365   0.457941   1.981  0.04755 *  
rad_density      -0.108287   0.247780  -0.437  0.66209    
rad               1.652715   0.441300   3.745  0.00018 ***
tax_density      -1.857615   1.189922  -1.561  0.11849    
tax              -0.033081   0.018735  -1.766  0.07743 .  
ptratio_density  -0.683744   0.340432  -2.008  0.04459 *  
ptratio           0.987145   0.389351   2.535  0.01123 *  
lstat_density    -0.508557   0.537084  -0.947  0.34370    
lstat             0.216231   0.127437   1.697  0.08974 .  
medv_density      0.538486   0.293699   1.833  0.06673 .  
medv              0.219624   0.132442   1.658  0.09727 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 451.699  on 326  degrees of freedom
Residual deviance:  89.297  on 305  degrees of freedom
AIC: 133.3

Number of Fisher Scoring iterations: 11
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 167   9
         1   8 143
                                          
               Accuracy : 0.948           
                 95% CI : (0.9181, 0.9694)
    No Information Rate : 0.5352          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.8955          
                                          
 Mcnemar's Test P-Value : 1               
                                          
            Sensitivity : 0.9408          
            Specificity : 0.9543          
         Pos Pred Value : 0.9470          
         Neg Pred Value : 0.9489          
             Prevalence : 0.4648          
         Detection Rate : 0.4373          
   Detection Prevalence : 0.4618          
      Balanced Accuracy : 0.9475          
                                          
       'Positive' Class : 1               
                                          

The full model has an F1 score of 0.9438944, which is higher than the baseline. However, many variables do not appear to be significant.

After some backward elimination of non-significant predictor variables, we arrive at the following model:
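The reduced model can be refit with the formula shown in the Call below (the object name model1 matches the comparison table later in this report):

model1 <- glm(target ~ . - tax - rm - chas - age - zn - indus,
              family = binomial(link = "logit"), data = train)
summary(model1)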


Call:
glm(formula = target ~ . - tax - rm - chas - age - zn - indus, 
    family = binomial(link = "logit"), data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6423  -0.0708  -0.0019   0.0000   3.5667  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -48.50337   18.01499  -2.692 0.007094 ** 
nox_density       1.11605    0.99779   1.119 0.263346    
nox              29.26009   25.53773   1.146 0.251895    
age_density       0.10263    0.24099   0.426 0.670197    
indus_density     1.40396    0.49866   2.815 0.004871 ** 
dis_density       0.20473    0.43095   0.475 0.634740    
dis               0.58861    0.34861   1.688 0.091322 .  
rad_density      -0.10699    0.19274  -0.555 0.578847    
rad               1.30622    0.29319   4.455 8.38e-06 ***
tax_density      -2.80661    0.75411  -3.722 0.000198 ***
ptratio_density  -0.29021    0.27678  -1.049 0.294394    
ptratio           0.73022    0.27325   2.672 0.007532 ** 
lstat_density    -0.49402    0.46750  -1.057 0.290635    
lstat             0.21515    0.11687   1.841 0.065619 .  
medv_density      0.71614    0.27618   2.593 0.009513 ** 
medv              0.22990    0.07219   3.185 0.001450 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 451.70  on 326  degrees of freedom
Residual deviance: 102.82  on 311  degrees of freedom
AIC: 134.82

Number of Fisher Scoring iterations: 10
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 167  12
         1   8 140
                                          
               Accuracy : 0.9388          
                 95% CI : (0.9071, 0.9622)
    No Information Rate : 0.5352          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.8769          
                                          
 Mcnemar's Test P-Value : 0.5023          
                                          
            Sensitivity : 0.9211          
            Specificity : 0.9543          
         Pos Pred Value : 0.9459          
         Neg Pred Value : 0.9330          
             Prevalence : 0.4648          
         Detection Rate : 0.4281          
   Detection Prevalence : 0.4526          
      Balanced Accuracy : 0.9377          
                                          
       'Positive' Class : 1               
                                          

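We also fit a model that uses only the engineered density-difference variables as predictors (the "density models" row in the comparison table further below):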
Call:
glm(formula = target ~ nox_density + indus_density + age_density + 
    dis_density + rad_density + tax_density + ptratio_density + 
    lstat_density + medv_density, family = binomial(link = "logit"), 
    data = train)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.96836  -0.31245  -0.06225   0.20194   3.01402  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)       0.8093     0.5320   1.521 0.128210    
nox_density       1.1492     0.2600   4.420 9.86e-06 ***
indus_density     1.0706     0.2488   4.302 1.69e-05 ***
age_density       0.1219     0.1698   0.718 0.472916    
dis_density      -0.1849     0.2138  -0.865 0.387284    
rad_density       0.4141     0.1226   3.379 0.000727 ***
tax_density      -1.2582     0.4408  -2.854 0.004314 ** 
ptratio_density   0.1550     0.1524   1.017 0.309139    
lstat_density    -0.2456     0.2055  -1.195 0.231963    
medv_density      0.5417     0.2062   2.628 0.008596 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 451.70  on 326  degrees of freedom
Residual deviance: 164.12  on 317  degrees of freedom
AIC: 184.12

Number of Fisher Scoring iterations: 7
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 167  34
         1   8 118
                                          
               Accuracy : 0.8716          
                 95% CI : (0.8304, 0.9058)
    No Information Rate : 0.5352          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.7389          
                                          
 Mcnemar's Test P-Value : 0.0001145       
                                          
            Sensitivity : 0.9543          
            Specificity : 0.7763          
         Pos Pred Value : 0.8308          
         Neg Pred Value : 0.9365          
             Prevalence : 0.5352          
         Detection Rate : 0.5107          
   Detection Prevalence : 0.6147          
      Balanced Accuracy : 0.8653          
                                          
       'Positive' Class : 0               
                                          

Since the assignment states that the purpose is prediction, we prefer the F1 score as our measure of model success.

model            predictors         F1    deviance         R2        AIC
baseline                  1  0.7889273   205.67297  0.5446683   209.6730
fullmodel                12  0.9438944    89.29743  0.8023078   133.2974
model1                    6  0.9333333   102.81625  0.7723789   134.8163
density models            9  0.8882979   164.11933  0.6366623   184.1193

Comparing these models, model1 looks like the best one. Although the full model scored a slightly higher F1 on our test data set, model1 has half the predictors and its scores are almost exactly the same as the full model's.

Model Selection

pROC Output
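The ROC curves and AUC values below come from the pROC package. A representative call, matching the Call shown in each output (each model's predicted probabilities are stored as a column on the test set), looks like:

library(pROC)
roc(response = test[["target"]], predictor = test[["baseline"]],
    plot = TRUE, legacy.axes = TRUE, print.auc = TRUE)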

Baseline


Call:
roc.default(response = test[["target"]], predictor = test[["baseline"]],     plot = TRUE, legacy.axes = TRUE, print.auc = TRUE)

Data: test[["baseline"]] in 175 controls (test[["target"]] 0) < 152 cases (test[["target"]] 1).
Area under the curve: 0.8093

Full Model


Call:
roc.default(response = test[["target"]], predictor = test[["fullmodel"]],     plot = TRUE, legacy.axes = TRUE, print.auc = TRUE)

Data: test[["fullmodel"]] in 175 controls (test[["target"]] 0) < 152 cases (test[["target"]] 1).
Area under the curve: 0.9475

Model1


Call:
roc.default(response = test[["target"]], predictor = test[["model1"]],     plot = TRUE, legacy.axes = TRUE, print.auc = TRUE)

Data: test[["model1"]] in 175 controls (test[["target"]] 0) < 152 cases (test[["target"]] 1).
Area under the curve: 0.9377

Density Model


Call:
roc.default(response = test[["target"]], predictor = test[["density_models_yhat"]],     plot = TRUE, legacy.axes = TRUE, print.auc = TRUE)

Data: test[["density_models_yhat"]] in 175 controls (test[["target"]] 0) < 152 cases (test[["target"]] 1).
Area under the curve: 0.9625

Critical Thinking Group 3

2019-10-30