Introduction

We’ll be working with a mental health dataset and will be conducting exploratory data analysis, unsupervised clustering, principal component analysis, gradient boosting, and support vector machines.

Import Data

To begin, the following code will import the data and load the libraries:

library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(VIM)
library(corrplot)
library(purrr)
library(scales)
library(caret)
library(Hmisc)
library(naniar)
library(conflicted)

# Resolve function-name conflicts up front: several of the attached packages
# export `filter` and `summarize`, so declare dplyr as the preferred source.
conflict_prefer('filter', 'dplyr')
conflict_prefer('summarize', 'dplyr')

# Import the data from the hosted CSV.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned; `na.strings = ""` marks empty fields as missing.
url <- 'https://raw.githubusercontent.com/SmilodonCub/Data622_group5_projects/main/ADHD_data.csv'
df <- read.csv(url, header = TRUE, na.strings = "")


Variable Datatypes

In order to facilitate ease of use, we’ll be renaming the columns. Additionally, we’ll convert each of the number coded fields to factors while also including the proper labels.

# Normalise the column names to lower snake_case.

# tolower() is vectorised, so apply it to the whole name vector directly
# (wrapping it in lapply() would hand a list to `names<-`).
names(df) <- tolower(names(df))

# read.csv() turns spaces and punctuation in the header into periods;
# replace every period with an underscore
names(df) <- str_replace_all(names(df), '\\.', '_')

# the last column ends up with a trailing underscore, so name it explicitly
names(df)[ncol(df)] <- 'psych_meds'

# show the cleaned-up names
names(df)
##  [1] "initial"            "age"                "sex"               
##  [4] "race"               "adhd_q1"            "adhd_q2"           
##  [7] "adhd_q3"            "adhd_q4"            "adhd_q5"           
## [10] "adhd_q6"            "adhd_q7"            "adhd_q8"           
## [13] "adhd_q9"            "adhd_q10"           "adhd_q11"          
## [16] "adhd_q12"           "adhd_q13"           "adhd_q14"          
## [19] "adhd_q15"           "adhd_q16"           "adhd_q17"          
## [22] "adhd_q18"           "adhd_total"         "md_q1a"            
## [25] "md_q1b"             "md_q1c"             "md_q1d"            
## [28] "md_q1e"             "md_q1f"             "md_q1g"            
## [31] "md_q1h"             "md_q1i"             "md_q1j"            
## [34] "md_q1k"             "md_q1l"             "md_q1m"            
## [37] "md_q2"              "md_q3"              "md_total"          
## [40] "alcohol"            "thc"                "cocaine"           
## [43] "stimulants"         "sedative_hypnotics" "opioids"           
## [46] "court_order"        "education"          "hx_of_violence"    
## [49] "disorderly_conduct" "suicide"            "abuse"             
## [52] "non_subst_dx"       "subst_dx"           "psych_meds"
# Recode the numerically coded fields as labelled factors.
# Column groups are selected by name rather than by position, so a change in
# column order raises an "undefined columns" error instead of silently
# recoding the wrong fields.
# Note: factor() maps any value that is not listed in `levels` to NA.

# Sex
df$sex <- factor(df$sex, levels = c(1, 2), labels = c('Male', 'Female'))

# Race
df$race <- factor(
  df$race,
  levels = 1:6,
  labels = c('White', 'African American', 'Hispanic', 'Asian',
             'Native American', 'Other or Missing Data')
)

# ADHD q1 - q18
adhd_cols <- paste0('adhd_q', 1:18)
df[adhd_cols] <- lapply(
  df[adhd_cols], factor,
  levels = 0:4,
  labels = c('Never', 'Rarely', 'Sometimes', 'Often', 'Very Often')
)

# Mood Disorder q1a - q1m and q2
md_cols <- c(paste0('md_q1', letters[1:13]), 'md_q2')
df[md_cols] <- lapply(
  df[md_cols], factor,
  levels = c(0, 1),
  labels = c('No', 'Yes')
)

# Mood Disorder q3
df$md_q3 <- factor(
  df$md_q3,
  levels = 0:3,
  labels = c('No Problem', 'Minor', 'Moderate', 'Serious')
)

# Substance Abuse
sa_cols <- c('alcohol', 'thc', 'cocaine', 'stimulants',
             'sedative_hypnotics', 'opioids')
df[sa_cols] <- lapply(
  df[sa_cols], factor,
  levels = 0:3,
  labels = c('No Use', 'Use', 'Abuse', 'Dependence')
)

# Court Order
df$court_order <- factor(df$court_order, levels = c(0, 1), labels = c('No', 'Yes'))

# Education
# left as a number for now (not recoded)

# History of Violence, Disorderly Conduct, Suicide Attempt
hist_cols <- c('hx_of_violence', 'disorderly_conduct', 'suicide')
df[hist_cols] <- lapply(
  df[hist_cols], factor,
  levels = c(0, 1),
  labels = c('No', 'Yes')
)

# Abuse History
df$abuse <- factor(
  df$abuse,
  levels = 0:7,
  labels = c('No', 'Physical', 'Sexual', 'Emotional', 'Physical & Sexual',
             'Physical & Emotional', 'Sexual & Emotional',
             'Physical, Sexual, & Emotional')
)

# Non-substance-related diagnoses
# NOTE(review): "dx" presumably abbreviates "diagnosis" (this comment used to
# say "Drugs") - confirm against the data dictionary
df$non_subst_dx <- factor(
  df$non_subst_dx,
  levels = 0:2,
  labels = c('None', 'One', 'More than one')
)

# Substance-related diagnoses (same "dx" caveat as above)
df$subst_dx <- factor(
  df$subst_dx,
  levels = 0:3,
  labels = c('None', 'One', 'Two', 'Three or more')
)

# Psychiatric Meds
df$psych_meds <- factor(
  df$psych_meds,
  levels = 0:2,
  labels = c('None', 'One', 'More than one')
)

# str(df)

Exploratory Data Analysis

The following code will quantitatively and visually explore the nature of the dataset.

We begin by describing the dataset features.

We use dplyr’s glimpse() function to take a quick look at the data structure, followed by Hmisc’s describe() function to return some basic summary statistics for each feature of the dataframe:

# compact overview of the dataframe: dimensions, column types, first values
dplyr::glimpse(df)
## Rows: 175
## Columns: 54
## $ initial            <chr> "JA", "LA", "MD", "RD", "RB", "SB", "PK", "RJ", "DJ~
## $ age                <int> 24, 48, 51, 43, 34, 39, 41, 48, 44, 27, 44, 56, 53,~
## $ sex                <fct> Male, Female, Female, Male, Male, Female, Female, M~
## $ race               <fct> White, White, White, White, White, White, White, Wh~
## $ adhd_q1            <fct> Rarely, Often, Sometimes, Often, Very Often, Someti~
## $ adhd_q2            <fct> Rarely, Often, Rarely, Often, Very Often, Often, So~
## $ adhd_q3            <fct> Very Often, Very Often, Sometimes, Sometimes, Somet~
## $ adhd_q4            <fct> Sometimes, Very Often, Rarely, Sometimes, Very Ofte~
## $ adhd_q5            <fct> Often, NA, Often, Very Often, Very Often, Often, Ve~
## $ adhd_q6            <fct> Rarely, Sometimes, Often, Often, Sometimes, Sometim~
## $ adhd_q7            <fct> Rarely, Sometimes, Often, Sometimes, Often, Often, ~
## $ adhd_q8            <fct> Often, Often, Sometimes, Very Often, Very Often, Ve~
## $ adhd_q9            <fct> Sometimes, Sometimes, Never, Very Often, Very Often~
## $ adhd_q10           <fct> Very Often, Very Often, Rarely, Sometimes, Sometime~
## $ adhd_q11           <fct> Sometimes, Rarely, Sometimes, Often, Very Often, Ve~
## $ adhd_q12           <fct> Very Often, Very Often, Never, Rarely, Rarely, Some~
## $ adhd_q13           <fct> Rarely, Sometimes, Sometimes, Often, Often, Very Of~
## $ adhd_q14           <fct> Never, Very Often, Sometimes, Often, Sometimes, Ver~
## $ adhd_q15           <fct> Often, Very Often, Often, Rarely, Rarely, Often, Ve~
## $ adhd_q16           <fct> Rarely, Often, Sometimes, Sometimes, Sometimes, Ver~
## $ adhd_q17           <fct> Often, Rarely, Rarely, Rarely, Rarely, Often, Somet~
## $ adhd_q18           <fct> Very Often, Very Often, Rarely, Sometimes, Rarely, ~
## $ adhd_total         <int> 40, 55, 31, 45, 48, 55, 54, 41, 56, 56, 42, 38, 31,~
## $ md_q1a             <fct> Yes, Yes, No, Yes, No, No, Yes, No, Yes, Yes, Yes, ~
## $ md_q1b             <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes~
## $ md_q1c             <fct> Yes, Yes, No, No, No, No, No, No, No, No, Yes, No, ~
## $ md_q1d             <fct> Yes, Yes, No, No, Yes, Yes, No, No, Yes, No, Yes, N~
## $ md_q1e             <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, Yes, Yes, Yes,~
## $ md_q1f             <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Ye~
## $ md_q1g             <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Ye~
## $ md_q1h             <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No~
## $ md_q1i             <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No~
## $ md_q1j             <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, No, No, ~
## $ md_q1k             <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, Yes, No,~
## $ md_q1l             <fct> No, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes,~
## $ md_q1m             <fct> Yes, No, No, Yes, No, No, No, No, Yes, Yes, Yes, No~
## $ md_q2              <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
## $ md_q3              <fct> Serious, Serious, Moderate, Serious, Moderate, Seri~
## $ md_total           <int> 15, 14, 5, 13, 7, 14, 9, 7, 12, 11, 16, 0, 11, 10, ~
## $ alcohol            <fct> Use, No Use, No Use, Use, Use, Use, Dependence, No ~
## $ thc                <fct> Use, No Use, No Use, Use, Use, No Use, Dependence, ~
## $ cocaine            <fct> Use, No Use, No Use, Use, No Use, No Use, Use, No U~
## $ stimulants         <fct> No Use, No Use, No Use, Use, No Use, No Use, Use, N~
## $ sedative_hypnotics <fct> No Use, No Use, No Use, No Use, No Use, No Use, Use~
## $ opioids            <fct> No Use, No Use, No Use, No Use, No Use, No Use, No ~
## $ court_order        <fct> Yes, No, No, No, Yes, No, No, No, No, No, No, No, N~
## $ education          <int> 11, 14, 12, 12, 9, 11, 12, 16, 12, 9, 12, 18, 12, 1~
## $ hx_of_violence     <fct> No, No, No, No, Yes, No, Yes, Yes, Yes, No, Yes, No~
## $ disorderly_conduct <fct> Yes, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes,~
## $ suicide            <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, No, No, Yes, N~
## $ abuse              <fct> "No", "Physical & Sexual", "Sexual & Emotional", "P~
## $ non_subst_dx       <fct> More than one, One, More than one, More than one, M~
## $ subst_dx           <fct> None, None, None, None, None, None, None, One, None~
## $ psych_meds         <fct> More than one, One, One, More than one, None, None,~
# per-column summary: counts, missing values, distinct values, frequencies
Hmisc::describe(df)
## df 
## 
##  54  Variables      175  Observations
## --------------------------------------------------------------------------------
## initial 
##        n  missing distinct 
##      175        0      109 
## 
## lowest : AB AE AF AH AJ, highest: TJ TS VG WB WH
## --------------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      175        0       42    0.999    39.47     12.8     22.0     24.0 
##      .25      .50      .75      .90      .95 
##     29.5     42.0     48.0     53.0     56.0 
## 
## lowest : 18 19 20 21 22, highest: 55 56 57 61 69
## --------------------------------------------------------------------------------
## sex 
##        n  missing distinct 
##      175        0        2 
##                         
## Value        Male Female
## Frequency      99     76
## Proportion  0.566  0.434
## --------------------------------------------------------------------------------
## race 
##        n  missing distinct 
##      175        0        4 
##                                                                             
## Value                      White      African American              Hispanic
## Frequency                     72                   100                     1
## Proportion                 0.411                 0.571                 0.006
##                                 
## Value      Other or Missing Data
## Frequency                      2
## Proportion                 0.011
## --------------------------------------------------------------------------------
## adhd_q1 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          39         43         44         30         19
## Proportion      0.223      0.246      0.251      0.171      0.109
## --------------------------------------------------------------------------------
## adhd_q2 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          25         46         47         33         24
## Proportion      0.143      0.263      0.269      0.189      0.137
## --------------------------------------------------------------------------------
## adhd_q3 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          26         46         46         32         25
## Proportion      0.149      0.263      0.263      0.183      0.143
## --------------------------------------------------------------------------------
## adhd_q4 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          27         31         50         31         36
## Proportion      0.154      0.177      0.286      0.177      0.206
## --------------------------------------------------------------------------------
## adhd_q5 
##        n  missing distinct 
##      174        1        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          33         21         32         47         41
## Proportion      0.190      0.121      0.184      0.270      0.236
## --------------------------------------------------------------------------------
## adhd_q6 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          36         29         45         45         20
## Proportion      0.206      0.166      0.257      0.257      0.114
## --------------------------------------------------------------------------------
## adhd_q7 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          22         53         54         25         21
## Proportion      0.126      0.303      0.309      0.143      0.120
## --------------------------------------------------------------------------------
## adhd_q8 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          21         40         40         42         32
## Proportion      0.120      0.229      0.229      0.240      0.183
## --------------------------------------------------------------------------------
## adhd_q9 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          31         43         36         41         24
## Proportion      0.177      0.246      0.206      0.234      0.137
## --------------------------------------------------------------------------------
## adhd_q10 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          15         46         49         33         32
## Proportion      0.086      0.263      0.280      0.189      0.183
## --------------------------------------------------------------------------------
## adhd_q11 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          16         33         48         43         35
## Proportion      0.091      0.189      0.274      0.246      0.200
## --------------------------------------------------------------------------------
## adhd_q12 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          55         55         37         15         13
## Proportion      0.314      0.314      0.211      0.086      0.074
## --------------------------------------------------------------------------------
## adhd_q13 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          15         29         46         47         38
## Proportion      0.086      0.166      0.263      0.269      0.217
## --------------------------------------------------------------------------------
## adhd_q14 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          27         24         40         47         37
## Proportion      0.154      0.137      0.229      0.269      0.211
## --------------------------------------------------------------------------------
## adhd_q15 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          50         39         35         27         24
## Proportion      0.286      0.223      0.200      0.154      0.137
## --------------------------------------------------------------------------------
## adhd_q16 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          40         49         39         17         30
## Proportion      0.229      0.280      0.223      0.097      0.171
## --------------------------------------------------------------------------------
## adhd_q17 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          49         41         46         22         17
## Proportion      0.280      0.234      0.263      0.126      0.097
## --------------------------------------------------------------------------------
## adhd_q18 
##        n  missing distinct 
##      175        0        5 
## 
## lowest : Never      Rarely     Sometimes  Often      Very Often
## highest: Never      Rarely     Sometimes  Often      Very Often
##                                                                  
## Value           Never     Rarely  Sometimes      Often Very Often
## Frequency          49         52         35         20         19
## Proportion      0.280      0.297      0.200      0.114      0.109
## --------------------------------------------------------------------------------
## adhd_total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      175        0       62    0.999    34.32    19.16      7.0     12.0 
##      .25      .50      .75      .90      .95 
##     21.0     33.0     47.5     55.0     62.3 
## 
## lowest :  0  1  3  5  6, highest: 65 67 69 71 72
## --------------------------------------------------------------------------------
## md_q1a 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     79    96
## Proportion 0.451 0.549
## --------------------------------------------------------------------------------
## md_q1b 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     75   100
## Proportion 0.429 0.571
## --------------------------------------------------------------------------------
## md_q1c 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     80    95
## Proportion 0.457 0.543
## --------------------------------------------------------------------------------
## md_q1d 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     73   102
## Proportion 0.417 0.583
## --------------------------------------------------------------------------------
## md_q1e 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     78    97
## Proportion 0.446 0.554
## --------------------------------------------------------------------------------
## md_q1f 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     53   122
## Proportion 0.303 0.697
## --------------------------------------------------------------------------------
## md_q1g 
##        n  missing distinct 
##      175        0        2 
##                     
## Value        No  Yes
## Frequency    49  126
## Proportion 0.28 0.72
## --------------------------------------------------------------------------------
## md_q1h 
##        n  missing distinct 
##      175        0        2 
##                     
## Value        No  Yes
## Frequency    77   98
## Proportion 0.44 0.56
## --------------------------------------------------------------------------------
## md_q1i 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     72   103
## Proportion 0.411 0.589
## --------------------------------------------------------------------------------
## md_q1j 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency    107    68
## Proportion 0.611 0.389
## --------------------------------------------------------------------------------
## md_q1k 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     90    85
## Proportion 0.514 0.486
## --------------------------------------------------------------------------------
## md_q1l 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     73   102
## Proportion 0.417 0.583
## --------------------------------------------------------------------------------
## md_q1m 
##        n  missing distinct 
##      175        0        2 
##                       
## Value         No   Yes
## Frequency     89    86
## Proportion 0.509 0.491
## --------------------------------------------------------------------------------
## md_q2 
##        n  missing distinct 
##      175        0        2 
##                     
## Value        No  Yes
## Frequency    49  126
## Proportion 0.28 0.72
## --------------------------------------------------------------------------------
## md_q3 
##        n  missing distinct 
##      175        0        4 
##                                                       
## Value      No Problem      Minor   Moderate    Serious
## Frequency          25         25         49         76
## Proportion      0.143      0.143      0.280      0.434
## --------------------------------------------------------------------------------
## md_total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      175        0       18    0.995    10.02    5.469      0.7      3.0 
##      .25      .50      .75      .90      .95 
##      6.5     11.0     14.0     16.0     17.0 
## 
## lowest :  0  1  2  3  4, highest: 13 14 15 16 17
##                                                                             
## Value          0     1     2     3     4     5     6     7     8     9    10
## Frequency      9     3     5     6     4     7    10     6     8    12    13
## Proportion 0.051 0.017 0.029 0.034 0.023 0.040 0.057 0.034 0.046 0.069 0.074
##                                                     
## Value         11    12    13    14    15    16    17
## Frequency     18    12    13    12    14    12    11
## Proportion 0.103 0.069 0.074 0.069 0.080 0.069 0.063
## --------------------------------------------------------------------------------
## alcohol 
##        n  missing distinct 
##      171        4        4 
##                                                       
## Value          No Use        Use      Abuse Dependence
## Frequency          80         18          7         66
## Proportion      0.468      0.105      0.041      0.386
## --------------------------------------------------------------------------------
## thc 
##        n  missing distinct 
##      171        4        4 
##                                                       
## Value          No Use        Use      Abuse Dependence
## Frequency         116         12          3         40
## Proportion      0.678      0.070      0.018      0.234
## --------------------------------------------------------------------------------
## cocaine 
##        n  missing distinct 
##      171        4        4 
##                                                       
## Value          No Use        Use      Abuse Dependence
## Frequency         101          9          5         56
## Proportion      0.591      0.053      0.029      0.327
## --------------------------------------------------------------------------------
## stimulants 
##        n  missing distinct 
##      171        4        3 
##                                            
## Value          No Use        Use Dependence
## Frequency         160          6          5
## Proportion      0.936      0.035      0.029
## --------------------------------------------------------------------------------
## sedative_hypnotics 
##        n  missing distinct 
##      171        4        4 
##                                                       
## Value          No Use        Use      Abuse Dependence
## Frequency         161          4          1          5
## Proportion      0.942      0.023      0.006      0.029
## --------------------------------------------------------------------------------
## opioids 
##        n  missing distinct 
##      171        4        3 
##                                            
## Value          No Use        Use Dependence
## Frequency         146          4         21
## Proportion      0.854      0.023      0.123
## --------------------------------------------------------------------------------
## court_order 
##        n  missing distinct 
##      170        5        2 
##                       
## Value         No   Yes
## Frequency    155    15
## Proportion 0.912 0.088
## --------------------------------------------------------------------------------
## education 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      166        9       14    0.929     11.9    2.265     8.25     9.00 
##      .25      .50      .75      .90      .95 
##    11.00    12.00    13.00    14.00    16.00 
## 
## lowest :  6  7  8  9 10, highest: 15 16 17 18 19
##                                                                             
## Value          6     7     8     9    10    11    12    13    14    15    16
## Frequency      2     2     5    12    12    23    67    15    14     1     7
## Proportion 0.012 0.012 0.030 0.072 0.072 0.139 0.404 0.090 0.084 0.006 0.042
##                             
## Value         17    18    19
## Frequency      2     3     1
## Proportion 0.012 0.018 0.006
## --------------------------------------------------------------------------------
## hx_of_violence 
##        n  missing distinct 
##      164       11        2 
##                       
## Value         No   Yes
## Frequency    124    40
## Proportion 0.756 0.244
## --------------------------------------------------------------------------------
## disorderly_conduct 
##        n  missing distinct 
##      164       11        2 
##                       
## Value         No   Yes
## Frequency     45   119
## Proportion 0.274 0.726
## --------------------------------------------------------------------------------
## suicide 
##        n  missing distinct 
##      162       13        2 
##                       
## Value         No   Yes
## Frequency    113    49
## Proportion 0.698 0.302
## --------------------------------------------------------------------------------
## abuse 
##        n  missing distinct 
##      161       14        8 
## 
## lowest : No                            Physical                      Sexual                        Emotional                     Physical & Sexual            
## highest: Emotional                     Physical & Sexual             Physical & Emotional          Sexual & Emotional            Physical, Sexual, & Emotional
## --------------------------------------------------------------------------------
## non_subst_dx 
##        n  missing distinct 
##      153       22        3 
##                                                     
## Value               None           One More than one
## Frequency            102            35            16
## Proportion         0.667         0.229         0.105
## --------------------------------------------------------------------------------
## subst_dx 
##        n  missing distinct 
##      152       23        4 
##                                                                   
## Value               None           One           Two Three or more
## Frequency             42            61            35            14
## Proportion         0.276         0.401         0.230         0.092
## --------------------------------------------------------------------------------
## psych_meds 
##        n  missing distinct 
##       57      118        3 
##                                                     
## Value               None           One More than one
## Frequency             19            21            17
## Proportion         0.333         0.368         0.298
## --------------------------------------------------------------------------------

From this output, we can summarize each dataset feature as follows:

  1. loan_id (ordinal): each entry is a unique value, therefore this feature is not informative for loan status
  2. gender (categorical): 2 distinct values with missing data
  3. married (categorical): 2 distinct values with missing data
  4. dependents (categorical): 4 distinct values with missing data
  5. education (categorical): 2 distinct values, no missing data
  6. self_employed (categorical): 2 distinct values with missing data
  7. applicantincome (numeric): value range, no missing data
  8. coapplicantincome (numeric): value range, no missing data
  9. loanamount (numeric): value range with missing data
  10. loan_amount_term (numeric): relatively few unique values (10) with missing data
  11. credit_history (categorical): 2 distinct values with missing data
  12. property_area (categorical): 3 distinct values, no missing data
  13. loan_status (categorical): 2 distinct values, no missing data

Removing loan_id: this feature was found to have as many unique values as there are rows in the dataframe and is a record identification label. Therefore, we will drop this feature from the data:

# remove loan ID
# df <- df %>%
#   select(-loan_id)

Missing Values

Use naniar’s miss_var_summary() and vis_miss() functions to summarize and visualize the missing values in the features of the dataset:

# tabulate, per column, the number and percentage of missing values
df %>%
  miss_var_summary()
## # A tibble: 54 x 3
##    variable           n_miss pct_miss
##    <chr>               <int>    <dbl>
##  1 psych_meds            118    67.4 
##  2 subst_dx               23    13.1 
##  3 non_subst_dx           22    12.6 
##  4 abuse                  14     8   
##  5 suicide                13     7.43
##  6 hx_of_violence         11     6.29
##  7 disorderly_conduct     11     6.29
##  8 education               9     5.14
##  9 court_order             5     2.86
## 10 alcohol                 4     2.29
## # ... with 44 more rows
# plot where the missing values fall in each feature, with clustering enabled
df %>%
  vis_miss(cluster = TRUE)

The figure above shows a grouped view of the missing values in each feature column. Overall, only a small percentage of the values in the dataset are missing. Most of the features have few or no missing values: aside from psych_meds, no feature is missing more than about 13% of its values (subst_dx at 13.1% and non_subst_dx at 12.6% are the next highest). However, the psych_meds feature is missing 67.4% of its values.

Explore the missing data further by using the gg_miss_upset() function to show patterns of correlated missing values.

# upset plot of which features tend to be missing together
df %>%
  gg_miss_upset()

The figure above shows that the vast majority of rows only have a singleton missing value; this is represented by the 5 bars in the left of the plot with only one dot to indicate the missing feature. However, a small minority of rows have 2-3 missing elements; this is indicated by multiple dots under the 5 bars to the right side of the plot.

Since there are relatively few rows with multiple missing values, it would not adversely affect the analysis to remove them. The rest of the missing values can be dealt with by imputation.

# Count the missing values in each row. rowSums(is.na(df)) works on the logical
# NA mask directly, whereas apply(df, 1, ...) first coerces the whole data
# frame to a single-type matrix before the function ever runs.
count_na <- rowSums(is.na(df))
# keep only the rows with fewer than 2 missing values
df <- df[count_na < 2, ]
# report the remaining number of rows and columns
dim(df)
## [1] 143  54

For a simple first approximation, we will use the simputation package\(^1\) to fill NA values for categorical and numeric features with ‘hot-deck’ imputation (i.e. values pulled at random from complete cases in the dataset).

# # single imputation analysis
# df <- bind_shadow( df ) %>%
#   data.frame() %>%
#   simputation::impute_rhd(., credit_history ~ 1 ) %>%
#   simputation::impute_rhd(., loan_amount_term ~ 1 ) %>%
#   simputation::impute_rhd(., loanamount ~ 1 ) %>%
#   simputation::impute_rhd(., self_employed ~ 1 ) %>%
#   simputation::impute_rhd(., gender ~ 1 ) %>%
#   simputation::impute_rhd(., dependents ~ 1 ) %>%  
#   tbl_df()  %>%
#   select( -c(13:24) )

Check how many NA values remain in each column:

# re-tabulate the number and percentage of missing values per column
df %>%
  miss_var_summary()
## # A tibble: 54 x 3
##    variable   n_miss pct_miss
##    <chr>       <int>    <dbl>
##  1 psych_meds     87   60.8  
##  2 adhd_q5         1    0.699
##  3 subst_dx        1    0.699
##  4 initial         0    0    
##  5 age             0    0    
##  6 sex             0    0    
##  7 race            0    0    
##  8 adhd_q1         0    0    
##  9 adhd_q2         0    0    
## 10 adhd_q3         0    0    
## # ... with 44 more rows

Distributions of Numeric Variables

Now that the missing values have been imputed across the dataframe, we can explore the relationships of the variables in more depth. To start, we visualize the distributions of the numeric variables grouped by the outcome of the target variable (suicide):

# numeric distributions: one density panel per numeric feature, with the
# densities split and filled by the target variable (suicide)
df %>%
  # keep the target plus every numeric column in a single select();
  # where() replaces the superseded select_if()
  select(suicide, where(is.numeric)) %>%
  # long format: one row per (observation, feature); replaces superseded gather()
  pivot_longer(-suicide, names_to = 'var', values_to = 'val') %>%
  ggplot(aes(x = val, fill = suicide)) +
  geom_density(alpha = .3) +
  facet_wrap(~var, scales = 'free', ncol = 2) +
  theme_bw() +
  # NULL removes an axis label; element_blank() is a theme element, not a label
  labs(x = NULL,
       y = NULL,
       title = 'Distribution of Numeric Variables')

The distributions do not suggest any obviously significant differences when grouped by the target variable for any of the numeric features. It does not appear likely that any of these features are correlated with the target variable (suicide). This can be confirmed with ANOVA\(^2\):

# # ANOVA for applicantincome
# applicantincome.aov <- aov(applicantincome ~ loan_status, data = df)
# # Summary of the analysis
# summary(applicantincome.aov)
# ```
# ```{r}
# # ANOVA for coapplicantincome
# coapplicantincome.aov <- aov(coapplicantincome ~ loan_status, data = df)
# # Summary of the analysis
# summary(coapplicantincome.aov)
# # ANOVA for loanamount
# loanamount.aov <- aov(loanamount ~ loan_status, data = df)
# # Summary of the analysis
# summary(loanamount.aov)

The p-values for all three ANOVA tests are very high, indicating that there is no significant relationship between the feature variables and the target.

Correlation of Numeric Variables

Here we can look for correlations between the numeric feature variables:

# keep only the numeric features; where() replaces the superseded select_if()
df_numeric <- df %>%
  select(where(is.numeric))

# draw the correlation matrix with the project helper defined elsewhere in
# this file
plot_corr_matrix(df_numeric, -1)

We can see a strong positive correlation between the features applicantincome and loanamount. There is a weak positive correlation between coapplicantincome and loanamount. Interestingly there is a weak negative correlation between applicantincome and coapplicantincome; presumptively due to a high-earning family being able to sustain with a single income.

Distributions of Categorical Variables

Now we turn to the categorical features to see if there are any strong relationships between them and the target variable.
The following code will visualize the proportions of each target variable level for each level of a given feature:

# Number of rows in each target class. These are the denominators below, so
# each bar shows the share of a class that falls into a given feature level.
# na.rm = TRUE keeps the counts usable even if suicide contains NA.
yes_count <- sum(df$suicide == 'Yes', na.rm = TRUE)
no_count <- sum(df$suicide == 'No', na.rm = TRUE)

df %>%
  # bare predicates inside select() are deprecated; wrap them in where()
  select(!where(is.numeric)) %>%
  # long format: one row per (observation, feature); replaces superseded
  # gather(). Values are stacked as character because the non-numeric columns
  # may differ in type or factor levels.
  pivot_longer(-suicide,
               names_to = 'var',
               values_to = 'value',
               values_transform = list(value = as.character)) %>%
  group_by(var, value, suicide) %>%
  summarise(count = n(),
            .groups = 'drop') %>%
  # convert raw counts to within-class proportions
  mutate(prop = count / ifelse(suicide == 'Yes', yes_count, no_count)) %>%
  ggplot(aes(x = value, y = prop, fill = suicide)) +
  geom_col(position = 'dodge') +
  facet_wrap(~var, scales = 'free') +
  theme_bw() +
  # NULL removes an axis label; element_blank() is a theme element, not a label
  labs(y = 'Frequency Proportion',
       x = NULL,
       title = 'Frequency Distributions For Non-Numeric Variables') +
  scale_y_continuous(labels = percent_format(accuracy = 1))

When interpreting the categorical bar plots, differences between loan_status for a given feature-level suggest that a relationship exists between a feature and the target variable. For example, we see a clear difference between the Y/N bars for credit_history, married and property_area, whereas there is little difference for the levels of gender and no noticeable difference for self_employed.

The existence of a significant relationship between the categorical features and the target variable can be evaluated with a Chi-square test\(^3\).

# # Chi-square test for credit_history
# test <- chisq.test(table(df$credit_history, df$loan_status))
# test
# # Chi-square test for married
# test <- chisq.test(table(df$married, df$loan_status))
# test
# # Chi-square test for property_area
# test <- chisq.test(table(df$property_area, df$loan_status))
# test
# # Chi-square test for education
# test <- chisq.test(table(df$education, df$loan_status))
# test
# # Chi-square test for loan_amount_term
# test <- chisq.test(table(df$loan_amount_term, df$loan_status))
# test
# # Chi-square test for dependents
# test <- chisq.test(table(df$dependents, df$loan_status))
# test
# # Chi-square test for gender
# test <- chisq.test(table(df$gender, df$loan_status))
# test
# # Chi-square test for self employed
# test <- chisq.test(table(df$self_employed, df$loan_status))
# test

Data Prep

# Imputation note: bagged-tree imputation via caret is kept below as a disabled
# alternative, so df2 is currently just a copy of df.
# preproc <- preProcess(df, 'bagImpute')
# df2 <- predict(preproc, df)
df2 <- df

# train test split: 75% of the rows go to train, partitioned on the target
set.seed(101)  # fixed seed so the partition is reproducible
trainIndex <- createDataPartition(df2$suicide,
                                  p = 0.75,
                                  list = FALSE)  # FALSE, not the reassignable F

train <- df2[trainIndex, ]
test <- df2[-trainIndex, ]

# cross validation train control: 10-fold CV
ctrl <- trainControl(method = 'cv', number = 10)