We’ll be working with a mental health dataset and will be conducting exploratory data analysis, unsupervised clustering, principal component analysis, gradient boosting, and support vector machines.
To begin, the following code will import the data and load the libraries:
library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(VIM)
library(corrplot)
library(purrr)
library(scales)
library(caret)
library(Hmisc)
library(naniar)
library(conflicted)
# resolve function name conflicts: pin the dplyr versions of functions whose
# names clash across the attached packages
conflict_prefer('filter', 'dplyr')
conflict_prefer('summarize', 'dplyr')
# import data: the first row holds the column names and empty fields are
# read as NA. TRUE is spelled out because the shorthand T can be reassigned.
url <- 'https://raw.githubusercontent.com/SmilodonCub/Data622_group5_projects/main/ADHD_data.csv'
df <- read.csv(url, header = TRUE, na.strings = "")
In order to facilitate ease of use, we’ll be renaming the columns. Additionally, we’ll convert each of the number coded fields to factors while also including the proper labels.
# convert column names to lowercase and replace periods with underscores;
# tolower() is vectorised, so names(df) stays a plain character vector
# (lapply() would hand a list to `names<-`)
names(df) <- str_replace_all(tolower(names(df)), '\\.', '_')
# rename last column to remove trailing underscore
names(df)[ncol(df)] <- 'psych_meds'
names(df)## [1] "initial" "age" "sex"
## [4] "race" "adhd_q1" "adhd_q2"
## [7] "adhd_q3" "adhd_q4" "adhd_q5"
## [10] "adhd_q6" "adhd_q7" "adhd_q8"
## [13] "adhd_q9" "adhd_q10" "adhd_q11"
## [16] "adhd_q12" "adhd_q13" "adhd_q14"
## [19] "adhd_q15" "adhd_q16" "adhd_q17"
## [22] "adhd_q18" "adhd_total" "md_q1a"
## [25] "md_q1b" "md_q1c" "md_q1d"
## [28] "md_q1e" "md_q1f" "md_q1g"
## [31] "md_q1h" "md_q1i" "md_q1j"
## [34] "md_q1k" "md_q1l" "md_q1m"
## [37] "md_q2" "md_q3" "md_total"
## [40] "alcohol" "thc" "cocaine"
## [43] "stimulants" "sedative_hypnotics" "opioids"
## [46] "court_order" "education" "hx_of_violence"
## [49] "disorderly_conduct" "suicide" "abuse"
## [52] "non_subst_dx" "subst_dx" "psych_meds"
# Recode the numerically coded fields as labelled factors. Codes that are not
# listed in `levels` become NA.

# Sex: 1 = Male, 2 = Female
df[['sex']] <- factor(df[['sex']], levels = 1:2, labels = c('Male','Female'))

# Race: codes 1-6
df[['race']] <- factor(
  df[['race']],
  levels = 1:6,
  labels = c('White','African American','Hispanic','Asian','Native American','Other or Missing Data')
)

# ADHD questionnaire items adhd_q1 - adhd_q18 (columns 5-22): 0-4 frequency scale
adhd_cols <- names(df)[5:22]
df[adhd_cols] <- lapply(df[adhd_cols], function(item) {
  factor(item, levels = 0:4, labels = c('Never','Rarely','Sometimes','Often','Very Often'))
})

# Mood Disorder items md_q1a - md_q2 (columns 24-37): 0 = No, 1 = Yes
md_cols <- names(df)[24:37]
df[md_cols] <- lapply(df[md_cols], function(item) {
  factor(item, levels = 0:1, labels = c('No','Yes'))
})

# Mood Disorder q3: 0-3 severity scale
df[['md_q3']] <- factor(
  df[['md_q3']],
  levels = 0:3,
  labels = c('No Problem','Minor','Moderate','Serious')
)

# Substance use fields alcohol - opioids (columns 40-45): 0-3 scale
sa_cols <- names(df)[40:45]
df[sa_cols] <- lapply(df[sa_cols], function(item) {
  factor(item, levels = 0:3, labels = c('No Use','Use','Abuse','Dependence'))
})

# Court Order: 0 = No, 1 = Yes
df[['court_order']] <- factor(df[['court_order']], levels = 0:1, labels = c('No','Yes'))

# Education
# think it might be okay to leave this as a number

# History of Violence, Disorderly Conduct, Suicide Attempt (columns 48-50)
hist_cols <- names(df)[48:50]
df[hist_cols] <- lapply(df[hist_cols], function(item) {
  factor(item, levels = 0:1, labels = c('No','Yes'))
})

# Abuse History: codes 0-7
df[['abuse']] <- factor(
  df[['abuse']],
  levels = 0:7,
  labels = c('No','Physical','Sexual','Emotional','Physical & Sexual','Physical & Emotional','Sexual & Emotional','Physical, Sexual, & Emotional')
)

# Non-Substance Related Drugs: codes 0-2
df[['non_subst_dx']] <- factor(
  df[['non_subst_dx']],
  levels = 0:2,
  labels = c('None','One','More than one')
)

# Substance Related Drugs: codes 0-3
df[['subst_dx']] <- factor(
  df[['subst_dx']],
  levels = 0:3,
  labels = c('None','One','Two','Three or more')
)

# Psychiatric Meds: codes 0-2
df[['psych_meds']] <- factor(
  df[['psych_meds']],
  levels = 0:2,
  labels = c('None','One','More than one')
)
# str(df)The following code will quantitatively and visually explore the nature of the dataset.
We begin by describing the dataset features.
Use dplyr’s glimpse() function to take a quick look at the data structure, followed by Hmisc’s describe() function to return some basic summary statistics about the dataframe features:
# quick look at what the data structure looks like
glimpse(df)## Rows: 175
## Columns: 54
## $ initial <chr> "JA", "LA", "MD", "RD", "RB", "SB", "PK", "RJ", "DJ~
## $ age <int> 24, 48, 51, 43, 34, 39, 41, 48, 44, 27, 44, 56, 53,~
## $ sex <fct> Male, Female, Female, Male, Male, Female, Female, M~
## $ race <fct> White, White, White, White, White, White, White, Wh~
## $ adhd_q1 <fct> Rarely, Often, Sometimes, Often, Very Often, Someti~
## $ adhd_q2 <fct> Rarely, Often, Rarely, Often, Very Often, Often, So~
## $ adhd_q3 <fct> Very Often, Very Often, Sometimes, Sometimes, Somet~
## $ adhd_q4 <fct> Sometimes, Very Often, Rarely, Sometimes, Very Ofte~
## $ adhd_q5 <fct> Often, NA, Often, Very Often, Very Often, Often, Ve~
## $ adhd_q6 <fct> Rarely, Sometimes, Often, Often, Sometimes, Sometim~
## $ adhd_q7 <fct> Rarely, Sometimes, Often, Sometimes, Often, Often, ~
## $ adhd_q8 <fct> Often, Often, Sometimes, Very Often, Very Often, Ve~
## $ adhd_q9 <fct> Sometimes, Sometimes, Never, Very Often, Very Often~
## $ adhd_q10 <fct> Very Often, Very Often, Rarely, Sometimes, Sometime~
## $ adhd_q11 <fct> Sometimes, Rarely, Sometimes, Often, Very Often, Ve~
## $ adhd_q12 <fct> Very Often, Very Often, Never, Rarely, Rarely, Some~
## $ adhd_q13 <fct> Rarely, Sometimes, Sometimes, Often, Often, Very Of~
## $ adhd_q14 <fct> Never, Very Often, Sometimes, Often, Sometimes, Ver~
## $ adhd_q15 <fct> Often, Very Often, Often, Rarely, Rarely, Often, Ve~
## $ adhd_q16 <fct> Rarely, Often, Sometimes, Sometimes, Sometimes, Ver~
## $ adhd_q17 <fct> Often, Rarely, Rarely, Rarely, Rarely, Often, Somet~
## $ adhd_q18 <fct> Very Often, Very Often, Rarely, Sometimes, Rarely, ~
## $ adhd_total <int> 40, 55, 31, 45, 48, 55, 54, 41, 56, 56, 42, 38, 31,~
## $ md_q1a <fct> Yes, Yes, No, Yes, No, No, Yes, No, Yes, Yes, Yes, ~
## $ md_q1b <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes~
## $ md_q1c <fct> Yes, Yes, No, No, No, No, No, No, No, No, Yes, No, ~
## $ md_q1d <fct> Yes, Yes, No, No, Yes, Yes, No, No, Yes, No, Yes, N~
## $ md_q1e <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, Yes, Yes, Yes,~
## $ md_q1f <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Ye~
## $ md_q1g <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Ye~
## $ md_q1h <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No~
## $ md_q1i <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No~
## $ md_q1j <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, No, No, ~
## $ md_q1k <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, Yes, No,~
## $ md_q1l <fct> No, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes,~
## $ md_q1m <fct> Yes, No, No, Yes, No, No, No, No, Yes, Yes, Yes, No~
## $ md_q2 <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
## $ md_q3 <fct> Serious, Serious, Moderate, Serious, Moderate, Seri~
## $ md_total <int> 15, 14, 5, 13, 7, 14, 9, 7, 12, 11, 16, 0, 11, 10, ~
## $ alcohol <fct> Use, No Use, No Use, Use, Use, Use, Dependence, No ~
## $ thc <fct> Use, No Use, No Use, Use, Use, No Use, Dependence, ~
## $ cocaine <fct> Use, No Use, No Use, Use, No Use, No Use, Use, No U~
## $ stimulants <fct> No Use, No Use, No Use, Use, No Use, No Use, Use, N~
## $ sedative_hypnotics <fct> No Use, No Use, No Use, No Use, No Use, No Use, Use~
## $ opioids <fct> No Use, No Use, No Use, No Use, No Use, No Use, No ~
## $ court_order <fct> Yes, No, No, No, Yes, No, No, No, No, No, No, No, N~
## $ education <int> 11, 14, 12, 12, 9, 11, 12, 16, 12, 9, 12, 18, 12, 1~
## $ hx_of_violence <fct> No, No, No, No, Yes, No, Yes, Yes, Yes, No, Yes, No~
## $ disorderly_conduct <fct> Yes, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes,~
## $ suicide <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, No, No, Yes, N~
## $ abuse <fct> "No", "Physical & Sexual", "Sexual & Emotional", "P~
## $ non_subst_dx <fct> More than one, One, More than one, More than one, M~
## $ subst_dx <fct> None, None, None, None, None, None, None, One, None~
## $ psych_meds <fct> More than one, One, One, More than one, None, None,~
# summary of each field
describe(df)## df
##
## 54 Variables 175 Observations
## --------------------------------------------------------------------------------
## initial
## n missing distinct
## 175 0 109
##
## lowest : AB AE AF AH AJ, highest: TJ TS VG WB WH
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 175 0 42 0.999 39.47 12.8 22.0 24.0
## .25 .50 .75 .90 .95
## 29.5 42.0 48.0 53.0 56.0
##
## lowest : 18 19 20 21 22, highest: 55 56 57 61 69
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 175 0 2
##
## Value Male Female
## Frequency 99 76
## Proportion 0.566 0.434
## --------------------------------------------------------------------------------
## race
## n missing distinct
## 175 0 4
##
## Value White African American Hispanic
## Frequency 72 100 1
## Proportion 0.411 0.571 0.006
##
## Value Other or Missing Data
## Frequency 2
## Proportion 0.011
## --------------------------------------------------------------------------------
## adhd_q1
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 39 43 44 30 19
## Proportion 0.223 0.246 0.251 0.171 0.109
## --------------------------------------------------------------------------------
## adhd_q2
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 25 46 47 33 24
## Proportion 0.143 0.263 0.269 0.189 0.137
## --------------------------------------------------------------------------------
## adhd_q3
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 26 46 46 32 25
## Proportion 0.149 0.263 0.263 0.183 0.143
## --------------------------------------------------------------------------------
## adhd_q4
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 27 31 50 31 36
## Proportion 0.154 0.177 0.286 0.177 0.206
## --------------------------------------------------------------------------------
## adhd_q5
## n missing distinct
## 174 1 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 33 21 32 47 41
## Proportion 0.190 0.121 0.184 0.270 0.236
## --------------------------------------------------------------------------------
## adhd_q6
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 36 29 45 45 20
## Proportion 0.206 0.166 0.257 0.257 0.114
## --------------------------------------------------------------------------------
## adhd_q7
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 22 53 54 25 21
## Proportion 0.126 0.303 0.309 0.143 0.120
## --------------------------------------------------------------------------------
## adhd_q8
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 21 40 40 42 32
## Proportion 0.120 0.229 0.229 0.240 0.183
## --------------------------------------------------------------------------------
## adhd_q9
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 31 43 36 41 24
## Proportion 0.177 0.246 0.206 0.234 0.137
## --------------------------------------------------------------------------------
## adhd_q10
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 15 46 49 33 32
## Proportion 0.086 0.263 0.280 0.189 0.183
## --------------------------------------------------------------------------------
## adhd_q11
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 16 33 48 43 35
## Proportion 0.091 0.189 0.274 0.246 0.200
## --------------------------------------------------------------------------------
## adhd_q12
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 55 55 37 15 13
## Proportion 0.314 0.314 0.211 0.086 0.074
## --------------------------------------------------------------------------------
## adhd_q13
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 15 29 46 47 38
## Proportion 0.086 0.166 0.263 0.269 0.217
## --------------------------------------------------------------------------------
## adhd_q14
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 27 24 40 47 37
## Proportion 0.154 0.137 0.229 0.269 0.211
## --------------------------------------------------------------------------------
## adhd_q15
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 50 39 35 27 24
## Proportion 0.286 0.223 0.200 0.154 0.137
## --------------------------------------------------------------------------------
## adhd_q16
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 40 49 39 17 30
## Proportion 0.229 0.280 0.223 0.097 0.171
## --------------------------------------------------------------------------------
## adhd_q17
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 49 41 46 22 17
## Proportion 0.280 0.234 0.263 0.126 0.097
## --------------------------------------------------------------------------------
## adhd_q18
## n missing distinct
## 175 0 5
##
## lowest : Never Rarely Sometimes Often Very Often
## highest: Never Rarely Sometimes Often Very Often
##
## Value Never Rarely Sometimes Often Very Often
## Frequency 49 52 35 20 19
## Proportion 0.280 0.297 0.200 0.114 0.109
## --------------------------------------------------------------------------------
## adhd_total
## n missing distinct Info Mean Gmd .05 .10
## 175 0 62 0.999 34.32 19.16 7.0 12.0
## .25 .50 .75 .90 .95
## 21.0 33.0 47.5 55.0 62.3
##
## lowest : 0 1 3 5 6, highest: 65 67 69 71 72
## --------------------------------------------------------------------------------
## md_q1a
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 79 96
## Proportion 0.451 0.549
## --------------------------------------------------------------------------------
## md_q1b
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 75 100
## Proportion 0.429 0.571
## --------------------------------------------------------------------------------
## md_q1c
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 80 95
## Proportion 0.457 0.543
## --------------------------------------------------------------------------------
## md_q1d
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 73 102
## Proportion 0.417 0.583
## --------------------------------------------------------------------------------
## md_q1e
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 78 97
## Proportion 0.446 0.554
## --------------------------------------------------------------------------------
## md_q1f
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 53 122
## Proportion 0.303 0.697
## --------------------------------------------------------------------------------
## md_q1g
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 49 126
## Proportion 0.28 0.72
## --------------------------------------------------------------------------------
## md_q1h
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 77 98
## Proportion 0.44 0.56
## --------------------------------------------------------------------------------
## md_q1i
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 72 103
## Proportion 0.411 0.589
## --------------------------------------------------------------------------------
## md_q1j
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 107 68
## Proportion 0.611 0.389
## --------------------------------------------------------------------------------
## md_q1k
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 90 85
## Proportion 0.514 0.486
## --------------------------------------------------------------------------------
## md_q1l
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 73 102
## Proportion 0.417 0.583
## --------------------------------------------------------------------------------
## md_q1m
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 89 86
## Proportion 0.509 0.491
## --------------------------------------------------------------------------------
## md_q2
## n missing distinct
## 175 0 2
##
## Value No Yes
## Frequency 49 126
## Proportion 0.28 0.72
## --------------------------------------------------------------------------------
## md_q3
## n missing distinct
## 175 0 4
##
## Value No Problem Minor Moderate Serious
## Frequency 25 25 49 76
## Proportion 0.143 0.143 0.280 0.434
## --------------------------------------------------------------------------------
## md_total
## n missing distinct Info Mean Gmd .05 .10
## 175 0 18 0.995 10.02 5.469 0.7 3.0
## .25 .50 .75 .90 .95
## 6.5 11.0 14.0 16.0 17.0
##
## lowest : 0 1 2 3 4, highest: 13 14 15 16 17
##
## Value 0 1 2 3 4 5 6 7 8 9 10
## Frequency 9 3 5 6 4 7 10 6 8 12 13
## Proportion 0.051 0.017 0.029 0.034 0.023 0.040 0.057 0.034 0.046 0.069 0.074
##
## Value 11 12 13 14 15 16 17
## Frequency 18 12 13 12 14 12 11
## Proportion 0.103 0.069 0.074 0.069 0.080 0.069 0.063
## --------------------------------------------------------------------------------
## alcohol
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 80 18 7 66
## Proportion 0.468 0.105 0.041 0.386
## --------------------------------------------------------------------------------
## thc
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 116 12 3 40
## Proportion 0.678 0.070 0.018 0.234
## --------------------------------------------------------------------------------
## cocaine
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 101 9 5 56
## Proportion 0.591 0.053 0.029 0.327
## --------------------------------------------------------------------------------
## stimulants
## n missing distinct
## 171 4 3
##
## Value No Use Use Dependence
## Frequency 160 6 5
## Proportion 0.936 0.035 0.029
## --------------------------------------------------------------------------------
## sedative_hypnotics
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 161 4 1 5
## Proportion 0.942 0.023 0.006 0.029
## --------------------------------------------------------------------------------
## opioids
## n missing distinct
## 171 4 3
##
## Value No Use Use Dependence
## Frequency 146 4 21
## Proportion 0.854 0.023 0.123
## --------------------------------------------------------------------------------
## court_order
## n missing distinct
## 170 5 2
##
## Value No Yes
## Frequency 155 15
## Proportion 0.912 0.088
## --------------------------------------------------------------------------------
## education
## n missing distinct Info Mean Gmd .05 .10
## 166 9 14 0.929 11.9 2.265 8.25 9.00
## .25 .50 .75 .90 .95
## 11.00 12.00 13.00 14.00 16.00
##
## lowest : 6 7 8 9 10, highest: 15 16 17 18 19
##
## Value 6 7 8 9 10 11 12 13 14 15 16
## Frequency 2 2 5 12 12 23 67 15 14 1 7
## Proportion 0.012 0.012 0.030 0.072 0.072 0.139 0.404 0.090 0.084 0.006 0.042
##
## Value 17 18 19
## Frequency 2 3 1
## Proportion 0.012 0.018 0.006
## --------------------------------------------------------------------------------
## hx_of_violence
## n missing distinct
## 164 11 2
##
## Value No Yes
## Frequency 124 40
## Proportion 0.756 0.244
## --------------------------------------------------------------------------------
## disorderly_conduct
## n missing distinct
## 164 11 2
##
## Value No Yes
## Frequency 45 119
## Proportion 0.274 0.726
## --------------------------------------------------------------------------------
## suicide
## n missing distinct
## 162 13 2
##
## Value No Yes
## Frequency 113 49
## Proportion 0.698 0.302
## --------------------------------------------------------------------------------
## abuse
## n missing distinct
## 161 14 8
##
## lowest : No Physical Sexual Emotional Physical & Sexual
## highest: Emotional Physical & Sexual Physical & Emotional Sexual & Emotional Physical, Sexual, & Emotional
## --------------------------------------------------------------------------------
## non_subst_dx
## n missing distinct
## 153 22 3
##
## Value None One More than one
## Frequency 102 35 16
## Proportion 0.667 0.229 0.105
## --------------------------------------------------------------------------------
## subst_dx
## n missing distinct
## 152 23 4
##
## Value None One Two Three or more
## Frequency 42 61 35 14
## Proportion 0.276 0.401 0.230 0.092
## --------------------------------------------------------------------------------
## psych_meds
## n missing distinct
## 57 118 3
##
## Value None One More than one
## Frequency 19 21 17
## Proportion 0.333 0.368 0.298
## --------------------------------------------------------------------------------
From this output, we can summarize each dataset feature as follows:
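- initial (character): 109 distinct values across 175 rows, no missing data
- age (numeric): values from 18 to 69, no missing data
- sex (categorical): 2 distinct values, no missing data
- race (categorical): 4 of the 6 coded levels are observed, no missing data
- adhd_q1 to adhd_q18 (ordinal): 5 distinct values each; only adhd_q5 has missing data (1 value)
- adhd_total (numeric): values from 0 to 72, no missing data
- md_q1a to md_q1m and md_q2 (categorical): 2 distinct values each, no missing data
- md_q3 (ordinal): 4 distinct values, no missing data
- md_total (numeric): values from 0 to 17, no missing data
- alcohol, thc, cocaine, stimulants, sedative_hypnotics, opioids (ordinal): 3 to 4 observed levels, each with 4 missing values
- court_order (categorical): 2 distinct values with missing data (5 values)
- education (numeric): values from 6 to 19 with missing data (9 values)
- hx_of_violence and disorderly_conduct (categorical): 2 distinct values each with missing data (11 values each)
- suicide (categorical): 2 distinct values with missing data (13 values)
- abuse (categorical): 8 distinct values with missing data (14 values)
- non_subst_dx (ordinal): 3 distinct values with missing data (22 values)
- subst_dx (ordinal): 4 distinct values with missing data (23 values)
- psych_meds (ordinal): 3 distinct values, with most of the data missing (118 values)

Removing an ID column: no feature in this dataset has as many unique values as there are rows (initial has only 109 distinct values for 175 rows), and there is no loan_id column here, so the removal step below (apparently carried over from an earlier analysis) is left commented out: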
# remove loan ID
# df <- df %>%
# select(-loan_id)Use naniar’s miss_var_summary() and vis_miss() functions to summarize and visualize the missing values in the features of the dataset:
# return a summary table of the missing data in each column
miss_var_summary(df)## # A tibble: 54 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 psych_meds 118 67.4
## 2 subst_dx 23 13.1
## 3 non_subst_dx 22 12.6
## 4 abuse 14 8
## 5 suicide 13 7.43
## 6 hx_of_violence 11 6.29
## 7 disorderly_conduct 11 6.29
## 8 education 9 5.14
## 9 court_order 5 2.86
## 10 alcohol 4 2.29
## # ... with 44 more rows
# visualize the amount of missing data for each feature
vis_miss( df, cluster = TRUE )
The figure above shows a grouped view of the missing values in each feature column. Overall, roughly 2.7% of the values are missing from the dataset. Many features have no missing values (for example age, sex, race, adhd_total, and md_total), and most of the remaining features have relatively few missing values. However, the psych_meds feature is missing 67.4% of its values.
Explore the missing data further by using the gg_miss_upset() function to show patterns of correlated missing values.
gg_miss_upset( df )
The figure above shows that the vast majority of rows only have a singleton missing value; this is represented by the 5 bars in the left of the plot with only one dot to indicate the missing feature. However, a small minority of rows have 2-3 missing elements; this is indicated by multiple dots under the 5 bars to the right side of the plot.
Since there are relatively few rows with multiple missing values, it would not adversely affect the analysis to remove them. The rest of the missing values can be dealt with by imputation.
# create a vector holding the number of NAs in each row; is.na() on a data
# frame returns a logical matrix, so rowSums() counts per row without
# apply()'s coercion of the whole data frame to a character matrix
count_na <- rowSums(is.na(df))
# keep only the rows with fewer than 2 missing values
df <- df[count_na < 2, ]
dim( df )## [1] 143 54
For a simple first approximation, we will use the simputation package\(^1\) to fill NA values for categorical and numeric features with ‘hot-deck’ imputation (i.e. values pulled at random from complete cases in the dataset).
# # single imputation analysis
# df <- bind_shadow( df ) %>%
# data.frame() %>%
# simputation::impute_rhd(., credit_history ~ 1 ) %>%
# simputation::impute_rhd(., loan_amount_term ~ 1 ) %>%
# simputation::impute_rhd(., loanamount ~ 1 ) %>%
# simputation::impute_rhd(., self_employed ~ 1 ) %>%
# simputation::impute_rhd(., gender ~ 1 ) %>%
# simputation::impute_rhd(., dependents ~ 1 ) %>%
# tbl_df() %>%
# select( -c(13:24) )Confirm that we have filled all NA values:
# return a summary table of the missing data in each column
miss_var_summary(df)## # A tibble: 54 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 psych_meds 87 60.8
## 2 adhd_q5 1 0.699
## 3 subst_dx 1 0.699
## 4 initial 0 0
## 5 age 0 0
## 6 sex 0 0
## 7 race 0 0
## 8 adhd_q1 0 0
## 9 adhd_q2 0 0
## 10 adhd_q3 0 0
## # ... with 44 more rows
Now that the rows with multiple missing values have been removed from the dataframe, we can explore the relationships of the variables in more depth. To start we visualize the distributions of the numeric variables grouped by the outcome of the target variable (suicide):
# numeric distributions
# Overlaid density curves for every numeric column, one facet per column,
# coloured by the suicide factor.
df %>%
  # numeric columns first, then the grouping factor
  select(where(is.numeric), suicide) %>%
  # long format: one row per (column name, value) pair
  gather(key = 'var', value = 'val', -suicide) %>%
  ggplot(aes(x = val, fill = suicide)) +
  geom_density(alpha = .3) +
  # free scales because the numeric columns have different ranges
  facet_wrap(~var, ncol = 2, scales = 'free') +
  theme_bw() +
  labs(
    title = 'Distribution of Numeric Variables',
    x = element_blank(),
    y = element_blank()
  )
 The distributions do not suggest any obviously significant differences when grouped by the target variable for any of the numeric features. It does not appear to be likely that either of these 3 features are correlated to
suicide. This can be confirmed with ANOVA\(^2\):
# # ANOVA for applicantincome
# applicantincome.aov <- aov(applicantincome ~ loan_status, data = df)
# # Summary of the analysis
# summary(applicantincome.aov)
# ```
# ```{r}
# # ANOVA for coapplicantincome
# coapplicantincome.aov <- aov(coapplicantincome ~ loan_status, data = df)
# # Summary of the analysis
# summary(coapplicantincome.aov)# # ANOVA for applicantincome
# loanamount.aov <- aov(loanamount ~ loan_status, data = df)
# # Summary of the analysis
# summary(loanamount.aov)The p-values for all three ANOVA tests are very high indicating that there is no significant relationship between the features variables and the target.
Here we can look for correlations between feature variables
# restrict the data to its numeric columns for the correlation analysis
df_numeric <- select(df, where(is.numeric))
# select(applicantincome, coapplicantincome, loanamount )
# NOTE(review): plot_corr_matrix() is not defined in the visible part of this
# file; confirm it is defined or sourced elsewhere
plot_corr_matrix(df_numeric, -1)
 We can see a strong positive correlation between the features
applicantincome and loanamount. There is a weak positive correlation between coapplicantincome and loanamount. Interestingly there is a weak negative correlation between applicantincome and coapplicantincome; presumptively due to a high-earning family being able to sustain with a single income.
Now we turn to the categorical features to see if there are any strong relationships between them and the target variable.
The following code will visualize the proportions of each target variable level for each level of a given feature:
# class sizes of the target, used to turn counts into within-class
# proportions; na.rm keeps the totals defined if suicide contains NA
yes_count <- sum(df$suicide == 'Yes', na.rm = TRUE)
no_count <- sum(df$suicide == 'No', na.rm = TRUE)
df %>%
  # keep the non-numeric columns; predicate functions must be wrapped in
  # where() inside select() (a bare predicate is deprecated)
  select(!where(is.numeric)) %>%
  # long format: one row per (column name, level) pair
  gather(var, value, -suicide) %>%
  group_by(var, value, suicide) %>%
  summarise(count = n(),
            .groups = 'drop') %>%
  # share of each suicide class that falls in a given level
  mutate(prop = count / ifelse(suicide == 'Yes', yes_count, no_count)) %>%
  ggplot(aes(x = value, y = prop, fill = suicide)) +
  geom_col(position = 'dodge') +
  facet_wrap(~var, scales = 'free') +
  theme_bw() +
  labs(y = 'Frequency Proportion',
       x = element_blank(),
       title = 'Frequency Distributions For Non-Numeric Variables') +
  scale_y_continuous(labels = percent_format(accuracy = 1))
When interpreting the categorical bar plots, differences between loan_status for a given feature-level suggest that a relationship exists between a feature and the target variable. For example, we see a clear difference between the Y/N bars for credit_history, married and property_area whereas the is little difference for the levels of gender and no noticeable difference for self_employed.
The existence of a significant relationship between the categorical features and the target variable can be evaluated with a Chi-square test\(^3\).
# # Chi-square test for credit_history
# test <- chisq.test(table(df$credit_history, df$loan_status))
# test# # Chi-square test for married
# test <- chisq.test(table(df$married, df$loan_status))
# test# # Chi-square test for property_area
# test <- chisq.test(table(df$property_area, df$loan_status))
# test# # Chi-square test for education
# test <- chisq.test(table(df$education, df$loan_status))
# test# # Chi-square test for loan_amount_term
# test <- chisq.test(table(df$loan_amount_term, df$loan_status))
# test# # Chi-square test for dependents
# test <- chisq.test(table(df$dependents, df$loan_status))
# test# # Chi-square test for gender
# test <- chisq.test(table(df$gender, df$loan_status))
# test# # Chi-square test for self employed
# test <- chisq.test(table(df$self_employed, df$loan_status))
# test# impute NA daaaang!, sorry Leo, I didn't see that you impute the NAs here until too late
# preproc <- preProcess(df, 'bagImpute')
# df2 <- predict(preproc, df)
# the bagged imputation above is disabled, so df2 is df unchanged
df2 <- df
# %>%
# select( married, property_area, credit_history, education, loan_amount_term, loan_status )
# train test split
# fixed seed so the partition is reproducible
set.seed(101)
# 75% of the rows go to training, partitioned on the suicide target;
# list = FALSE returns the row indices as a matrix instead of a list
# (FALSE is spelled out because the shorthand F can be reassigned)
trainIndex <- createDataPartition(df2$suicide,
                                  p = 0.75,
                                  list = FALSE)
train <- df2[trainIndex,]
test <- df2[-trainIndex,]
# cross validation train control: 10-fold cross-validation
ctrl <- trainControl(method = 'cv', number = 10)