We’ll be working with a mental health dataset and will be conducting exploratory data analysis, unsupervised clustering, principal component analysis, gradient boosting, and support vector machines.
To begin, the following code loads the required libraries and imports the data:
library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(VIM)
library(corrplot)
library(purrr)
library(scales)
library(caret)
library(Hmisc)
library(naniar)
library(conflicted)
library(pheatmap)
library(factoextra)
library(gbm)
library(gridExtra)
library(MLmetrics)
# resolve function name conflict
conflict_prefer('filter', 'dplyr')
conflict_prefer('summarize', 'dplyr')
# import data
url <- 'https://raw.githubusercontent.com/SmilodonCub/Data622_group5_projects/main/ADHD_data.csv'
df <- read.csv(url, header=T, na.strings="")
To facilitate ease of use, we'll rename the columns. Additionally, we'll convert each of the number-coded fields to factors with the proper labels.
# convert column names to lowercase
names(df) <- tolower(names(df))
# replace periods with underscore
names(df) <- str_replace_all(names(df), '\\.', '_')
# rename last column to remove trailing underscore
names(df)[ncol(df)] <- 'psych_meds'
# raw data with renamed columns
df_raw <- df
names(df_raw)
## [1] "initial" "age" "sex"
## [4] "race" "adhd_q1" "adhd_q2"
## [7] "adhd_q3" "adhd_q4" "adhd_q5"
## [10] "adhd_q6" "adhd_q7" "adhd_q8"
## [13] "adhd_q9" "adhd_q10" "adhd_q11"
## [16] "adhd_q12" "adhd_q13" "adhd_q14"
## [19] "adhd_q15" "adhd_q16" "adhd_q17"
## [22] "adhd_q18" "adhd_total" "md_q1a"
## [25] "md_q1b" "md_q1c" "md_q1d"
## [28] "md_q1e" "md_q1f" "md_q1g"
## [31] "md_q1h" "md_q1i" "md_q1j"
## [34] "md_q1k" "md_q1l" "md_q1m"
## [37] "md_q2" "md_q3" "md_total"
## [40] "alcohol" "thc" "cocaine"
## [43] "stimulants" "sedative_hypnotics" "opioids"
## [46] "court_order" "education" "hx_of_violence"
## [49] "disorderly_conduct" "suicide" "abuse"
## [52] "non_subst_dx" "subst_dx" "psych_meds"
# Sex
df$sex <- factor(df$sex, levels = c(1,2), labels = c('Male','Female'))
# Race
df$race <- factor(df$race, levels = c(1,2,3,4,5,6), labels = c('White','African American','Hispanic','Asian','Native American','Other or Missing Data'))
# ADHD q1 - q18
adhd_cols <- names(df[,5:22])
df[adhd_cols] <- lapply(df[adhd_cols], factor, levels = c(0,1,2,3,4), labels = c('Never','Rarely','Sometimes','Often','Very Often'))
# Mood Disorder q1a - q2
md_cols <- names(df[,24:37])
df[md_cols] <- lapply(df[md_cols], factor, levels = c(0,1), labels = c('No','Yes'))
# Mood Disorder q3
df$md_q3 <- factor(df$md_q3, levels = c(0,1,2,3), labels = c('No Problem','Minor','Moderate','Serious'))
# Substance Abuse
sa_cols <- names(df[,40:45])
df[sa_cols] <- lapply(df[sa_cols], factor, levels = c(0,1,2,3), labels = c('No Use','Use','Abuse','Dependence'))
# Court Order
df$court_order <- factor(df$court_order, levels = c(0,1), labels = c('No','Yes'))
# Education
# think it might be okay to leave this as a number
# History of Violence, Disorderly Conduct, Suicide Attempt
hist_cols <- names(df[,48:50])
df[hist_cols] <- lapply(df[hist_cols], factor, levels = c(0,1), labels = c('No','Yes'))
# Abuse History
df$abuse <- factor(df$abuse, levels = c(0,1,2,3,4,5,6,7),
labels = c('No','Physical','Sexual','Emotional','Physical & Sexual','Physical & Emotional','Sexual & Emotional','Physical, Sexual, & Emotional'))
# Non-Substance Related Drugs
df$non_subst_dx <- factor(df$non_subst_dx, levels = c(0,1,2), labels = c('None','One','More than one'))
# Substance Related Drugs
df$subst_dx <- factor(df$subst_dx, levels = c(0,1,2,3), labels = c('None','One','Two','Three or more'))
# Psychiatric Meds
df$psych_meds <- factor(df$psych_meds, levels = c(0,1,2), labels = c('None','One','More than one'))
# copy df
df_raw2 <- df
# str(df)
The following code quantitatively and visually explores the nature of the dataset.
We begin by describing the dataset features.
Use dplyr’s glimpse() function to take a quick look at the data structure:
# quick look at what the data structure looks like
glimpse(df)
## Rows: 175
## Columns: 54
## $ initial <chr> "JA", "LA", "MD", "RD", "RB", "SB", "PK", "RJ", "DJ…
## $ age <int> 24, 48, 51, 43, 34, 39, 41, 48, 44, 27, 44, 56, 53,…
## $ sex <fct> Male, Female, Female, Male, Male, Female, Female, M…
## $ race <fct> White, White, White, White, White, White, White, Wh…
## $ adhd_q1 <fct> Rarely, Often, Sometimes, Often, Very Often, Someti…
## $ adhd_q2 <fct> Rarely, Often, Rarely, Often, Very Often, Often, So…
## $ adhd_q3 <fct> Very Often, Very Often, Sometimes, Sometimes, Somet…
## $ adhd_q4 <fct> Sometimes, Very Often, Rarely, Sometimes, Very Ofte…
## $ adhd_q5 <fct> Often, NA, Often, Very Often, Very Often, Often, Ve…
## $ adhd_q6 <fct> Rarely, Sometimes, Often, Often, Sometimes, Sometim…
## $ adhd_q7 <fct> Rarely, Sometimes, Often, Sometimes, Often, Often, …
## $ adhd_q8 <fct> Often, Often, Sometimes, Very Often, Very Often, Ve…
## $ adhd_q9 <fct> Sometimes, Sometimes, Never, Very Often, Very Often…
## $ adhd_q10 <fct> Very Often, Very Often, Rarely, Sometimes, Sometime…
## $ adhd_q11 <fct> Sometimes, Rarely, Sometimes, Often, Very Often, Ve…
## $ adhd_q12 <fct> Very Often, Very Often, Never, Rarely, Rarely, Some…
## $ adhd_q13 <fct> Rarely, Sometimes, Sometimes, Often, Often, Very Of…
## $ adhd_q14 <fct> Never, Very Often, Sometimes, Often, Sometimes, Ver…
## $ adhd_q15 <fct> Often, Very Often, Often, Rarely, Rarely, Often, Ve…
## $ adhd_q16 <fct> Rarely, Often, Sometimes, Sometimes, Sometimes, Ver…
## $ adhd_q17 <fct> Often, Rarely, Rarely, Rarely, Rarely, Often, Somet…
## $ adhd_q18 <fct> Very Often, Very Often, Rarely, Sometimes, Rarely, …
## $ adhd_total <int> 40, 55, 31, 45, 48, 55, 54, 41, 56, 56, 42, 38, 31,…
## $ md_q1a <fct> Yes, Yes, No, Yes, No, No, Yes, No, Yes, Yes, Yes, …
## $ md_q1b <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes…
## $ md_q1c <fct> Yes, Yes, No, No, No, No, No, No, No, No, Yes, No, …
## $ md_q1d <fct> Yes, Yes, No, No, Yes, Yes, No, No, Yes, No, Yes, N…
## $ md_q1e <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, Yes, Yes, Yes,…
## $ md_q1f <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Ye…
## $ md_q1g <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Ye…
## $ md_q1h <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No…
## $ md_q1i <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No…
## $ md_q1j <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, No, No, …
## $ md_q1k <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, Yes, No,…
## $ md_q1l <fct> No, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ md_q1m <fct> Yes, No, No, Yes, No, No, No, No, Yes, Yes, Yes, No…
## $ md_q2 <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
## $ md_q3 <fct> Serious, Serious, Moderate, Serious, Moderate, Seri…
## $ md_total <int> 15, 14, 5, 13, 7, 14, 9, 7, 12, 11, 16, 0, 11, 10, …
## $ alcohol <fct> Use, No Use, No Use, Use, Use, Use, Dependence, No …
## $ thc <fct> Use, No Use, No Use, Use, Use, No Use, Dependence, …
## $ cocaine <fct> Use, No Use, No Use, Use, No Use, No Use, Use, No U…
## $ stimulants <fct> No Use, No Use, No Use, Use, No Use, No Use, Use, N…
## $ sedative_hypnotics <fct> No Use, No Use, No Use, No Use, No Use, No Use, Use…
## $ opioids <fct> No Use, No Use, No Use, No Use, No Use, No Use, No …
## $ court_order <fct> Yes, No, No, No, Yes, No, No, No, No, No, No, No, N…
## $ education <int> 11, 14, 12, 12, 9, 11, 12, 16, 12, 9, 12, 18, 12, 1…
## $ hx_of_violence <fct> No, No, No, No, Yes, No, Yes, Yes, Yes, No, Yes, No…
## $ disorderly_conduct <fct> Yes, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ suicide <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, No, No, Yes, N…
## $ abuse <fct> "No", "Physical & Sexual", "Sexual & Emotional", "P…
## $ non_subst_dx <fct> More than one, One, More than one, More than one, M…
## $ subst_dx <fct> None, None, None, None, None, None, None, One, None…
## $ psych_meds <fct> More than one, One, One, More than one, None, None,…
From this output, we can summarize each dataset feature as follows:
| Column Numbers | Column Labels | Description |
|---|---|---|
| 1 | initial | (string) subject's initials |
| 2 | age | (numeric) integer values (years) |
| 3 | sex | (categorical) binary ('male' and 'female') |
| 4 | race | (categorical) |
| 5-22 | adhd_q1-adhd_q18 | (categorical) ordinal values |
| 23 | adhd_total | (numeric) summary feature derived from adhd_q1-adhd_q18 |
| 24-38 | md_q1a-md_q3 | (categorical) ordinal values |
| 39 | md_total | (numeric) summary feature derived from md_q1a-md_q3 |
| 40 | alcohol | (categorical) ordinal values |
| 41 | thc | (categorical) ordinal values |
| 42 | cocaine | (categorical) ordinal values |
| 43 | stimulants | (categorical) ordinal values |
| 44 | sedative_hypnotics | (categorical) ordinal values |
| 45 | opioids | (categorical) ordinal values |
| 46 | court_order | (categorical) binary (yes/no) |
| 47 | education | (numeric) integer values (years) |
| 48 | hx_of_violence | (categorical) binary (yes/no) |
| 49 | disorderly_conduct | (categorical) binary (yes/no) |
| 50 | suicide | (categorical) binary (yes/no) |
| 51 | abuse | (categorical) ordinal values |
| 52 | non_subst_dx | (categorical) ordinal values |
| 53 | subst_dx | (categorical) ordinal values |
| 54 | psych_meds | (categorical) ordinal values |
The columns adhd_q1-adhd_q18 and md_q1a-md_q3 are summarized by the derived columns adhd_total and md_total, respectively. The adhd features correspond to an ADHD self-report survey, whereas the md features give responses to a mood disorder self-report survey. For the initial Exploratory Data Analysis, the individual question responses will be set aside in favor of visualizations and summary statistics on the derived columns adhd_total and md_total. A detailed analysis of the individual survey questions will be taken up in the Principal Components Analysis section.
Next, we visualize the extent of the missing values using the naniar library.
Use naniar’s miss_var_summary() and vis_miss() functions to summarize and visualize the missing values in the features of the dataset:
# return a summary table of the missing data in each column
miss_var_summary(df)
## # A tibble: 54 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 psych_meds 118 67.4
## 2 subst_dx 23 13.1
## 3 non_subst_dx 22 12.6
## 4 abuse 14 8
## 5 suicide 13 7.43
## 6 hx_of_violence 11 6.29
## 7 disorderly_conduct 11 6.29
## 8 education 9 5.14
## 9 court_order 5 2.86
## 10 alcohol 4 2.29
## # … with 44 more rows
# visualize the amount of missing data for each feature
vis_miss( df, cluster = TRUE )
The figure above shows a grouped view of the missing values in each feature column. Overall, 2.7% of the values are missing from the dataset. The majority of the features have no missing values, and many have relatively few. However, the psych_meds feature is missing a considerable portion of its data, so we will drop it below.
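To quantify the overall missingness directly, naniar's pct_miss() returns the percentage of missing cells across the whole dataframe:
# overall percentage of missing cells; should agree with the ~2.7% noted above
pct_miss( df )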
df <- df %>%
select( -c(psych_meds) )
Having dropped psych_meds, we can now use the gg_miss_upset() function to look for any remaining patterns of missing values in the data.
gg_miss_upset( df )
Both the vis_miss() and gg_miss_upset() visualizations are informative because they reveal interesting patterns in the missing values. For instance, once we remove psych_meds, most of the remaining incomplete rows hold multiple missing values from a subset of the features. In other words, the remaining missing values are not randomly dispersed across the dataset; rather, they are concentrated in a subset of the features: disorderly_conduct, suicide, abuse, non_subst_dx and subst_dx. Notably, the responses that correspond to the ADHD and Mood Disorder self-reporting surveys have no missing values; this knowledge will be helpful downstream. Further handling of missing values will be tailored to each analysis section below.
To gain a feel for the data, we will evaluate visualizations of select features.
Summary Statistics of Demographic Variables
# summary of each demographic feature
demo_df <- df %>%
select(c('age','sex','race','education'))
describe( demo_df )
## demo_df
##
## 4 Variables 175 Observations
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 175 0 42 0.999 39.47 12.8 22.0 24.0
## .25 .50 .75 .90 .95
## 29.5 42.0 48.0 53.0 56.0
##
## lowest : 18 19 20 21 22, highest: 55 56 57 61 69
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 175 0 2
##
## Value Male Female
## Frequency 99 76
## Proportion 0.566 0.434
## --------------------------------------------------------------------------------
## race
## n missing distinct
## 175 0 4
##
## Value White African American Hispanic
## Frequency 72 100 1
## Proportion 0.411 0.571 0.006
##
## Value Other or Missing Data
## Frequency 2
## Proportion 0.011
## --------------------------------------------------------------------------------
## education
## n missing distinct Info Mean Gmd .05 .10
## 166 9 14 0.929 11.9 2.265 8.25 9.00
## .25 .50 .75 .90 .95
## 11.00 12.00 13.00 14.00 16.00
##
## lowest : 6 7 8 9 10, highest: 15 16 17 18 19
##
## Value 6 7 8 9 10 11 12 13 14 15 16
## Frequency 2 2 5 12 12 23 67 15 14 1 7
## Proportion 0.012 0.012 0.030 0.072 0.072 0.139 0.404 0.090 0.084 0.006 0.042
##
## Value 17 18 19
## Frequency 2 3 1
## Proportion 0.012 0.018 0.006
## --------------------------------------------------------------------------------
Visualizing the demographic information of the mental health study
p1 <- ggplot( df, aes(x = age) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Age Density Plot' )
num_obs <- dim(df)[1]
p2 <- df %>%
dplyr::count( sex ) %>%
dplyr::mutate( n = n/num_obs ) %>%
ggplot( aes(x=sex, y=n)) +
geom_bar(stat = "identity") +
ylim( 0, 0.6 ) +
ylab( 'proportion' ) +
ggtitle( 'Distribution of Sex (binary)' ) +
theme_classic()
p3 <- df %>%
dplyr::count( race ) %>%
dplyr::mutate( n = n/num_obs ) %>%
ggplot( aes(x=race, y=n)) +
geom_bar(stat = "identity") +
ylab( 'proportion' ) +
ylim( 0, 0.6 ) +
ggtitle( 'Distribution of Race' ) +
theme_classic() +
scale_x_discrete(guide = guide_axis(n.dodge = 2))
p4 <- ggplot( df, aes(x = education) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Education Density Plot' )
grid.arrange(p1, p4, p2, p3, ncol=2)
These visualizations characterize the demographic makeup of the mental health study participants: the distributions of age, education, sex, and race.
Summary Statistics for self-report surveys
# summary of each field
initial_eda_df <- df %>%
select( c('adhd_total', 'md_total' ) )
describe( initial_eda_df )
## initial_eda_df
##
## 2 Variables 175 Observations
## --------------------------------------------------------------------------------
## adhd_total
## n missing distinct Info Mean Gmd .05 .10
## 175 0 62 0.999 34.32 19.16 7.0 12.0
## .25 .50 .75 .90 .95
## 21.0 33.0 47.5 55.0 62.3
##
## lowest : 0 1 3 5 6, highest: 65 67 69 71 72
## --------------------------------------------------------------------------------
## md_total
## n missing distinct Info Mean Gmd .05 .10
## 175 0 18 0.995 10.02 5.469 0.7 3.0
## .25 .50 .75 .90 .95
## 6.5 11.0 14.0 16.0 17.0
##
## lowest : 0 1 2 3 4, highest: 13 14 15 16 17
##
## Value 0 1 2 3 4 5 6 7 8 9 10
## Frequency 9 3 5 6 4 7 10 6 8 12 13
## Proportion 0.051 0.017 0.029 0.034 0.023 0.040 0.057 0.034 0.046 0.069 0.074
##
## Value 11 12 13 14 15 16 17
## Frequency 18 12 13 12 14 12 11
## Proportion 0.103 0.069 0.074 0.069 0.080 0.069 0.063
## --------------------------------------------------------------------------------
Visualizing the adhd_total and md_total features. These two features give a summary score for the questions on an ADHD Self-Reporting Survey and a separate Mood Disorder Questionnaire.
p1 <- ggplot( df, aes(x = adhd_total) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'ADHD Self-Reporting Survey Scores' )
p2 <- ggplot( df, aes(x = md_total) ) +
geom_density( fill="#9D85FC", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Mood Disorder Self-Reporting Survey Scores' )
grid.arrange( p1, p2, ncol=2 )
adhd_total has a roughly normal distribution with a mean of 34.3 and a range of 0 to 72. md_total has an entirely different range, from 0 to 17; its distribution is more left-skewed than that of adhd_total.
Next, we visualize adhd_total ~ md_total to evaluate the relationship between the two total scores.
ggplot(df, aes(x=md_total, y=adhd_total)) +
geom_point()+
geom_smooth(method=lm) +
theme_classic() +
ggtitle('ADHD total ~ Mood Disorder total')
We can evaluate the fit of a linear model to the data via the summary report and diagnostic plots:
fit = lm( adhd_total ~ md_total, df )
summary( fit )
##
## Call:
## lm(formula = adhd_total ~ md_total, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.718 -8.672 -1.941 9.163 38.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.5104 2.5173 6.559 0.000000000603804 ***
## md_total 1.7769 0.2265 7.844 0.000000000000432 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.38 on 173 degrees of freedom
## Multiple R-squared: 0.2623, Adjusted R-squared: 0.2581
## F-statistic: 61.53 on 1 and 173 DF, p-value: 0.0000000000004319
par(mfrow = c(2, 2))
p1 <- plot(fit, which=1)
p2 <- plot(fit, which=2)
p3 <- plot(fit, which=3)
p4 <- plot(fit, which=4)
par(mfrow = c(1, 1))
The fit is statistically significant and the diagnostics suggest that the assumptions of linearity are reasonably met. Therefore, we can say with confidence that for each one-point increase in Mood Disorder score, the average ADHD score increases by roughly 1.78 points.
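To back up that interpretation, base R's confint() gives a 95% confidence interval for the slope:
# 95% confidence interval for the md_total coefficient of the fitted model
confint( fit, 'md_total', level = 0.95 )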
Summary Statistics for Substance Misuse Features
# summary of each field
sub_df <- df %>%
select( c( 'alcohol', 'thc', 'cocaine', 'stimulants', 'sedative_hypnotics', 'opioids', 'subst_dx' ) )
describe( sub_df )
## sub_df
##
## 7 Variables 175 Observations
## --------------------------------------------------------------------------------
## alcohol
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 80 18 7 66
## Proportion 0.468 0.105 0.041 0.386
## --------------------------------------------------------------------------------
## thc
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 116 12 3 40
## Proportion 0.678 0.070 0.018 0.234
## --------------------------------------------------------------------------------
## cocaine
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 101 9 5 56
## Proportion 0.591 0.053 0.029 0.327
## --------------------------------------------------------------------------------
## stimulants
## n missing distinct
## 171 4 3
##
## Value No Use Use Dependence
## Frequency 160 6 5
## Proportion 0.936 0.035 0.029
## --------------------------------------------------------------------------------
## sedative_hypnotics
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 161 4 1 5
## Proportion 0.942 0.023 0.006 0.029
## --------------------------------------------------------------------------------
## opioids
## n missing distinct
## 171 4 3
##
## Value No Use Use Dependence
## Frequency 146 4 21
## Proportion 0.854 0.023 0.123
## --------------------------------------------------------------------------------
## subst_dx
## n missing distinct
## 152 23 4
##
## Value None One Two Three or more
## Frequency 42 61 35 14
## Proportion 0.276 0.401 0.230 0.092
## --------------------------------------------------------------------------------
The individual substance features (e.g. opioids) describe the severity with which a participant uses or does not use a particular substance. On the other hand, subst_dx counts how many substances a participant has reported using, irrespective of the severity of use. The following figure attempts to find patterns between the individual substances and the summary feature.
none_count <- sum(sub_df$subst_dx == 'None')
one_count <- sum(sub_df$subst_dx == 'One')
two_count <- sum(sub_df$subst_dx == 'Two')
more_count <- sum(sub_df$subst_dx == 'Three or more')
total_count <- dim(sub_df)[1]
plot_df <- sub_df %>%
drop_na() %>%
#mutate_if(is.factor, as.numeric) %>%
gather(var, value, -subst_dx) %>%
group_by(var, value, subst_dx) %>%
dplyr::summarize(count = n(),
.groups = 'drop') %>%
dplyr::mutate(prop = count / case_when(subst_dx == 'None' ~ none_count,
subst_dx == 'One' ~ one_count,
subst_dx == 'Two' ~ two_count,
subst_dx == 'Three or more' ~ more_count))
plot_df$value <- factor(plot_df$value,levels = c("No Use", "Use", "Dependence", "Abuse"))
ggplot(plot_df, aes(x = value, y = count, fill = subst_dx)) +
geom_col(position="stack") +#, position = 'dodge') +
facet_wrap(~var, scales = 'free') +
theme_bw() +
labs(y = 'count',
x = element_blank(),
title = 'Distributions For Substances') +
theme(axis.text.x = element_text(angle = 35, hjust = 1))
From the figure above, we see that for each individual substance, the majority of participants report 'No Use'. However, the instances of 'Dependence' are elevated for several substances, notably alcohol, cocaine and THC. For these substances, a substantial proportion of participants also reported a substance abuse diagnosis (subst_dx) of three or more substances. There is another interesting result in this figure: there are many instances where a level of substance use was self-reported by a participant who also reported 'None' for substance abuse diagnosis. It is curious that a participant can, for example, report a dependence on alcohol yet report 'None' for a substance abuse diagnosis. Perhaps more documentation on the dataset can shed light on this.
To re-represent this data, we will generate a new summary feature that sums the reported severity of use for each substance included in the self-report:
sub_df <- sub_df %>%
mutate_if(is.factor, as.numeric) %>%
dplyr::mutate( subst_sum = alcohol + thc + cocaine + stimulants + sedative_hypnotics + opioids )
We can derive a similar summary variable for the non-substance abuse behaviors reported (e.g. disorderly conduct, court order, etc.). However, we exclude abuse because it has different, non-binary reporting criteria.
abuse_df <- df %>%
select( c( 'suicide', 'disorderly_conduct', 'hx_of_violence', 'court_order' ) ) %>%
mutate_if(is.factor, as.numeric) %>%
dplyr::mutate( abuse_sum = suicide + disorderly_conduct + hx_of_violence + court_order )
Visualize the distribution of the new summary variables:
p1 <- ggplot( sub_df, aes(x = subst_sum) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Substance Use Severity Sum' )
p2 <- ggplot( abuse_df, aes(x = abuse_sum) ) +
geom_density( fill="#9D85FC", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Behavioral History Sum' )
grid.arrange( p1, p2, ncol=2 )
For one last exploratory visualization, we will look at correlations between the self-reporting survey summary features (adhd_total & md_total) and the new summary features generated here. A positive correlation between the two survey totals is expected, as demonstrated by the linear regression earlier.
df_feats <- df %>%
select( c('adhd_total','md_total') ) %>%
dplyr::mutate( subst_sum = as.integer( sub_df$subst_sum ),
abuse_sum = as.integer( abuse_df$abuse_sum ) ) %>%
drop_na()
corrplot(cor(df_feats), method="number")
From the correlation plot above, we do not see a strong relationship between the self-reported survey total scores and the new features we generated to summarize substance use and non-substance behaviors. However, in upcoming sections we will use machine learning techniques to investigate this relationship further.
In order to find clusters of patients, we first need to determine the number of clusters. Below, we'll use two common methods, evaluating the within-cluster sum of squares and the average silhouette width, to decide how many clusters to use. Since clustering requires data with no null values, we'll impute with the bagImpute method from caret, using the raw data from before each field was converted to factors.
# exclude names
df3 <- df_raw %>%
select(-initial)
# impute null
preproc <- preProcess(df3, 'bagImpute')
df4 <- predict(preproc, df3)
fviz_nbclust(df4, kmeans, method = "wss")
Based on the sum of squares for each value of k, we can use 2 clusters as the 'elbow' point. This gives us a good starting point for determining k, but we should note that in unsupervised learning there is no single definitive value of k. For the purposes of this homework assignment, we'll choose one value of k and proceed with the analysis, but in real-life situations it could be helpful to try multiple values of k and analyze the results for anything interesting.
fviz_nbclust(df4, kmeans, method = "silhouette")Based on the average silhouette width of clusters, the optimal number of clusters shows k = 2.
Since both methods above have a consensus on two clusters, we’ll proceed with calculating the clusters using k = 2 and then plot the distributions to observe and note any relative differences.
Once the clusters are calculated, let’s check the size of each cluster to make sure we’re looking at different groups with enough data points.
# 25 random starts
cluster_k2 <- kmeans(df4, 2, nstart = 25)
# add cluster number to dataframe
df4$cluster_k2 <- cluster_k2$cluster
# check cluster sizes
table(df4$cluster_k2)
##
## 1 2
## 88 87
Using k = 2, we created an almost even split of observations. Cluster 1 has 88 observations and cluster 2 has 87.
Note that when this file is re-run, the cluster numbers may switch, or different clusters may form altogether. We used the parameter nstart = 25, which increases the chance that we'll end up with the same clusters. It's possible that the algorithm ends up with a very close vote, like 12 to 13 or 11 to 14, which could lead to different clusters and not just switched cluster labels.
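To make the stability caveat concrete, here is a minimal sketch (not part of the original analysis; feats, run1 and run2 are illustrative names) that re-runs k-means from two different seeds and cross-tabulates the memberships:
# drop the stored labels, then re-run k-means under two different seeds
feats <- subset( df4, select = -cluster_k2 )
set.seed(1); run1 <- kmeans( feats, 2, nstart = 25 )$cluster
set.seed(2); run2 <- kmeans( feats, 2, nstart = 25 )$cluster
# mass off the diagonal beyond a pure label swap indicates unstable clusters
table( run1, run2 )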
# adhd vs suicide
fviz_cluster(cluster_k2, data = df4, c('adhd_total', 'suicide'))
The clusters show a clear delineation of respondents based on adhd_total. This suggests that the clusters are separated by levels of ADHD. Note that the variables have not been scaled. Since adhd_total has a much wider variance than each of the individual ADHD question variables, which only range from 0 to 4, it's likely that different clusters would form if all the data were scaled.
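As a hedged check of that scaling caveat, one could re-cluster on standardized features and compare memberships; a sketch (df4_scaled and cluster_k2_scaled are illustrative names):
# standardize the features so no single column dominates the distance metric
df4_scaled <- scale( subset( df4, select = -cluster_k2 ) )
cluster_k2_scaled <- kmeans( df4_scaled, 2, nstart = 25 )
# cross-tabulate scaled vs. unscaled cluster memberships
table( df4$cluster_k2, cluster_k2_scaled$cluster )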
# mood disorder vs suicide
fviz_cluster(cluster_k2, data = df4, c('md_total', 'suicide'))
The results of the k = 2 clustering show considerable overlap on md_total, meaning the clusters are not separated by mood disorder levels. Note that md_total, like adhd_total, is not scaled. Because the clusters overlap here, we can infer that the clusters weren't formed simply because of adhd_total's wide variance, but because the two groups differ in other ways as well.
Let’s examine relative distributions for each cluster and variable. We decided to use probability density charts because we’re most interested in relative differences in proportion of the clusters.
# function to plot relative variable distributions for clusters
cluster_plot <- function(start_col, end_col, plot = 'density', legend_position = 'bottom') {
temp_df <- df4[,start_col:end_col] %>%
bind_cols(cluster = factor(df4$cluster_k2)) %>%
gather(var, val, -cluster)
if (plot == 'density') {
p <- ggplot(temp_df, aes(x = val, fill = cluster)) +
geom_density(alpha = 0.3)
} else if (plot == 'histogram') {
p <- ggplot(temp_df, aes(x = val, fill = cluster)) +
geom_histogram(alpha = 0.3)
}
p +
facet_wrap(~var, scales = 'free') +
theme_bw() +
labs(fill = 'Cluster',
x = element_blank(),
y = element_blank()
) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(size = 8),
legend.position = legend_position)
}
cluster_plot(1,3)
cluster_plot(4,22)
With clear differences for each question as well as the total, cluster 1 reported higher ADHD values relative to cluster 2.
cluster_plot(23,38)
Some questions have similar distributions, but overall cluster 1 is more likely to report higher levels of mood disorder symptoms.
cluster_plot(39,44)
The two clusters show different drug use patterns depending on the type of drug.
cluster_plot(51,53)
Cluster 2 has a higher chance of taking no non-substance-related drugs and is more likely to take one substance-related drug.
In this section we will explore the ADHD dataset further using Principal Component Analysis (PCA). PCA is a dimensionality reduction method that can be very useful for detecting patterns in complex, feature-rich data. At its core, PCA is a fairly simple linear algebra manipulation: an eigendecomposition. PCA maps the data onto a new set of components that consist of orthogonal axes (eigenvectors) and magnitudes (eigenvalues). Each successive component is added such that it describes as much of the remaining variance in the data as possible. From the resulting components we can select a subset that describes most of the variance in a new, lower-dimensional feature space, which is more convenient for observing relationships previously masked by the data's complexity.
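As a minimal sketch of that mapping (illustrative only; it uses the built-in mtcars data rather than the ADHD data):
# project centered/scaled data onto the eigenvectors of its covariance matrix
X <- scale( mtcars[, c('mpg', 'hp', 'wt')] )
eig <- eigen( cov( X ) ) # eigendecomposition
scores <- X %*% eig$vectors # the data mapped into principal-component space
head( scores, 3 )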
To approach this analysis, we derive 2 sets of features from the original data. Data features that correspond to 1) self-reported answers for an ADHD survey, 2) self-reported answers for a Mood Disorder survey.
The Adult ADHD Self-Reporting Scale Symptoms Checklist is a survey consisting of 18 questions that all range on an ordinal categorical scale: ‘Never’, ‘Rarely’, ‘Sometimes’, ‘Often’, ‘Very Often’
To prepare the ADHD survey responses, we dropped adhd_total, which gives a summary score for the survey, and recoded all questions from factor to numeric values.
# preparing the ADHD self-report questionnaire data
adhd_df <- df %>%
select( starts_with("adhd_") ) %>%
select( -'adhd_total' )
# recode all questions from factor to numeric values
adhd_df <- adhd_df %>%
mutate_at(
vars(matches("adhd_q") ),
~ recode_factor( ., 'Never'=0,'Rarely'=1,'Sometimes'=2,'Often'=3,'Very Often'=4 ) ) %>%
mutate_if( is.factor, as.numeric )
paste( 'initial number of rows:', dim( adhd_df )[1] )
## [1] "initial number of rows: 175"
Summary of missing responses in the ADHD Self-Reporting Survey:
# return a summary table of the missing data in each column
miss_var_summary( adhd_df )
## # A tibble: 18 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 adhd_q5 1 0.571
## 2 adhd_q1 0 0
## 3 adhd_q2 0 0
## 4 adhd_q3 0 0
## 5 adhd_q4 0 0
## 6 adhd_q6 0 0
## 7 adhd_q7 0 0
## 8 adhd_q8 0 0
## 9 adhd_q9 0 0
## 10 adhd_q10 0 0
## 11 adhd_q11 0 0
## 12 adhd_q12 0 0
## 13 adhd_q13 0 0
## 14 adhd_q14 0 0
## 15 adhd_q15 0 0
## 16 adhd_q16 0 0
## 17 adhd_q17 0 0
## 18 adhd_q18 0 0
We see that there is only 1 missing value, so it seems reasonable to simply drop that row.
# remove missing
adhd_dna <- adhd_df %>%
drop_na()
paste( 'after drop_na number of rows:', dim( adhd_dna )[1] )
## [1] "after drop_na number of rows: 174"
We can perform PCA easily from scratch and use the intermediate steps to learn more about the relationship of the individual question features to the ADHD total score:
We scale the data with the scale() function, then compute the covariance matrix with the cov() function.
# scale the data to facilitate PCA
adhd_df_scaled <- scale( adhd_dna )
# create a covariance matrix
adhd_df_cov <- cov( adhd_df_scaled )
Visualizing the covariance matrix:
pheatmap(adhd_df_cov, display_numbers = T, color = colorRampPalette(c('white','red'))(100), cluster_rows = F, cluster_cols = F, fontsize_number = 15)
The values off the diagonal of the matrix describe the degree of covariance between pairs of features. We see that some pairs of questions are more closely related than others. For instance, questions #5 and #13 have the highest covariance:
adhd_q5 - “How often do you fidget or squirm with your hands or feet when you have to sit down for a long time?”
adhd_q13 - “How often do you feel restless or fidgety?”
Perhaps it is no surprise that subjects who report how often they fidget when sitting down for a long time report a similar incidence of general fidgeting/squirming.
Another closely covarying pair of questions are #16 and #18
adhd_q16 - “When you’re in a conversation, how often do you find yourself finishing the sentences of the people you are talking to, before they can finish them themselves?”
adhd_q18 - “How often do you interrupt others when they are busy?”
Again, these are two very similar questions.
It is equally interesting to evaluate the questions with the lowest covariance, questions #3 and #16 (see above).
adhd_q3 - “How often do you have problems remembering appointments or obligations?”
Indeed, conversation skills and the ability to remember appointments are two (subjectively) different tasks, so it is reasonable not to see a strong relation between the two.
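These pairs can also be located programmatically from the covariance matrix; a small sketch (off_diag is an illustrative name):
# mask the diagonal, then look up the most and least covarying question pairs
off_diag <- adhd_df_cov
diag( off_diag ) <- NA
which( off_diag == max( off_diag, na.rm = TRUE ), arr.ind = TRUE ) # q5 & q13
which( off_diag == min( off_diag, na.rm = TRUE ), arr.ind = TRUE ) # q3 & q16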
Moving on, the covariance matrix can now be used for an eigendecomposition to find the principal components that describe the most variance in the data
# eigendecomposition of the covariance matrix
eig <- eigen( adhd_df_cov )
explained_variance <- eig$values
components <- eig$vectors
explained_variance
## [1] 9.3075284 1.3703108 1.0189437 0.8140854 0.7215297 0.5939121 0.5529787
## [8] 0.5013390 0.4471306 0.4144583 0.3953239 0.3693263 0.3523467 0.2795267
## [15] 0.2734344 0.2160613 0.2060649 0.1656991
Now, by storing the eigenvalues in a dataframe, we can explore the amount of variance explained by each PCA component. Furthermore, we find the proportion of explained variance by normalizing the cumulative sum of the eigenvalues: because the data were standardized, the eigenvalues sum to the number of features, so dividing by that count yields cumulative proportions.
compnum <- 1:length( explained_variance )
exv_cumsum <- cumsum( explained_variance )/length( explained_variance )
pca_res <- data.frame( compnum, explained_variance, exv_cumsum )
pca_res
## compnum explained_variance exv_cumsum
## 1 1 9.3075284 0.5170849
## 2 2 1.3703108 0.5932133
## 3 3 1.0189437 0.6498213
## 4 4 0.8140854 0.6950482
## 5 5 0.7215297 0.7351332
## 6 6 0.5939121 0.7681283
## 7 7 0.5529787 0.7988494
## 8 8 0.5013390 0.8267015
## 9 9 0.4471306 0.8515421
## 10 10 0.4144583 0.8745676
## 11 11 0.3953239 0.8965300
## 12 12 0.3693263 0.9170482
## 13 13 0.3523467 0.9366230
## 14 14 0.2795267 0.9521522
## 15 15 0.2734344 0.9673430
## 16 16 0.2160613 0.9793464
## 17 17 0.2060649 0.9907945
## 18 18 0.1656991 1.0000000
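As a quick sanity check on that normalization (a sketch): since the data were standardized, the eigenvalues should sum to the number of features, making the division by the feature count equivalent to dividing by the total variance.
# eigenvalues of a correlation matrix sum to the number of features (18 here)
sum( explained_variance )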
Visualizing the cumulative explained variance described by the principal components with a scree plot:
ggplot( pca_res, aes( x = compnum, y = exv_cumsum ) ) +
geom_line() +
ylim( c(0,1.1) ) +
scale_x_continuous(breaks = seq(0, length( explained_variance ), by = 1)) +
geom_hline( yintercept = 0.95, linetype = 'dotted', col = 'red') +
annotate("text", x = 2, y = 0.98,
label = expression( "95%" ~ sigma), vjust = -0.5) +
theme_minimal() +
xlab( 'Principal Component Number' ) +
ylab( 'Cumulative Explained Variance' ) +
ggtitle( 'Scree Plot: Explained Variance of Principal Components' )
The figure above plots the cumulative explained variance of the ADHD survey questions resulting from the eigendecomposition. We see that the first principal component accounts for 51.7% of the total variance. Adding successive components gradually increases the cumulative explained variance, and it takes as many as 14 components to describe 95% of the data's variance. Therefore, dimensionality reduction by way of PCA only moderately decreases the number of components necessary to adequately explain the data (with a threshold of 95%).
We can determine the principal components of the ADHD data more quickly using the prcomp() function from the stats library.
# use the prcomp function
adhd_dna.pca <- prcomp( adhd_dna, center = TRUE, scale = TRUE )
# print the pca summary
summary( adhd_dna.pca )
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.0508 1.17060 1.00943 0.90227 0.84943 0.7707 0.74363
## Proportion of Variance 0.5171 0.07613 0.05661 0.04523 0.04008 0.0330 0.03072
## Cumulative Proportion 0.5171 0.59321 0.64982 0.69505 0.73513 0.7681 0.79885
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.70805 0.66868 0.64378 0.62875 0.60772 0.59359 0.52870
## Proportion of Variance 0.02785 0.02484 0.02303 0.02196 0.02052 0.01957 0.01553
## Cumulative Proportion 0.82670 0.85154 0.87457 0.89653 0.91705 0.93662 0.95215
## PC15 PC16 PC17 PC18
## Standard deviation 0.52291 0.4648 0.45394 0.40706
## Proportion of Variance 0.01519 0.0120 0.01145 0.00921
## Cumulative Proportion 0.96734 0.9794 0.99079 1.00000
It is interesting to compare the Cumulative Proportion row from the prcomp() output with the feature pca_res$exv_cumsum; the results are the same for both the built-in PCA function and our PCA from scratch (a relief!).
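That agreement can also be verified programmatically; a small sketch (prcomp() stores the component standard deviations, and squaring them recovers the eigenvalues):
# cumulative proportions from prcomp() should match our manual exv_cumsum
all.equal( cumsum( adhd_dna.pca$sdev^2 ) / sum( adhd_dna.pca$sdev^2 ),
           pca_res$exv_cumsum )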
The PCA object that results from prcomp() can be used to visualize the components using the ggbiplot library:
library( ggbiplot )
ggbiplot( adhd_dna.pca )
The biplot shown above renders the data in the feature space of principal components PC1 and PC2; these are the components that explain the most variance in the data. The scatterplot points show the projection of the data onto PC1 & PC2. The red arrows represent the original feature space in which the data were represented. We can see that the original features have the largest projection onto the PC1 axis; relatively few original features have a sizeable projection onto PC2 (e.g. adhd_q16 & adhd_q17). Additionally, these vectors all have a positive component along PC1, showing that the result for each question has a positive relation to PC1. The angle between a given pair of vectors is inversely related to the correlation of the pair. For example, we see that adhd_q3 and adhd_q16 have the largest angle between them; this corresponds to our earlier observation from the covariance matrix.
The Mood Disorder Questionnaire is a self-reporting survey that can be used to identify subjects who may have bipolar disorder.
We can perform a similar PCA analysis for the Mood Disorder survey results, taking similar preprocessing steps: 1) select the mood disorder question features and drop md_total, 2) evaluate missing values…
# prepare the mood disorder data and recode all questions from factor to numeric values
md_df <- df %>%
select( starts_with("md_") ) %>%
select( -'md_total' ) %>%
mutate_if( is.factor, as.numeric )
# return a summary table of the missing data in each column
miss_var_summary( md_df )
## # A tibble: 15 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 md_q1a 0 0
## 2 md_q1b 0 0
## 3 md_q1c 0 0
## 4 md_q1d 0 0
## 5 md_q1e 0 0
## 6 md_q1f 0 0
## 7 md_q1g 0 0
## 8 md_q1h 0 0
## 9 md_q1i 0 0
## 10 md_q1j 0 0
## 11 md_q1k 0 0
## 12 md_q1l 0 0
## 13 md_q1m 0 0
## 14 md_q2 0 0
## 15 md_q3 0 0
Wonderful, there are no missing values, so we can proceed with PCA using prcomp()…
# use the prcomp function
md_df.pca <- prcomp( md_df, center = TRUE, scale = TRUE )
# print the pca summary
summary( md_df.pca )
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3857 1.3474 0.9721 0.94891 0.91563 0.82515 0.79246
## Proportion of Variance 0.3794 0.1210 0.0630 0.06003 0.05589 0.04539 0.04187
## Cumulative Proportion 0.3794 0.5004 0.5635 0.62348 0.67937 0.72476 0.76663
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.77044 0.74286 0.72814 0.70302 0.65162 0.5835 0.55685
## Proportion of Variance 0.03957 0.03679 0.03535 0.03295 0.02831 0.0227 0.02067
## Cumulative Proportion 0.80620 0.84299 0.87834 0.91128 0.93959 0.9623 0.98296
## PC15
## Standard deviation 0.50553
## Proportion of Variance 0.01704
## Cumulative Proportion 1.00000
Now to visualize the PCA biplot:
ggbiplot( md_df.pca ) + xlim( -2,2 )
A similar pattern emerges for the Mood Disorder survey responses. We see that all the projections again have a major component along PC1. Again, we can infer relationships between features from the biplot: md_q1c and md_q3 have the widest angle separating them and are therefore the least correlated features, where:
md_q1c - Has there been a period of time when (…) you felt much more self-confident than usual
md_q3 - How much of a problem did any of these cause you - being unable to work; having family, money or legal troubles; getting into arguments or fights
Responses of ‘Yes’ to md_q1c reflect positive affect with a larger numeric value. Conversely, larger numeric values for md_q3 are associated with negative outcomes. In light of the opposite score polarities, it is not surprising to see a lack of correlation between these two features in the PC1/PC2 space.
One major difference between the Mood Disorder and ADHD biplots is that all the feature vectors point in a negative direction in relation to PC1. This suggests an inverse relationship between the feature variables and PC1. Recall from the previous section that the responses to the ADHD survey were all positive in relation to PC1.
Gradient Boosting builds a prediction model from an ensemble of smaller ‘weak learner’ models. In this section we will use the gradient boosting approach to fit a binary classification model to features from the ADHD dataset to predict whether a patient has attempted suicide.
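To make the ‘ensemble of weak learners’ idea concrete, here is a toy sketch of boosting with depth-1 regression trees (squared-error loss for simplicity, whereas gbm below uses the bernoulli loss; boost_sketch and its arguments are illustrative names):
library(rpart)
# each round fits a stump to the residuals of the current ensemble,
# then adds a shrunken copy of the stump's predictions
boost_sketch <- function(x, y, n_rounds = 50, shrinkage = 0.1) {
  pred <- rep( mean(y), length(y) ) # start from the mean prediction
  for (i in seq_len(n_rounds)) {
    d <- data.frame( x = x, r = y - pred )
    stump <- rpart( r ~ x, data = d, control = rpart.control(maxdepth = 1) )
    pred <- pred + shrinkage * predict( stump, d )
  }
  pred
}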
Before fitting a Gradient Boost Model, we will collect a new subset of features for this approach. Instead of including every individual question feature from the two surveys, we will use only the total scores (adhd_total and md_total), because the total scores are composite features built from the individual question scores.
ordinal_cats <- c( 'sex', 'race', 'abuse', 'alcohol', 'thc', 'cocaine', 'stimulants', 'sedative_hypnotics', 'opioids', 'suicide',
'court_order', 'hx_of_violence', 'disorderly_conduct', 'non_subst_dx', 'subst_dx')
gboost_df <- df %>%
select( -starts_with("md_q") ) %>%
select( -starts_with("adhd_q")) %>%
select( -c('initial') ) %>%
dplyr::mutate( across( all_of( ordinal_cats ), as.numeric ) ) %>%
dplyr::mutate( suicide = suicide-1 )
paste( 'number of rows:', dim( gboost_df )[1] )
## [1] "number of rows: 175"
Evaluate the presence of missing data
miss_var_summary( gboost_df )
## # A tibble: 19 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 subst_dx 23 13.1
## 2 non_subst_dx 22 12.6
## 3 abuse 14 8
## 4 suicide 13 7.43
## 5 hx_of_violence 11 6.29
## 6 disorderly_conduct 11 6.29
## 7 education 9 5.14
## 8 court_order 5 2.86
## 9 alcohol 4 2.29
## 10 thc 4 2.29
## 11 cocaine 4 2.29
## 12 stimulants 4 2.29
## 13 sedative_hypnotics 4 2.29
## 14 opioids 4 2.29
## 15 age 0 0
## 16 sex 0 0
## 17 race 0 0
## 18 adhd_total 0 0
## 19 md_total 0 0
gg_miss_upset( gboost_df )
We see that there are approximately two dozen rows missing values for multiple features. Therefore, instead of imputing the missing values, we will simply drop these rows.
gboost_df <- gboost_df %>%
drop_na()
paste( 'number of rows:', dim( gboost_df )[1] )
## [1] "number of rows: 142"
miss_var_summary( gboost_df )
## # A tibble: 19 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 age 0 0
## 2 sex 0 0
## 3 race 0 0
## 4 adhd_total 0 0
## 5 md_total 0 0
## 6 alcohol 0 0
## 7 thc 0 0
## 8 cocaine 0 0
## 9 stimulants 0 0
## 10 sedative_hypnotics 0 0
## 11 opioids 0 0
## 12 court_order 0 0
## 13 education 0 0
## 14 hx_of_violence 0 0
## 15 disorderly_conduct 0 0
## 16 suicide 0 0
## 17 abuse 0 0
## 18 non_subst_dx 0 0
## 19 subst_dx 0 0
We will begin by splitting the data into train and test sets
set.seed(101)
indexes = createDataPartition(gboost_df$suicide, p = .75, list = F)
train = gboost_df[indexes, ]
test = gboost_df[-indexes, ]
Now, we use the train split to train a generalized boosted model using the gbm() function with a bernoulli distribution, because our target, suicide, has only two values.
# Train model with preprocessing & repeated cv
gbm.model = gbm(suicide~.,
data=train,
shrinkage=0.01,
distribution = 'bernoulli',
cv.folds=5,
n.trees=3000,
verbose=F)
gbm.model
## gbm(formula = suicide ~ ., distribution = "bernoulli", data = train,
## n.trees = 3000, shrinkage = 0.01, cv.folds = 5, verbose = F)
## A gradient boosted model with bernoulli loss function.
## 3000 iterations were performed.
## The best cross-validation iteration was 346.
## There were 18 predictors of which 14 had non-zero influence.
Calling summary() on the gbm object returns the relative influence of each feature variable, which is a measure of each variable's importance in training the model.
summary( gbm.model )
## var rel.inf
## adhd_total adhd_total 20.646957
## age age 16.149402
## md_total md_total 14.252044
## abuse abuse 11.876281
## subst_dx subst_dx 7.075128
## cocaine cocaine 5.693245
## alcohol alcohol 5.083782
## education education 4.123351
## thc thc 3.765223
## disorderly_conduct disorderly_conduct 2.836531
## race race 2.733989
## opioids opioids 2.229160
## hx_of_violence hx_of_violence 1.203982
## sex sex 1.169502
## non_subst_dx non_subst_dx 1.161424
## stimulants stimulants 0.000000
## sedative_hypnotics sedative_hypnotics 0.000000
## court_order court_order 0.000000
We see that adhd_total, age, md_total, abuse and subst_dx have the highest influence on training the gradient boost model.
From this result, we take the iteration with the best performance:
best.iter = gbm.perf(gbm.model, method = 'cv' )
best.iter
## [1] 346
The tree structure determined from the best performing iteration is then used to train a new model with these settings using the caret library.
set.seed(123)
fitControl = trainControl(method="cv", number=5, returnResamp = "all")
train$suicide <- factor( train$suicide )
model_bestiter = train(suicide~.,
data=train,
method="gbm",
distribution="bernoulli",
trControl=fitControl,
verbose=F,
tuneGrid=data.frame(.n.trees=best.iter,
.shrinkage=0.01,
.interaction.depth=1,
.n.minobsinnode=1))
Best Iteration Model Summary:
model_bestiter
## Stochastic Gradient Boosting
##
## 107 samples
## 18 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 85, 86, 86, 86, 85
## Resampling results:
##
## Accuracy Kappa
## 0.7190476 0.2141068
##
## Tuning parameter 'n.trees' was held constant at a value of 346
## Tuning
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 1
One way to evaluate the performance of the model is by looking at the confusion matrix. This gives us an idea of how well the model performs at correctly classifying the two possible outcomes.
confusionMatrix( model_bestiter )
## Cross-Validated (5 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 0 1
## 0 63.6 22.4
## 1 5.6 8.4
##
## Accuracy (average) : 0.7196
Another way of evaluating the performance of our model is with a lift chart:
test$suicide <- factor( test$suicide )
testres = predict( model_bestiter, test, type = 'prob')
testres$obs = test$suicide
testres$pred = predict( model_bestiter, test )
multiClassSummary(testres, lev = levels(testres$obs))
## logLoss AUC prAUC Accuracy
## 0.6142656 0.6413043 0.5919714 0.7142857
## Kappa F1 Sensitivity Specificity
## 0.2081448 0.8214286 1.0000000 0.1666667
## Pos_Pred_Value Neg_Pred_Value Precision Recall
## 0.6969697 1.0000000 0.6969697 1.0000000
## Detection_Rate Balanced_Accuracy
## 0.6571429 0.5833333
evalResults <- data.frame(Class = test$suicide)
evalResults$GBM <- predict( model_bestiter, test, type = "prob")[,'0']
trellis.par.set(caretTheme())
liftData <- caret::lift(Class ~ GBM, data = evalResults)
plot(liftData, values = 60, auto.key = list(columns = 1,
lines = TRUE,
points = FALSE))
From this lift chart, we can see that to find 60% of the hits, we would need to sample roughly 55% of the data. The shaded region bounds the range of performance for models of the data: the diagonal (the longest side of the triangle) represents chance performance, whereas the other two sides of the shaded triangle bound a classifier that finds all the targets for a given sample size (a perfect classifier). We can see that our Gradient Boost Model performs better than chance, but not by much.
Perhaps an SVM classifier can perform better…
In this section we’ll start training a model using the linear kernel, with all the available variables and use that as a benchmark for any other SVM models we may train. The idea is to minimize test error and maximize interpretability. In the real world, being able to interpret the model will help one explain what makes the model work, but in some cases understanding how the model works takes a back seat to pure predictive power. This model won’t be used to make any life altering predictions, so we’ll discuss the results along the way.
We'll create new training and test sets in order to preserve as many rows as possible. The raw data will have the initial and psych_meds fields omitted, and any remaining observations with missing values will be dropped. We'll also set the tune length to 8, an arbitrary choice: high enough to tune through a number of variations, but not so high that training takes excessive processing power and time.
All models below will be trained using 10-fold cross validation, with all variables centered and scaled before training. A 75/25 split leaves us with 106 training and 35 test observations. Note that the bag impute method fills in decimals where a variable may refer to a question with discrete answers. This may not be ideal, but it is acceptable because each question follows a similar scale from none/low to more/high in increasing order.
Finally, the last thing to note is that the dataset we’re working with has about 30% of observations flagged for suicide.
df5 <- df_raw2 %>%
select(-initial, -psych_meds) %>%
na.omit
set.seed(101)
trainIndex <- createDataPartition(df5$suicide,
p = 0.75,
list = F)
train <- df5[trainIndex,]
test <- df5[-trainIndex,]
ctrl <- trainControl(method = 'cv', number = 10)
tune_length <- 8
library(GGally)
# excluding individual questions for pairs plot
df6 <- df5 %>%
select(age, sex, race, adhd_total, md_total) %>%
bind_cols(df5[,39:52])
ggpairs(df6,
mapping=ggplot2::aes(colour = suicide, alpha = 0.3))
The first step we took was to generate a pairs plot to look for any obvious separations. The idea is to visually identify any variable combinations that clearly separate people who attempted suicide from those who did not. Unfortunately, this method is limited to two dimensions, since higher dimensions are difficult to visualize, and looking at the chart above, there aren't any obvious separations we can make.
In this section, we'll train 3 SVM models (linear, polynomial, radial) with all available variables. Since support vector machines are sensitive to the scale of the inputs, each model below centers and scales the predictors before training.
svm_linear <- train(suicide ~ .,
data = train,
method = 'svmLinear',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_linear <- predict(svm_linear, test)
svm_linear
## Support Vector Machines with Linear Kernel
##
## 106 samples
## 51 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (132), scaled (132)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 96, 95, 96, 96, 94, 95, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6210606 0.09399862
##
## Tuning parameter 'C' was held constant at a value of 1
The results from the first SVM linear model show about 62% accuracy on the training data, which is not a great start. For a classification model where 30% of observations are suicides, we can reach 70% accuracy just by guessing that no one attempts suicide. The model doesn't seem to be predicting all negatives, but still isn't very accurate. We'll calculate the predictions on the test data and compare the test error after all the SVM models are built.
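For reference, that majority-class baseline can be computed directly:
# training-set accuracy from always predicting 'No' (the majority class)
max( prop.table( table( train$suicide ) ) )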
Next, let’s train another SVM model with the same parameters as the linear model, only changing the method to use a polynomial to see if we can reach higher levels of accuracy. By keeping all the conditions the same except for the method, we can see which method fits better and is a more appropriate choice.
svm_poly <- train(suicide ~ .,
data = train,
method = 'svmPoly',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_poly <- predict(svm_poly, test)
x <- svm_poly$bestTune %>%
row.names()
svm_poly$results[x,]
## degree scale C Accuracy Kappa AccuracySD KappaSD
## 138 3 0.01 0.5 0.700303 0.1350472 0.1387149 0.3868994
Looking at the results of the best model above, the tuned polynomial method produced a higher level of accuracy, around 70%. Although the results were slightly better than the linear model's, we can't make the case that this model is better before comparing the relative test errors.
The next method we’ll try is the radial method. This will allow for a circular boundary rather than a straight or curved line.
svm_radial <- train(suicide ~ .,
data = train,
method = 'svmRadial',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_radial <- predict(svm_radial, test)
x <- svm_radial$bestTune %>%
row.names()
svm_radial$results[x,]
## sigma C Accuracy Kappa AccuracySD KappaSD
## 1 0.004455678 0.25 0.689697 0 0.02572091 0
Looking at the results above, the radial method produced a comparable model with about 69% accuracy. This model performed better than the linear model and worse than the polynomial method, with all 3 models within a similar range.
If one of the methods had performed significantly better than the others, we could move forward with just that method. The next step we'll take is to reconsider the variables included and then run the models again. If we were short on time and resources, it would make sense to continue with just the best-performing polynomial method, but since we're experimenting and exploring here, we'll run all three methods again and see what we get.
Also note that while writing up this report, each model was run multiple times and the results were not consistent between runs. It's possible that the accuracy levels and results shown may vary when producing the final report.
When building models, collinearity and noise can present hurdles to creating a production quality model. Maybe one day we’ll have computers that are fast enough to train every single combination of variables and pick the best one, but for now we’re going to consider all of the variables and decide which ones to move forward with.
It would make sense for us to try excluding all of the individual ADHD and mood disorder questions, since the totals for each respective state of mind allow us to get a higher-level view of these variables. Below, we'll create a copy of the training dataset excluding the individual questions, then retrain the three types of SVM models and evaluate the training results.
Finally, we’ll compare the test results for each of the 6 models.
train2 <- train %>%
select(age, sex, race, adhd_total, md_total) %>%
bind_cols(train[,39:52])
We created another dataframe object here to make things easier downstream. Instead of specifically selecting all of the variables, this object, train2, is a copy of train without the individual question variables.
svm_linear2 <- train(suicide ~ .,
data = train2,
method = 'svmLinear',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_linear2 <- predict(svm_linear2, test)
svm_linear2
## Support Vector Machines with Linear Kernel
##
## 106 samples
## 18 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (43), scaled (43)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 96, 94, 96, 95, 96, 94, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6689394 0.1842511
##
## Tuning parameter 'C' was held constant at a value of 1
Looking at the results of the 2nd linear SVM model, the training accuracy is slightly higher than the first linear model's, but still worse than predicting all 'no suicide attempt'. Based on these results alone, it doesn't look like we've gained much predictive power, even though we have improved the overall interpretability.
svm_poly2 <- train(suicide ~ .,
data = train2,
method = 'svmPoly',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_poly2 <- predict(svm_poly2, test)
x <- svm_poly2$bestTune %>%
row.names()
svm_poly2$results[x,]
## degree scale C Accuracy Kappa AccuracySD KappaSD
## 8 1 0.001 32 0.7093939 0.213993 0.08807536 0.2971142
The 2nd polynomial SVM model has similar levels of accuracy to all earlier models. It is comparable to the 1st polynomial SVM, giving additional evidence that omitting the individual ADHD and mood disorder questions mainly adds value by creating relatively easier-to-interpret models.
svm_radial2 <- train(suicide ~ .,
data = train2,
method = 'svmRadial',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_radial2 <- predict(svm_radial2, test)
x <- svm_radial2$bestTune %>%
row.names()
svm_radial2$results[x,]
## sigma C Accuracy Kappa AccuracySD KappaSD
## 1 0.004455678 0.25 0.689697 0 0.02572091 0
Again, the second radial model, using fewer variables, has no obvious advantage in training accuracy. We will need to check the test error of every model to decide which one would be best for future predictions.
data.frame(rbind(
postResample(test$svm_linear, test$suicide),
postResample(test$svm_poly, test$suicide),
postResample(test$svm_radial, test$suicide),
postResample(test$svm_linear2, test$suicide),
postResample(test$svm_poly2, test$suicide),
postResample(test$svm_radial2, test$suicide)
),
row.names = c('SVM linear, all variables',
'SVM poly, all variables',
'SVM radial, all variables',
'SVM linear, limited',
'SVM poly, limited',
'SVM radial, limited'
)
)
## Accuracy Kappa
## SVM linear, all variables 0.6285714 0.2649435
## SVM poly, all variables 0.7142857 0.3371212
## SVM radial, all variables 0.6857143 0.0000000
## SVM linear, limited 0.6000000 0.1551724
## SVM poly, limited 0.7142857 0.3027888
## SVM radial, limited 0.6857143 0.0000000
Looking at the accuracy of all 6 models on the test data, the SVM polynomial model trained with all variables shows the highest accuracy. With the best models reaching approximately 71% accuracy, we're at about the same level as always predicting that no one attempts suicide. If we had to decide which model to use going forward, we would pick the SVM polynomial with ~71% test accuracy; however, we should note that there weren't many observations altogether, and when training the models above, the accuracy of each method varied around the same range when trained multiple times.
### This section was inspired by the pairs plot... it was difficult to read that plot, so the 2 dimensional scatter plots were generated individually
var_count <- ncol(train2)
# pair plot, one at a time
c <- 1
for (i in 1:(var_count - 1)) {
for (j in (i+1):var_count) {
if (i == 16 | j == 16) {
} else {
print(str_c(c, ": ", i, ', ', j))
temp_df <- train2[,c(i,j,16)]
temp_names <- names(temp_df)
names(temp_df) <- c('x', 'y', 'suicide')
print(temp_names[1:2])
p <- ggplot(temp_df, aes(x = x, y = y, color = suicide)) +
geom_jitter(alpha = 0.7) +
labs(x = temp_names[1],
y = temp_names[2])
print(p)
readline(prompt="Press [enter] to continue")
c <- c + 1
}
}
}
# pair plot, one variable vs the others as facets
for (i in 1:var_count) {
if (i != 16) {
temp_df <- train2[i:var_count]
if (i > 16) {
temp_df <- temp_df %>%
bind_cols(train2[16])
}
temp_ncol <- ncol(temp_df)
fixed_var <- names(temp_df)[1]
names(temp_df)[1] <- 'x'
p <- temp_df %>%
gather(var, val, -x, -suicide) %>%
ggplot(aes(x = x, y = val, color = suicide)) +
geom_jitter(alpha = 0.7) +
facet_wrap(~var, scales = 'free') +
labs(x = fixed_var)
print(p)
readline(prompt="Press [enter] to continue")
}
}
Cluster 2 is less likely to experience abuse and suicide, and more likely to have a history of violence and be charged with disorderly conduct.