We’ll be working with a mental health dataset and will be conducting exploratory data analysis, unsupervised clustering, principal component analysis, gradient boosting, and support vector machines.
To begin, the following code loads the required libraries and imports the data:
library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(VIM)
library(corrplot)
library(purrr)
library(scales)
library(caret)
library(Hmisc)
library(naniar)
library(conflicted)
library(pheatmap)
library(factoextra)
library(gbm)
library(gridExtra)
library(MLmetrics)
# resolve function name conflict
conflict_prefer('filter', 'dplyr')
conflict_prefer('summarize', 'dplyr')
# import data
url <- 'https://raw.githubusercontent.com/SmilodonCub/Data622_group5_projects/main/ADHD_data.csv'
df <- read.csv(url, header=T, na.strings="")
To facilitate ease of use, we'll rename the columns. Additionally, we'll convert each of the number-coded fields to factors with the proper labels.
# convert column names to lowercase
names(df) <- tolower(names(df))
# replace periods with underscore
names(df) <- str_replace_all(names(df), '\\.', '_')
# rename last column to remove trailing underscore
names(df)[ncol(df)] <- 'psych_meds'
# raw data with renamed columns
df_raw <- df
names(df_raw)
## [1] "initial" "age" "sex"
## [4] "race" "adhd_q1" "adhd_q2"
## [7] "adhd_q3" "adhd_q4" "adhd_q5"
## [10] "adhd_q6" "adhd_q7" "adhd_q8"
## [13] "adhd_q9" "adhd_q10" "adhd_q11"
## [16] "adhd_q12" "adhd_q13" "adhd_q14"
## [19] "adhd_q15" "adhd_q16" "adhd_q17"
## [22] "adhd_q18" "adhd_total" "md_q1a"
## [25] "md_q1b" "md_q1c" "md_q1d"
## [28] "md_q1e" "md_q1f" "md_q1g"
## [31] "md_q1h" "md_q1i" "md_q1j"
## [34] "md_q1k" "md_q1l" "md_q1m"
## [37] "md_q2" "md_q3" "md_total"
## [40] "alcohol" "thc" "cocaine"
## [43] "stimulants" "sedative_hypnotics" "opioids"
## [46] "court_order" "education" "hx_of_violence"
## [49] "disorderly_conduct" "suicide" "abuse"
## [52] "non_subst_dx" "subst_dx" "psych_meds"
# Sex
df$sex <- factor(df$sex, levels = c(1,2), labels = c('Male','Female'))
# Race
df$race <- factor(df$race, levels = c(1,2,3,4,5,6), labels = c('White','African American','Hispanic','Asian','Native American','Other or Missing Data'))
# ADHD q1 - q18
adhd_cols <- names(df[,5:22])
df[adhd_cols] <- lapply(df[adhd_cols], factor, levels = c(0,1,2,3,4), labels = c('Never','Rarely','Sometimes','Often','Very Often'))
# Mood Disorder q1a - q2
md_cols <- names(df[,24:37])
df[md_cols] <- lapply(df[md_cols], factor, levels = c(0,1), labels = c('No','Yes'))
# Mood Disorder q3
df$md_q3 <- factor(df$md_q3, levels = c(0,1,2,3), labels = c('No Problem','Minor','Moderate','Serious'))
# Substance Abuse
sa_cols <- names(df[,40:45])
df[sa_cols] <- lapply(df[sa_cols], factor, levels = c(0,1,2,3), labels = c('No Use','Use','Abuse','Dependence'))
# Court Order
df$court_order <- factor(df$court_order, levels = c(0,1), labels = c('No','Yes'))
# Education
# think it might be okay to leave this as a number
# History of Violence, Disorderly Conduct, Suicide Attempt
hist_cols <- names(df[,48:50])
df[hist_cols] <- lapply(df[hist_cols], factor, levels = c(0,1), labels = c('No','Yes'))
# Abuse History
df$abuse <- factor(df$abuse, levels = c(0,1,2,3,4,5,6,7),
labels = c('No','Physical','Sexual','Emotional','Physical & Sexual','Physical & Emotional','Sexual & Emotional','Physical, Sexual, & Emotional'))
# Non-Substance Related Drugs
df$non_subst_dx <- factor(df$non_subst_dx, levels = c(0,1,2), labels = c('None','One','More than one'))
# Substance Related Drugs
df$subst_dx <- factor(df$subst_dx, levels = c(0,1,2,3), labels = c('None','One','Two','Three or more'))
# Psychiatric Meds
df$psych_meds <- factor(df$psych_meds, levels = c(0,1,2), labels = c('None','One','More than one'))
# copy df
df_raw2 <- df
# str(df)
The following code quantitatively and visually explores the nature of the dataset.
We begin by describing the dataset features.
Use dplyr’s glimpse() function to take a quick look at the data structure:
# quick look at what the data structure looks like
glimpse(df)
## Rows: 175
## Columns: 54
## $ initial <chr> "JA", "LA", "MD", "RD", "RB", "SB", "PK", "RJ", "DJ…
## $ age <int> 24, 48, 51, 43, 34, 39, 41, 48, 44, 27, 44, 56, 53,…
## $ sex <fct> Male, Female, Female, Male, Male, Female, Female, M…
## $ race <fct> White, White, White, White, White, White, White, Wh…
## $ adhd_q1 <fct> Rarely, Often, Sometimes, Often, Very Often, Someti…
## $ adhd_q2 <fct> Rarely, Often, Rarely, Often, Very Often, Often, So…
## $ adhd_q3 <fct> Very Often, Very Often, Sometimes, Sometimes, Somet…
## $ adhd_q4 <fct> Sometimes, Very Often, Rarely, Sometimes, Very Ofte…
## $ adhd_q5 <fct> Often, NA, Often, Very Often, Very Often, Often, Ve…
## $ adhd_q6 <fct> Rarely, Sometimes, Often, Often, Sometimes, Sometim…
## $ adhd_q7 <fct> Rarely, Sometimes, Often, Sometimes, Often, Often, …
## $ adhd_q8 <fct> Often, Often, Sometimes, Very Often, Very Often, Ve…
## $ adhd_q9 <fct> Sometimes, Sometimes, Never, Very Often, Very Often…
## $ adhd_q10 <fct> Very Often, Very Often, Rarely, Sometimes, Sometime…
## $ adhd_q11 <fct> Sometimes, Rarely, Sometimes, Often, Very Often, Ve…
## $ adhd_q12 <fct> Very Often, Very Often, Never, Rarely, Rarely, Some…
## $ adhd_q13 <fct> Rarely, Sometimes, Sometimes, Often, Often, Very Of…
## $ adhd_q14 <fct> Never, Very Often, Sometimes, Often, Sometimes, Ver…
## $ adhd_q15 <fct> Often, Very Often, Often, Rarely, Rarely, Often, Ve…
## $ adhd_q16 <fct> Rarely, Often, Sometimes, Sometimes, Sometimes, Ver…
## $ adhd_q17 <fct> Often, Rarely, Rarely, Rarely, Rarely, Often, Somet…
## $ adhd_q18 <fct> Very Often, Very Often, Rarely, Sometimes, Rarely, …
## $ adhd_total <int> 40, 55, 31, 45, 48, 55, 54, 41, 56, 56, 42, 38, 31,…
## $ md_q1a <fct> Yes, Yes, No, Yes, No, No, Yes, No, Yes, Yes, Yes, …
## $ md_q1b <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes…
## $ md_q1c <fct> Yes, Yes, No, No, No, No, No, No, No, No, Yes, No, …
## $ md_q1d <fct> Yes, Yes, No, No, Yes, Yes, No, No, Yes, No, Yes, N…
## $ md_q1e <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, Yes, Yes, Yes,…
## $ md_q1f <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Ye…
## $ md_q1g <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Ye…
## $ md_q1h <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No…
## $ md_q1i <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No…
## $ md_q1j <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, No, No, …
## $ md_q1k <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, Yes, No,…
## $ md_q1l <fct> No, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ md_q1m <fct> Yes, No, No, Yes, No, No, No, No, Yes, Yes, Yes, No…
## $ md_q2 <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
## $ md_q3 <fct> Serious, Serious, Moderate, Serious, Moderate, Seri…
## $ md_total <int> 15, 14, 5, 13, 7, 14, 9, 7, 12, 11, 16, 0, 11, 10, …
## $ alcohol <fct> Use, No Use, No Use, Use, Use, Use, Dependence, No …
## $ thc <fct> Use, No Use, No Use, Use, Use, No Use, Dependence, …
## $ cocaine <fct> Use, No Use, No Use, Use, No Use, No Use, Use, No U…
## $ stimulants <fct> No Use, No Use, No Use, Use, No Use, No Use, Use, N…
## $ sedative_hypnotics <fct> No Use, No Use, No Use, No Use, No Use, No Use, Use…
## $ opioids <fct> No Use, No Use, No Use, No Use, No Use, No Use, No …
## $ court_order <fct> Yes, No, No, No, Yes, No, No, No, No, No, No, No, N…
## $ education <int> 11, 14, 12, 12, 9, 11, 12, 16, 12, 9, 12, 18, 12, 1…
## $ hx_of_violence <fct> No, No, No, No, Yes, No, Yes, Yes, Yes, No, Yes, No…
## $ disorderly_conduct <fct> Yes, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ suicide <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, No, No, Yes, N…
## $ abuse <fct> "No", "Physical & Sexual", "Sexual & Emotional", "P…
## $ non_subst_dx <fct> More than one, One, More than one, More than one, M…
## $ subst_dx <fct> None, None, None, None, None, None, None, One, None…
## $ psych_meds <fct> More than one, One, One, More than one, None, None,…
From this output, we can summarize each dataset feature as follows:
| Column Numbers | Column Labels | Description |
|---|---|---|
| 1 | initial | (string) subject's initials |
| 2 | age | (numeric) integer values (years) |
| 3 | sex | (categorical) binary ('male' and 'female') |
| 4 | race | (categorical) |
| 5-22 | adhd_q1-adhd_q18 | (categorical) ordinal values |
| 23 | adhd_total | (numeric) summary feature derived from adhd_q1-adhd_q18 |
| 24-38 | md_q1a-md_q3 | (categorical) ordinal values |
| 39 | md_total | (numeric) summary feature derived from md_q1a-md_q3 |
| 40 | alcohol | (categorical) ordinal values |
| 41 | thc | (categorical) ordinal values |
| 42 | cocaine | (categorical) ordinal values |
| 43 | stimulants | (categorical) ordinal values |
| 44 | sedative_hypnotics | (categorical) ordinal values |
| 45 | opioids | (categorical) ordinal values |
| 46 | court_order | (categorical) binary (yes/no) |
| 47 | education | (numeric) integer values (years) |
| 48 | hx_of_violence | (categorical) binary (yes/no) |
| 49 | disorderly_conduct | (categorical) binary (yes/no) |
| 50 | suicide | (categorical) binary (yes/no) |
| 51 | abuse | (categorical) ordinal values |
| 52 | non_subst_dx | (categorical) ordinal values |
| 53 | subst_dx | (categorical) ordinal values |
| 54 | psych_meds | (categorical) ordinal values |
The columns adhd_q1-adhd_q18 and md_q1a-md_q3 are summarized by the derived columns adhd_total and md_total, respectively. The adhd features correspond to an ADHD self-report survey, whereas the md features give responses to a mood disorder self-report survey. For the initial Exploratory Data Analysis, the individual question responses will be set aside in favor of visualizations and summary statistics on the derived columns adhd_total and md_total. A detailed analysis of the individual survey questions will be taken up in the Principal Components Analysis section.
Next, we visualize the extent of the missing values using the naniar library.
Use naniar’s miss_var_summary() and vis_miss() functions to summarize and visualize the missing values in the features of the dataset:
# return a summary table of the missing data in each column
miss_var_summary(df)
## # A tibble: 54 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 psych_meds 118 67.4
## 2 subst_dx 23 13.1
## 3 non_subst_dx 22 12.6
## 4 abuse 14 8
## 5 suicide 13 7.43
## 6 hx_of_violence 11 6.29
## 7 disorderly_conduct 11 6.29
## 8 education 9 5.14
## 9 court_order 5 2.86
## 10 alcohol 4 2.29
## # … with 44 more rows
# visualize the amount of missing data for each feature
vis_miss( df, cluster = TRUE )
The figure above shows a grouped view of the missing values in each feature column. Overall, 2.7% of the values are missing from the dataset. The majority of the features have no missing values, and many have relatively few. However, the psych_meds feature is missing a considerable portion of its data, so we will drop it below.
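To quantify the overall missingness directly, naniar's pct_miss() returns the percentage of missing cells across the whole dataframe:
# overall percentage of missing cells; should agree with the ~2.7% noted above
pct_miss( df )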
df <- df %>%
select( -c(psych_meds) )
Having dropped psych_meds, we can now use the gg_miss_upset() function to look for any remaining patterns of missing values in the data.
gg_miss_upset( df )
Both the vis_miss() and gg_miss_upset() visualizations are informative because they reveal interesting patterns in the missing values. For instance, once we remove psych_meds, most of the remaining incomplete rows hold multiple missing values from a subset of the features. In other words, the remaining missing values are not randomly dispersed across the dataset; rather, they are concentrated in a subset of the features: disorderly_conduct, suicide, abuse, non_subst_dx and subst_dx. Notably, the responses that correspond to the ADHD and Mood Disorder self-reporting surveys have no missing values; this knowledge will be helpful downstream. Further handling of missing values will be tailored to each analysis section below.
To gain a feel for the data, we will evaluate visualizations of select features.
Summary Statistics of Demographic Variables
# summary of each demographic feature
demo_df <- df %>%
select(c('age','sex','race','education'))
describe( demo_df )
## demo_df
##
## 4 Variables 175 Observations
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 175 0 42 0.999 39.47 12.8 22.0 24.0
## .25 .50 .75 .90 .95
## 29.5 42.0 48.0 53.0 56.0
##
## lowest : 18 19 20 21 22, highest: 55 56 57 61 69
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 175 0 2
##
## Value Male Female
## Frequency 99 76
## Proportion 0.566 0.434
## --------------------------------------------------------------------------------
## race
## n missing distinct
## 175 0 4
##
## Value White African American Hispanic
## Frequency 72 100 1
## Proportion 0.411 0.571 0.006
##
## Value Other or Missing Data
## Frequency 2
## Proportion 0.011
## --------------------------------------------------------------------------------
## education
## n missing distinct Info Mean Gmd .05 .10
## 166 9 14 0.929 11.9 2.265 8.25 9.00
## .25 .50 .75 .90 .95
## 11.00 12.00 13.00 14.00 16.00
##
## lowest : 6 7 8 9 10, highest: 15 16 17 18 19
##
## Value 6 7 8 9 10 11 12 13 14 15 16
## Frequency 2 2 5 12 12 23 67 15 14 1 7
## Proportion 0.012 0.012 0.030 0.072 0.072 0.139 0.404 0.090 0.084 0.006 0.042
##
## Value 17 18 19
## Frequency 2 3 1
## Proportion 0.012 0.018 0.006
## --------------------------------------------------------------------------------
Visualizing the demographic information of the mental health study
p1 <- ggplot( df, aes(x = age) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Age Density Plot' )
num_obs <- dim(df)[1]
p2 <- df %>%
dplyr::count( sex ) %>%
dplyr::mutate( n = n/num_obs ) %>%
ggplot( aes(x=sex, y=n)) +
geom_bar(stat = "identity") +
ylim( 0, 0.6 ) +
ylab( 'proportion' ) +
ggtitle( 'Distribution of Sex (binary)' ) +
theme_classic()
p3 <- df %>%
dplyr::count( race ) %>%
dplyr::mutate( n = n/num_obs ) %>%
ggplot( aes(x=race, y=n)) +
geom_bar(stat = "identity") +
ylab( 'proportion' ) +
ylim( 0, 0.6 ) +
ggtitle( 'Distribution of Race' ) +
theme_classic() +
scale_x_discrete(guide = guide_axis(n.dodge = 2))
p4 <- ggplot( df, aes(x = education) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Education Density Plot' )
grid.arrange(p1, p4, p2, p3, ncol=2)
These visualizations characterize the demographic makeup of the mental health study participants: the distributions of age, education, sex, and race.
Summary Statistics for self-report surveys
# summary of each field
initial_eda_df <- df %>%
select( c('adhd_total', 'md_total' ) )
describe( initial_eda_df )
## initial_eda_df
##
## 2 Variables 175 Observations
## --------------------------------------------------------------------------------
## adhd_total
## n missing distinct Info Mean Gmd .05 .10
## 175 0 62 0.999 34.32 19.16 7.0 12.0
## .25 .50 .75 .90 .95
## 21.0 33.0 47.5 55.0 62.3
##
## lowest : 0 1 3 5 6, highest: 65 67 69 71 72
## --------------------------------------------------------------------------------
## md_total
## n missing distinct Info Mean Gmd .05 .10
## 175 0 18 0.995 10.02 5.469 0.7 3.0
## .25 .50 .75 .90 .95
## 6.5 11.0 14.0 16.0 17.0
##
## lowest : 0 1 2 3 4, highest: 13 14 15 16 17
##
## Value 0 1 2 3 4 5 6 7 8 9 10
## Frequency 9 3 5 6 4 7 10 6 8 12 13
## Proportion 0.051 0.017 0.029 0.034 0.023 0.040 0.057 0.034 0.046 0.069 0.074
##
## Value 11 12 13 14 15 16 17
## Frequency 18 12 13 12 14 12 11
## Proportion 0.103 0.069 0.074 0.069 0.080 0.069 0.063
## --------------------------------------------------------------------------------
Visualizing the adhd_total and md_total features. These two features give a summary score for the questions on an ADHD Self-Reporting Survey and a separate Mood Disorder Questionnaire.
p1 <- ggplot( df, aes(x = adhd_total) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'ADHD Self-Reporting Survey Scores' )
p2 <- ggplot( df, aes(x = md_total) ) +
geom_density( fill="#9D85FC", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Mood Disorder Self-Reporting Survey Scores' )
grid.arrange( p1, p2, ncol=2 )
adhd_total has a roughly normal distribution with a mean of 34.3 and a range of 0 to 72. md_total has an entirely different range, from 0 to 17; its distribution is more left-skewed than that of adhd_total.
Next, we visualize adhd_total ~ md_total to evaluate the relationship between the two total scores.
ggplot(df, aes(x=md_total, y=adhd_total)) +
geom_point()+
geom_smooth(method=lm) +
theme_classic() +
ggtitle('ADHD total ~ Mood Disorder total')
We can evaluate the fit of a linear model to the data via the summary report and diagnostic plots:
fit = lm( adhd_total ~ md_total, df )
summary( fit )
##
## Call:
## lm(formula = adhd_total ~ md_total, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.718 -8.672 -1.941 9.163 38.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.5104 2.5173 6.559 0.000000000603804 ***
## md_total 1.7769 0.2265 7.844 0.000000000000432 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.38 on 173 degrees of freedom
## Multiple R-squared: 0.2623, Adjusted R-squared: 0.2581
## F-statistic: 61.53 on 1 and 173 DF, p-value: 0.0000000000004319
par(mfrow = c(2, 2))
p1 <- plot(fit, which=1)
p2 <- plot(fit, which=2)
p3 <- plot(fit, which=3)
p4 <- plot(fit, which=4)
par(mfrow = c(1, 1))
The fit is statistically significant and the diagnostics suggest that the assumptions of linearity are reasonably met. Therefore, we can say with confidence that for each one-point increase in Mood Disorder score, the average ADHD score increases by roughly 1.78 points.
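To back up that interpretation, base R's confint() gives a 95% confidence interval for the slope:
# 95% confidence interval for the md_total coefficient of the fitted model
confint( fit, 'md_total', level = 0.95 )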
Summary Statistics for Substance Misuse Features
# summary of each field
sub_df <- df %>%
select( c( 'alcohol', 'thc', 'cocaine', 'stimulants', 'sedative_hypnotics', 'opioids', 'subst_dx' ) )
describe( sub_df )
## sub_df
##
## 7 Variables 175 Observations
## --------------------------------------------------------------------------------
## alcohol
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 80 18 7 66
## Proportion 0.468 0.105 0.041 0.386
## --------------------------------------------------------------------------------
## thc
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 116 12 3 40
## Proportion 0.678 0.070 0.018 0.234
## --------------------------------------------------------------------------------
## cocaine
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 101 9 5 56
## Proportion 0.591 0.053 0.029 0.327
## --------------------------------------------------------------------------------
## stimulants
## n missing distinct
## 171 4 3
##
## Value No Use Use Dependence
## Frequency 160 6 5
## Proportion 0.936 0.035 0.029
## --------------------------------------------------------------------------------
## sedative_hypnotics
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 161 4 1 5
## Proportion 0.942 0.023 0.006 0.029
## --------------------------------------------------------------------------------
## opioids
## n missing distinct
## 171 4 3
##
## Value No Use Use Dependence
## Frequency 146 4 21
## Proportion 0.854 0.023 0.123
## --------------------------------------------------------------------------------
## subst_dx
## n missing distinct
## 152 23 4
##
## Value None One Two Three or more
## Frequency 42 61 35 14
## Proportion 0.276 0.401 0.230 0.092
## --------------------------------------------------------------------------------
The individual substance features (e.g. opioids) describe the severity with which a participant uses or does not use a particular substance. On the other hand, subst_dx counts how many substances a participant has reported using, irrespective of the severity of use. The following figure attempts to find patterns between the individual substances and the summary feature.
none_count <- sum(sub_df$subst_dx == 'None')
one_count <- sum(sub_df$subst_dx == 'One')
two_count <- sum(sub_df$subst_dx == 'Two')
more_count <- sum(sub_df$subst_dx == 'Three or more')
total_count <- dim(sub_df)[1]
plot_df <- sub_df %>%
drop_na() %>%
#mutate_if(is.factor, as.numeric) %>%
gather(var, value, -subst_dx) %>%
group_by(var, value, subst_dx) %>%
dplyr::summarize(count = n(),
.groups = 'drop') %>%
dplyr::mutate(prop = count / case_when(subst_dx == 'None' ~ none_count,
subst_dx == 'One' ~ one_count,
subst_dx == 'Two' ~ two_count,
subst_dx == 'Three or more' ~ more_count))
plot_df$value <- factor(plot_df$value,levels = c("No Use", "Use", "Dependence", "Abuse"))
ggplot(plot_df, aes(x = value, y = count, fill = subst_dx)) +
geom_col(position="stack") +#, position = 'dodge') +
facet_wrap(~var, scales = 'free') +
theme_bw() +
labs(y = 'count',
x = element_blank(),
title = 'Distributions For Substances') +
theme(axis.text.x = element_text(angle = 35, hjust = 1))
From the figure above, we see that for each individual substance, the majority of participants report 'No Use'. However, the instances of 'Dependence' are elevated for several substances, notably alcohol, cocaine and THC. For these substances, a substantial proportion of participants also reported a substance abuse diagnosis (subst_dx) of three or more substances. There is another interesting result in this figure: there are many instances where a level of substance use was self-reported by a participant who also reported 'None' for substance abuse diagnosis. It is curious that a participant can, for example, report a dependence on alcohol yet report 'None' for a substance abuse diagnosis. Perhaps more documentation on the dataset can shed light on this.
To re-represent this data, we will generate a new summary feature that sums the reported severity of use for each substance included in the self-report:
sub_df <- sub_df %>%
mutate_if(is.factor, as.numeric) %>%
dplyr::mutate( subst_sum = alcohol + thc + cocaine + stimulants + sedative_hypnotics + opioids )
We can derive a similar summary variable for the non-substance abuse behaviors reported (e.g. disorderly conduct, court order, etc.). However, we exclude abuse because it has different, non-binary reporting criteria.
abuse_df <- df %>%
select( c( 'suicide', 'disorderly_conduct', 'hx_of_violence', 'court_order' ) ) %>%
mutate_if(is.factor, as.numeric) %>%
dplyr::mutate( abuse_sum = suicide + disorderly_conduct + hx_of_violence + court_order )
Visualize the distribution of the new summary variables:
p1 <- ggplot( sub_df, aes(x = subst_sum) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Substance Use Severity Sum' )
p2 <- ggplot( abuse_df, aes(x = abuse_sum) ) +
geom_density( fill="#9D85FC", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Behavioral History Sum' )
grid.arrange( p1, p2, ncol=2 )
For one last exploratory visualization, we will look at correlations between the self-reporting survey summary features (adhd_total & md_total) and the new summary features generated here. A positive correlation between the two survey totals is expected, as demonstrated by the linear regression earlier.
df_feats <- df %>%
select( c('adhd_total','md_total') ) %>%
dplyr::mutate( subst_sum = as.integer( sub_df$subst_sum ),
abuse_sum = as.integer( abuse_df$abuse_sum ) ) %>%
drop_na()
corrplot(cor(df_feats), method="number")
From the correlation plot above, we do not see a strong relationship between the self-reported survey total scores and the new features we generated to summarize substance use and non-substance behaviors. However, in upcoming sections we will use machine learning techniques to investigate this relationship further.
In order to find clusters of patients, we first need to determine the number of clusters. Below, we'll use two common methods, evaluating the within-cluster sum of squares and the average silhouette width, to decide how many clusters to use. Since clustering requires data with no null values, we'll impute with the bagImpute method from caret, using the raw data from before each field was converted to factors.
# exclude names
df3 <- df_raw %>%
select(-initial)
# impute null
preproc <- preProcess(df3, 'bagImpute')
df4 <- predict(preproc, df3)
fviz_nbclust(df4, kmeans, method = "wss")
Based on the sum of squares for each value of k, we can use 2 clusters as the 'elbow' point. This gives us a good starting point for determining k, but we should note that in unsupervised learning there is no single definitive value of k. For the purposes of this homework assignment, we'll choose one value of k and proceed with the analysis, but in real-life situations it could be helpful to try multiple values of k and analyze the results for anything interesting.
fviz_nbclust(df4, kmeans, method = "silhouette")Based on the average silhouette width of clusters, the optimal number of clusters shows k = 2.
Since both methods above have a consensus on two clusters, we’ll proceed with calculating the clusters using k = 2 and then plot the distributions to observe and note any relative differences.
Once the clusters are calculated, let’s check the size of each cluster to make sure we’re looking at different groups with enough data points.
# 25 random starts
cluster_k2 <- kmeans(df4, 2, nstart = 25)
# add cluster number to dataframe
df4$cluster_k2 <- cluster_k2$cluster
# check cluster sizes
table(df4$cluster_k2)
##
## 1 2
## 88 87
Using k = 2, we created an almost even split of observations. Cluster 1 has 88 observations and cluster 2 has 87.
Note that when this file is re-run, the cluster numbers may switch, or different clusters may form altogether. We used the parameter nstart = 25, which increases the chance that we'll end up with the same clusters. It's possible that the algorithm ends up with a very close vote, like 12 to 13 or 11 to 14, which could lead to different clusters and not just switched cluster labels.
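To make the stability caveat concrete, here is a minimal sketch (not part of the original analysis; feats, run1 and run2 are illustrative names) that re-runs k-means from two different seeds and cross-tabulates the memberships:
# drop the stored labels, then re-run k-means under two different seeds
feats <- subset( df4, select = -cluster_k2 )
set.seed(1); run1 <- kmeans( feats, 2, nstart = 25 )$cluster
set.seed(2); run2 <- kmeans( feats, 2, nstart = 25 )$cluster
# mass off the diagonal beyond a pure label swap indicates unstable clusters
table( run1, run2 )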
# adhd vs suicide
fviz_cluster(cluster_k2, data = df4, c('adhd_total', 'suicide'))
The clusters show a clear delineation of respondents based on adhd_total. This suggests that the clusters are separated by levels of ADHD. Note that the variables have not been scaled. Since adhd_total has a much wider variance than each of the individual ADHD question variables, which only range from 0 to 4, it's likely that different clusters would form if all the data were scaled.
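As a hedged check of that scaling caveat, one could re-cluster on standardized features and compare memberships; a sketch (df4_scaled and cluster_k2_scaled are illustrative names):
# standardize the features so no single column dominates the distance metric
df4_scaled <- scale( subset( df4, select = -cluster_k2 ) )
cluster_k2_scaled <- kmeans( df4_scaled, 2, nstart = 25 )
# cross-tabulate scaled vs. unscaled cluster memberships
table( df4$cluster_k2, cluster_k2_scaled$cluster )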
# mood disorder vs suicide
fviz_cluster(cluster_k2, data = df4, c('md_total', 'suicide'))
The results of the k = 2 clustering show considerable overlap on md_total, meaning the clusters are not separated by mood disorder levels. Note that md_total, like adhd_total, is not scaled. Because the clusters overlap here, we can infer that the clusters weren't formed simply because of adhd_total's wide variance, but because the two groups differ in other ways as well.
Let’s examine relative distributions for each cluster and variable. We decided to use probability density charts because we’re most interested in relative differences in proportion of the clusters.
# function to plot relative variable distributions for clusters
cluster_plot <- function(start_col, end_col, plot = 'density', legend_position = 'bottom') {
temp_df <- df4[,start_col:end_col] %>%
bind_cols(cluster = factor(df4$cluster_k2)) %>%
gather(var, val, -cluster)
if (plot == 'density') {
p <- ggplot(temp_df, aes(x = val, fill = cluster)) +
geom_density(alpha = 0.3)
} else if (plot == 'histogram') {
p <- ggplot(temp_df, aes(x = val, fill = cluster)) +
geom_histogram(alpha = 0.3)
}
p +
facet_wrap(~var, scales = 'free') +
theme_bw() +
labs(fill = 'Cluster',
x = element_blank(),
y = element_blank()
) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(size = 8),
legend.position = legend_position)
}
cluster_plot(1,3)
cluster_plot(4,22)
With clear differences for each question as well as the total, cluster 1 reported higher ADHD values relative to cluster 2.
cluster_plot(23,38)
Some questions have similar distributions, but overall cluster 1 is more likely to report higher levels of mood disorder symptoms.
cluster_plot(39,44)
The two clusters show different drug use patterns depending on the type of drug.
cluster_plot(51,53)
Cluster 2 has a higher chance of taking no non-substance-related drugs and is more likely to take one substance-related drug.
In this section we will explore the ADHD dataset further using Principal Component Analysis (PCA). PCA is a dimensionality reduction method that can be very useful for detecting patterns in complex, feature-rich data. At its core, PCA is a fairly simple linear algebra manipulation: an eigendecomposition. PCA maps the data onto a new set of components that consist of orthogonal axes (eigenvectors) and magnitudes (eigenvalues). Each successive component is added such that it describes as much of the remaining variance in the data as possible. From the resulting components we can select a subset that describes most of the variance in a new, lower-dimensional feature space, which is more convenient for observing relationships previously masked by the data's complexity.
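As a minimal sketch of that mapping (illustrative only; it uses the built-in mtcars data rather than the ADHD data):
# project centered/scaled data onto the eigenvectors of its covariance matrix
X <- scale( mtcars[, c('mpg', 'hp', 'wt')] )
eig <- eigen( cov( X ) ) # eigendecomposition
scores <- X %*% eig$vectors # the data mapped into principal-component space
head( scores, 3 )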
To approach this analysis, we derive 2 sets of features from the original data. Data features that correspond to 1) self-reported answers for an ADHD survey, 2) self-reported answers for a Mood Disorder survey.
The Adult ADHD Self-Reporting Scale Symptoms Checklist is a survey consisting of 18 questions that all range on an ordinal categorical scale: ‘Never’, ‘Rarely’, ‘Sometimes’, ‘Often’, ‘Very Often’
To prepare the ADHD survey responses, we dropped adhd_total, which gives a summary score for the survey, and recoded all questions from factor to numeric values.
# preparing the ADHD self-report questionnaire data
adhd_df <- df %>%
select( starts_with("adhd_") ) %>%
select( -'adhd_total' )
# recode all questions from factor to numeric values
adhd_df <- adhd_df %>%
mutate_at(
vars(matches("adhd_q") ),
~ recode_factor( ., 'Never'=0,'Rarely'=1,'Sometimes'=2,'Often'=3,'Very Often'=4 ) ) %>%
mutate_if( is.factor, as.numeric )
paste( 'initial number of rows:', dim( adhd_df )[1] )
## [1] "initial number of rows: 175"
Summary of missing responses in the ADHD Self-Reporting Survey:
# return a summary table of the missing data in each column
miss_var_summary( adhd_df )
## # A tibble: 18 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 adhd_q5 1 0.571
## 2 adhd_q1 0 0
## 3 adhd_q2 0 0
## 4 adhd_q3 0 0
## 5 adhd_q4 0 0
## 6 adhd_q6 0 0
## 7 adhd_q7 0 0
## 8 adhd_q8 0 0
## 9 adhd_q9 0 0
## 10 adhd_q10 0 0
## 11 adhd_q11 0 0
## 12 adhd_q12 0 0
## 13 adhd_q13 0 0
## 14 adhd_q14 0 0
## 15 adhd_q15 0 0
## 16 adhd_q16 0 0
## 17 adhd_q17 0 0
## 18 adhd_q18 0 0
We see that there is only 1 missing value, so it seems reasonable to simply drop that row.
# remove missing
adhd_dna <- adhd_df %>%
drop_na()
paste( 'after drop_na number of rows:', dim( adhd_dna )[1] )
## [1] "after drop_na number of rows: 174"
We can perform PCA easily from scratch and use the intermediate steps to learn more about the relationship of the individual question features to the ADHD total score:
We scale the data with the scale() function, then compute the covariance matrix with the cov() function.
# scale the data to facilitate PCA
adhd_df_scaled <- scale( adhd_dna )
# create a covariance matrix
adhd_df_cov <- cov( adhd_df_scaled )
Visualizing the covariance matrix:
pheatmap(adhd_df_cov, display_numbers = T, color = colorRampPalette(c('white','red'))(100), cluster_rows = F, cluster_cols = F, fontsize_number = 15)
The values off the diagonal of the matrix describe the degree of covariance between pairs of features. We see that some pairs of questions are more closely related than others. For instance, questions #5 and #13 have the highest covariance:
adhd_q5 - “How often do you fidget or squirm with your hands or feet when you have to sit down for a long time?”
adhd_q13 - “How often do you feel restless or fidgety?”
Perhaps it is no surprise that subjects who report how often they fidget when sitting down for a long time report a similar incidence of general fidgeting/squirming.
Another closely covarying pair of questions are #16 and #18
adhd_q16 - “When you’re in a conversation, how often do you find yourself finishing the sentences of the people you are talking to, before they can finish them themselves?”
adhd_q18 - “How often do you interrupt others when they are busy?”
Again, these are two very similar questions.
It is equally interesting to evaluate the questions with the lowest covariance, questions #3 and #16 (see above).
adhd_q3 - “How often do you have problems remembering appointments or obligations?”
Indeed, conversation skills and the ability to remember appointments are two (subjectively) different tasks, so it is reasonable not to see a strong relation between the two.
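These pairs can also be located programmatically from the covariance matrix; a small sketch (off_diag is an illustrative name):
# mask the diagonal, then look up the most and least covarying question pairs
off_diag <- adhd_df_cov
diag( off_diag ) <- NA
which( off_diag == max( off_diag, na.rm = TRUE ), arr.ind = TRUE ) # q5 & q13
which( off_diag == min( off_diag, na.rm = TRUE ), arr.ind = TRUE ) # q3 & q16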
Moving on, the covariance matrix can now be used for an eigendecomposition to find the principal components that describe the most variance in the data
# eigendecomposition of the covariance matrix
eig <- eigen( adhd_df_cov )
explained_variance <- eig$values
components <- eig$vectors
explained_variance
## [1] 9.3075284 1.3703108 1.0189437 0.8140854 0.7215297 0.5939121 0.5529787
## [8] 0.5013390 0.4471306 0.4144583 0.3953239 0.3693263 0.3523467 0.2795267
## [15] 0.2734344 0.2160613 0.2060649 0.1656991
Now, by storing the eigenvalues in a dataframe, we can explore the amount of variance explained by each PCA component. Furthermore, we find the proportion of explained variance by normalizing the cumulative sum of the eigenvalues: because the data were standardized, the eigenvalues sum to the number of features, so dividing by that count yields cumulative proportions.
compnum <- 1:length( explained_variance )
exv_cumsum <- cumsum( explained_variance )/length( explained_variance )
pca_res <- data.frame( compnum, explained_variance, exv_cumsum )
pca_res
## compnum explained_variance exv_cumsum
## 1 1 9.3075284 0.5170849
## 2 2 1.3703108 0.5932133
## 3 3 1.0189437 0.6498213
## 4 4 0.8140854 0.6950482
## 5 5 0.7215297 0.7351332
## 6 6 0.5939121 0.7681283
## 7 7 0.5529787 0.7988494
## 8 8 0.5013390 0.8267015
## 9 9 0.4471306 0.8515421
## 10 10 0.4144583 0.8745676
## 11 11 0.3953239 0.8965300
## 12 12 0.3693263 0.9170482
## 13 13 0.3523467 0.9366230
## 14 14 0.2795267 0.9521522
## 15 15 0.2734344 0.9673430
## 16 16 0.2160613 0.9793464
## 17 17 0.2060649 0.9907945
## 18 18 0.1656991 1.0000000
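As a quick sanity check on that normalization (a sketch): since the data were standardized, the eigenvalues should sum to the number of features, making the division by the feature count equivalent to dividing by the total variance.
# eigenvalues of a correlation matrix sum to the number of features (18 here)
sum( explained_variance )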
Visualizing the cumulative explained variance described by the principal components with a scree plot:
ggplot( pca_res, aes( x = compnum, y = exv_cumsum ) ) +
geom_line() +
ylim( c(0,1.1) ) +
scale_x_continuous(breaks = seq(0, length( explained_variance ), by = 1)) +
geom_hline( yintercept = 0.95, linetype = 'dotted', col = 'red') +
annotate("text", x = 2, y = 0.98,
label = expression( "95%" ~ sigma), vjust = -0.5) +
theme_minimal() +
xlab( 'Principal Component Number' ) +
ylab( 'Cumulative Explained Variance' ) +
ggtitle( 'Scree Plot: Explained Variance of Principal Components' )
The figure above plots the cumulative explained variance of the ADHD survey questions resulting from the eigendecomposition. We see that the first principal component accounts for 51.7% of the total variance. Adding successive components gradually increases the cumulative explained variance, and it takes as many as 14 components to describe 95% of the data's variance. Therefore, dimensionality reduction by way of PCA only moderately decreases the number of components necessary to adequately explain the data (with a threshold of 95%).
We can determine the principal components of the ADHD data more quickly using the prcomp() function from the stats library.
# use the prcomp function
adhd_dna.pca <- prcomp( adhd_dna, center = TRUE, scale = TRUE )
# print the pca summary
summary( adhd_dna.pca )
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.0508 1.17060 1.00943 0.90227 0.84943 0.7707 0.74363
## Proportion of Variance 0.5171 0.07613 0.05661 0.04523 0.04008 0.0330 0.03072
## Cumulative Proportion 0.5171 0.59321 0.64982 0.69505 0.73513 0.7681 0.79885
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.70805 0.66868 0.64378 0.62875 0.60772 0.59359 0.52870
## Proportion of Variance 0.02785 0.02484 0.02303 0.02196 0.02052 0.01957 0.01553
## Cumulative Proportion 0.82670 0.85154 0.87457 0.89653 0.91705 0.93662 0.95215
## PC15 PC16 PC17 PC18
## Standard deviation 0.52291 0.4648 0.45394 0.40706
## Proportion of Variance 0.01519 0.0120 0.01145 0.00921
## Cumulative Proportion 0.96734 0.9794 0.99079 1.00000
It is interesting to compare the Cumulative Proportion row from the prcomp() output with the feature pca_res$exv_cumsum; the results are the same for both the built-in PCA function and our PCA from scratch (a relief!).
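That agreement can also be verified programmatically; a small sketch (prcomp() stores the component standard deviations, and squaring them recovers the eigenvalues):
# cumulative proportions from prcomp() should match our manual exv_cumsum
all.equal( cumsum( adhd_dna.pca$sdev^2 ) / sum( adhd_dna.pca$sdev^2 ),
           pca_res$exv_cumsum )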
The PCA object that results from prcomp() can be used to visualize the components using the ggbiplot library:
library( ggbiplot )
ggbiplot( adhd_dna.pca )
The biplot shown above renders the data in the feature space of principal components PC1 and PC2; these are the components that explain the most variance in the data. The scatterplot points show the projection of the data onto PC1 & PC2. The red arrows represent the original feature space in which the data were represented. We can see that the original features have the largest projection onto the PC1 axis; relatively few original features have a sizeable projection onto PC2 (e.g. adhd_q16 & adhd_q17). Additionally, these vectors all have a positive component along PC1, showing that the result for each question has a positive relation to PC1. The angle between a given pair of vectors is inversely related to the correlation of the pair. For example, we see that adhd_q3 and adhd_q16 have the largest angle between them; this corresponds to our earlier observation from the covariance matrix.
The Mood Disorder Questionnaire is a self-reporting survey that can be used to identify subjects who may have bipolar disorder.
We can perform a similar PCA analysis for the Mood Disorder survey results, taking similar preprocessing steps: 1) select the mood disorder question features and drop md_total, 2) evaluate missing values…
# prepare the mood disorder data and recode all questions from factor to numeric values
md_df <- df %>%
select( starts_with("md_") ) %>%
select( -'md_total' ) %>%
mutate_if( is.factor, as.numeric )
# return a summary table of the missing data in each column
miss_var_summary( md_df )
## # A tibble: 15 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 md_q1a 0 0
## 2 md_q1b 0 0
## 3 md_q1c 0 0
## 4 md_q1d 0 0
## 5 md_q1e 0 0
## 6 md_q1f 0 0
## 7 md_q1g 0 0
## 8 md_q1h 0 0
## 9 md_q1i 0 0
## 10 md_q1j 0 0
## 11 md_q1k 0 0
## 12 md_q1l 0 0
## 13 md_q1m 0 0
## 14 md_q2 0 0
## 15 md_q3 0 0
Wonderful, there are no missing values, so we can proceed with PCA using prcomp()…
# use the prcomp function
md_df.pca <- prcomp( md_df, center = TRUE, scale = TRUE )
# print the pca summary
summary( md_df.pca )
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3857 1.3474 0.9721 0.94891 0.91563 0.82515 0.79246
## Proportion of Variance 0.3794 0.1210 0.0630 0.06003 0.05589 0.04539 0.04187
## Cumulative Proportion 0.3794 0.5004 0.5635 0.62348 0.67937 0.72476 0.76663
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.77044 0.74286 0.72814 0.70302 0.65162 0.5835 0.55685
## Proportion of Variance 0.03957 0.03679 0.03535 0.03295 0.02831 0.0227 0.02067
## Cumulative Proportion 0.80620 0.84299 0.87834 0.91128 0.93959 0.9623 0.98296
## PC15
## Standard deviation 0.50553
## Proportion of Variance 0.01704
## Cumulative Proportion 1.00000
Now to visualize the PCA biplot:
ggbiplot( md_df.pca ) + xlim( -2,2 )
A similar pattern emerges for the Mood Disorder survey responses. We see that all the projections again have a major component along PC1. Again, we can infer relationships between features from the biplot: md_q1c and md_q3 have the widest angle separating them and are therefore the least correlated features, where:
md_q1c - Has there been a period of time when (…) you felt much more self-confident than usual
md_q3 - How much of a problem did any of these cause you - being unable to work; having family, money or legal troubles; getting into arguments or fights
Responses of ‘Yes’ to md_q1c reflect positive affect with a larger numeric value. Conversely, larger numeric values for md_q3 are associated with negative outcomes. In light of the opposite score polarities, it is not surprising to see a lack of correlation between these two features in the PC1/PC2 space.
One major difference between the Mood Disorder and ADHD biplots is that all the feature vectors point in a negative direction in relation to PC1. This suggests an inverse relationship between the feature variables and PC1. Recall from the previous section that the responses to the ADHD survey were all positive in relation to PC1.
Gradient Boosting builds a prediction model from an ensemble of smaller ‘weak learner’ models. In this section we will use the gradient boosting approach to fit a binary classification model to features from the ADHD dataset to predict whether a patient has attempted suicide.
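To make the ‘ensemble of weak learners’ idea concrete, here is a toy sketch of boosting with depth-1 regression trees (squared-error loss for simplicity, whereas gbm below uses the bernoulli loss; boost_sketch and its arguments are illustrative names):
library(rpart)
# each round fits a stump to the residuals of the current ensemble,
# then adds a shrunken copy of the stump's predictions
boost_sketch <- function(x, y, n_rounds = 50, shrinkage = 0.1) {
  pred <- rep( mean(y), length(y) ) # start from the mean prediction
  for (i in seq_len(n_rounds)) {
    d <- data.frame( x = x, r = y - pred )
    stump <- rpart( r ~ x, data = d, control = rpart.control(maxdepth = 1) )
    pred <- pred + shrinkage * predict( stump, d )
  }
  pred
}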
Before fitting a Gradient Boost Model, we will collect a new subset of features for this approach. Instead of including every individual question feature from the two surveys, we will use only the total scores (adhd_total and md_total), because the total scores are composite features built from the individual question scores.
ordinal_cats <- c( 'sex', 'race', 'abuse', 'alcohol', 'thc', 'cocaine', 'stimulants', 'sedative_hypnotics', 'opioids', 'suicide',
'court_order', 'hx_of_violence', 'disorderly_conduct', 'non_subst_dx', 'subst_dx')
gboost_df <- df %>%
select( -starts_with("md_q") ) %>%
select( -starts_with("adhd_q")) %>%
select( -c('initial') ) %>%
dplyr::mutate( across( all_of( ordinal_cats ), as.numeric ) ) %>%
dplyr::mutate( suicide = suicide-1 )
paste( 'number of rows:', dim( gboost_df )[1] )
## [1] "number of rows: 175"
Evaluate the presence of missing data
miss_var_summary( gboost_df )
## # A tibble: 19 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 subst_dx 23 13.1
## 2 non_subst_dx 22 12.6
## 3 abuse 14 8
## 4 suicide 13 7.43
## 5 hx_of_violence 11 6.29
## 6 disorderly_conduct 11 6.29
## 7 education 9 5.14
## 8 court_order 5 2.86
## 9 alcohol 4 2.29
## 10 thc 4 2.29
## 11 cocaine 4 2.29
## 12 stimulants 4 2.29
## 13 sedative_hypnotics 4 2.29
## 14 opioids 4 2.29
## 15 age 0 0
## 16 sex 0 0
## 17 race 0 0
## 18 adhd_total 0 0
## 19 md_total 0 0
gg_miss_upset( gboost_df )
We see that there are approximately two dozen rows missing values for multiple features. Therefore, instead of imputing the missing values, we will simply drop these rows.
gboost_df <- gboost_df %>%
drop_na()
paste( 'number of rows:', dim( gboost_df )[1] )
## [1] "number of rows: 142"
miss_var_summary( gboost_df )
## # A tibble: 19 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 age 0 0
## 2 sex 0 0
## 3 race 0 0
## 4 adhd_total 0 0
## 5 md_total 0 0
## 6 alcohol 0 0
## 7 thc 0 0
## 8 cocaine 0 0
## 9 stimulants 0 0
## 10 sedative_hypnotics 0 0
## 11 opioids 0 0
## 12 court_order 0 0
## 13 education 0 0
## 14 hx_of_violence 0 0
## 15 disorderly_conduct 0 0
## 16 suicide 0 0
## 17 abuse 0 0
## 18 non_subst_dx 0 0
## 19 subst_dx 0 0
We will begin by splitting the data into train and test sets
set.seed(101)
indexes = createDataPartition(gboost_df$suicide, p = .75, list = F)
train = gboost_df[indexes, ]
test = gboost_df[-indexes, ]
Now, we use the train split to train a generalized boosted model using the gbm() function with a bernoulli distribution, because our target, suicide, has only two values.
# Train model with preprocessing & repeated cv
gbm.model = gbm(suicide~.,
data=train,
shrinkage=0.01,
distribution = 'bernoulli',
cv.folds=5,
n.trees=3000,
verbose=F)
gbm.model
## gbm(formula = suicide ~ ., distribution = "bernoulli", data = train,
## n.trees = 3000, shrinkage = 0.01, cv.folds = 5, verbose = F)
## A gradient boosted model with bernoulli loss function.
## 3000 iterations were performed.
## The best cross-validation iteration was 346.
## There were 18 predictors of which 14 had non-zero influence.
Calling summary() on the gbm object returns the relative influence of each feature variable, which is a measure of each variable's importance in training the model.
summary( gbm.model )
## var rel.inf
## adhd_total adhd_total 20.646957
## age age 16.149402
## md_total md_total 14.252044
## abuse abuse 11.876281
## subst_dx subst_dx 7.075128
## cocaine cocaine 5.693245
## alcohol alcohol 5.083782
## education education 4.123351
## thc thc 3.765223
## disorderly_conduct disorderly_conduct 2.836531
## race race 2.733989
## opioids opioids 2.229160
## hx_of_violence hx_of_violence 1.203982
## sex sex 1.169502
## non_subst_dx non_subst_dx 1.161424
## stimulants stimulants 0.000000
## sedative_hypnotics sedative_hypnotics 0.000000
## court_order court_order 0.000000
We see that adhd_total, age, md_total, abuse and subst_dx have the highest influence on training the gradient boost model.
From this result, we take the iteration with the best performance:
best.iter = gbm.perf(gbm.model, method = 'cv' )
best.iter
## [1] 346
The tree structure determined from the best performing iteration is then used to train a new model with these settings using the caret library.
set.seed(123)
fitControl = trainControl(method="cv", number=5, returnResamp = "all")
train$suicide <- factor( train$suicide )
model_bestiter = train(suicide~.,
data=train,
method="gbm",
distribution="bernoulli",
trControl=fitControl,
verbose=F,
tuneGrid=data.frame(.n.trees=best.iter,
.shrinkage=0.01,
.interaction.depth=1,
.n.minobsinnode=1))
Best Iteration Model Summary:
model_bestiter
## Stochastic Gradient Boosting
##
## 107 samples
## 18 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 85, 86, 86, 86, 85
## Resampling results:
##
## Accuracy Kappa
## 0.7190476 0.2141068
##
## Tuning parameter 'n.trees' was held constant at a value of 346
## Tuning
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 1
One way to evaluate the performance of the model is by looking at the confusion matrix. This gives us an idea of how well the model performs at correctly classifying the two possible outcomes.
confusionMatrix( model_bestiter )
## Cross-Validated (5 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 0 1
## 0 63.6 22.4
## 1 5.6 8.4
##
## Accuracy (average) : 0.7196
Another way of evaluating the performance of our model is with a lift chart:
test$suicide <- factor( test$suicide )
testres = predict( model_bestiter, test, type = 'prob')
testres$obs = test$suicide
testres$pred = predict( model_bestiter, test )
multiClassSummary(testres, lev = levels(testres$obs))
## logLoss AUC prAUC Accuracy
## 0.6142656 0.6413043 0.5919714 0.7142857
## Kappa F1 Sensitivity Specificity
## 0.2081448 0.8214286 1.0000000 0.1666667
## Pos_Pred_Value Neg_Pred_Value Precision Recall
## 0.6969697 1.0000000 0.6969697 1.0000000
## Detection_Rate Balanced_Accuracy
## 0.6571429 0.5833333
evalResults <- data.frame(Class = test$suicide)
evalResults$GBM <- predict( model_bestiter, test, type = "prob")[,'0']
trellis.par.set(caretTheme())
liftData <- caret::lift(Class ~ GBM, data = evalResults)
plot(liftData, values = 60, auto.key = list(columns = 1,
lines = TRUE,
points = FALSE))
From this lift chart, we can see that to find 60% of the hits, we would need to sample roughly 55% of the data. The shaded region bounds the range of performance for models of the data: the diagonal (the longest side of the triangle) represents chance performance, whereas the other two sides of the shaded triangle bound a classifier that finds all the targets for a given sample size (a perfect classifier). We can see that our Gradient Boost Model performs better than chance, but not by much.
Perhaps an SVM classifier can perform better…
In this section we’ll start training a model using the linear kernel, with all the available variables and use that as a benchmark for any other SVM models we may train. The idea is to minimize test error and maximize interpretability. In the real world, being able to interpret the model will help one explain what makes the model work, but in some cases understanding how the model works takes a back seat to pure predictive power. This model won’t be used to make any life altering predictions, so we’ll discuss the results along the way.
We'll create new training and test sets in order to preserve as many rows as possible. The raw data will have the initial and psych_meds fields omitted, and any remaining observations with missing values will be dropped. We'll also set the tune length to 8, an arbitrary choice: high enough to tune through a number of variations, but not so high that training takes excessive processing power and time.
All models below will be trained using 10-fold cross validation, with all variables centered and scaled before training. A 75/25 split leaves us with 106 training and 35 test observations. Note that the bag impute method fills in decimals where a variable may refer to a question with discrete answers. This may not be ideal, but it is acceptable because each question follows a similar scale from none/low to more/high in increasing order.
Finally, the last thing to note is that the dataset we’re working with has about 30% of observations flagged for suicide.
df5 <- df_raw2 %>%
select(-initial, -psych_meds) %>%
na.omit
set.seed(101)
trainIndex <- createDataPartition(df5$suicide,
p = 0.75,
list = F)
train <- df5[trainIndex,]
test <- df5[-trainIndex,]
ctrl <- trainControl(method = 'cv', number = 10)
tune_length <- 8
library(GGally)
# excluding individual questions for pairs plot
df6 <- df5 %>%
select(age, sex, race, adhd_total, md_total) %>%
bind_cols(df5[,39:52])
ggpairs(df6,
mapping=ggplot2::aes(colour = suicide, alpha = 0.3))
The first step we took was to generate a pairs plot to look for any obvious separations. The idea is to visually identify any variable combinations that clearly separate people who attempted suicide from those who did not. Unfortunately, this method is limited to two dimensions, since higher dimensions are difficult to visualize, and looking at the chart above, there aren't any obvious separations we can make.
In this section, we'll train 3 SVM models (linear, polynomial, radial) with all available variables. Since support vector machines are sensitive to the scale of the inputs, each model below centers and scales the predictors before training.
svm_linear <- train(suicide ~ .,
data = train,
method = 'svmLinear',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_linear <- predict(svm_linear, test)
svm_linear
## Support Vector Machines with Linear Kernel
##
## 106 samples
## 51 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (132), scaled (132)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 96, 95, 96, 96, 94, 95, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6210606 0.09399862
##
## Tuning parameter 'C' was held constant at a value of 1
The results from the first SVM linear model show about 62% accuracy on the training data, which is not a great start. For a classification model where 30% of observations are suicides, we can reach 70% accuracy just by guessing that no one attempts suicide. The model doesn't seem to be predicting all negatives, but still isn't very accurate. We'll calculate the predictions on the test data and compare the test error after all the SVM models are built.
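For reference, that majority-class baseline can be computed directly:
# training-set accuracy from always predicting 'No' (the majority class)
max( prop.table( table( train$suicide ) ) )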
Next, let’s train another SVM model with the same parameters as the linear model, only changing the method to use a polynomial to see if we can reach higher levels of accuracy. By keeping all the conditions the same except for the method, we can see which method fits better and is a more appropriate choice.
svm_poly <- train(suicide ~ .,
data = train,
method = 'svmPoly',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_poly <- predict(svm_poly, test)
x <- svm_poly$bestTune %>%
row.names()
svm_poly$results[x,]
## degree scale C Accuracy Kappa AccuracySD KappaSD
## 138 3 0.01 0.5 0.700303 0.1350472 0.1387149 0.3868994
Looking at the results of the best model above, the tuned polynomial method produced a higher level of accuracy, around 70%. Although the results were slightly better than the linear model's, we can't make the case that this model is better before comparing the relative test errors.
The next method we’ll try is the radial method. This will allow for a circular boundary rather than a straight or curved line.
svm_radial <- train(suicide ~ .,
data = train,
method = 'svmRadial',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_radial <- predict(svm_radial, test)
x <- svm_radial$bestTune %>%
row.names()
svm_radial$results[x,]
## sigma C Accuracy Kappa AccuracySD KappaSD
## 1 0.004455678 0.25 0.689697 0 0.02572091 0
Looking at the results above, the radial method produced a comparable model with about 69% accuracy. This model performed better than the linear model and worse than the polynomial method, with all 3 models within a similar range.
If one of the methods had performed significantly better than the others, we could move forward with just that method. The next step we'll take is to reconsider the variables included and then run the models again. If we were short on time and resources, it would make sense to continue with just the best-performing polynomial method, but since we're experimenting and exploring here, we'll run all three methods again and see what we get.
Also note that while writing up this report, each model was run multiple times and the results were not consistent between runs. It's possible that the accuracy levels and results shown may vary when producing the final report.
When building models, collinearity and noise can present hurdles to creating a production quality model. Maybe one day we’ll have computers that are fast enough to train every single combination of variables and pick the best one, but for now we’re going to consider all of the variables and decide which ones to move forward with.
It would make sense for us to try excluding all of the individual ADHD and mood disorder questions, since the totals for each respective state of mind allow us to get a higher-level view of these variables. Below, we'll create a copy of the training dataset excluding the individual questions, then retrain the three types of SVM models and evaluate the training results.
Finally, we’ll compare the test results for each of the 6 models.
train2 <- train %>%
select(age, sex, race, adhd_total, md_total) %>%
bind_cols(train[,39:52])
We created another dataframe object here to make things easier downstream. Instead of specifically selecting all of the variables, this object, train2, is a copy of train without the individual question variables.
svm_linear2 <- train(suicide ~ .,
data = train2,
method = 'svmLinear',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_linear2 <- predict(svm_linear2, test)
svm_linear2
## Support Vector Machines with Linear Kernel
##
## 106 samples
## 18 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (43), scaled (43)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 96, 94, 96, 95, 96, 94, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6689394 0.1842511
##
## Tuning parameter 'C' was held constant at a value of 1
Looking at the results of the 2nd linear SVM model, the training accuracy is slightly higher than the first linear model's, but still worse than predicting all 'no suicide attempt'. Based on these results alone, it doesn't look like we've gained much predictive power, even though we have improved the overall interpretability.
svm_poly2 <- train(suicide ~ .,
data = train2,
method = 'svmPoly',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_poly2 <- predict(svm_poly2, test)
x <- svm_poly2$bestTune %>%
row.names()
svm_poly2$results[x,]
## degree scale C Accuracy Kappa AccuracySD KappaSD
## 8 1 0.001 32 0.7093939 0.213993 0.08807536 0.2971142
The 2nd polynomial SVM model has similar levels of accuracy to all earlier models. It is comparable to the 1st polynomial SVM, giving additional evidence that omitting the individual ADHD and mood disorder questions mainly adds value by creating relatively easier-to-interpret models.
svm_radial2 <- train(suicide ~ .,
data = train2,
method = 'svmRadial',
preProc = c('center', 'scale'),
tuneLength = tune_length,
trControl = ctrl
)
test$svm_radial2 <- predict(svm_radial2, test)
x <- svm_radial2$bestTune %>%
row.names()
svm_radial2$results[x,]
## sigma C Accuracy Kappa AccuracySD KappaSD
## 1 0.004455678 0.25 0.689697 0 0.02572091 0
Again, the second radial model, using fewer variables, has no obvious advantage in training accuracy. We will need to check the test error of every model to decide which one would be best for future predictions.
data.frame(rbind(
postResample(test$svm_linear, test$suicide),
postResample(test$svm_poly, test$suicide),
postResample(test$svm_radial, test$suicide),
postResample(test$svm_linear2, test$suicide),
postResample(test$svm_poly2, test$suicide),
postResample(test$svm_radial2, test$suicide)
),
row.names = c('SVM linear, all variables',
'SVM poly, all variables',
'SVM radial, all variables',
'SVM linear, limited',
'SVM poly, limited',
'SVM radial, limited'
)
)
## Accuracy Kappa
## SVM linear, all variables 0.6285714 0.2649435
## SVM poly, all variables 0.7142857 0.3371212
## SVM radial, all variables 0.6857143 0.0000000
## SVM linear, limited 0.6000000 0.1551724
## SVM poly, limited 0.7142857 0.3027888
## SVM radial, limited 0.6857143 0.0000000
Looking at the accuracy of all 6 models on the test data, the SVM polynomial model trained with all variables shows the highest accuracy. With the best models reaching approximately 71% accuracy, we're at about the same level as always predicting that no one attempts suicide. If we had to decide which model to use going forward, we would pick the SVM polynomial with ~71% test accuracy; however, we should note that there weren't many observations altogether, and when training the models above, the accuracy of each method varied around the same range when trained multiple times.
### This section was inspired by the pairs plot... it was difficult to read that plot, so the 2 dimensional scatter plots were generated individually
var_count <- ncol(train2)
# pair plot, one at a time
c <- 1
for (i in 1:(var_count - 1)) {
for (j in (i+1):var_count) {
if (i == 16 | j == 16) {
} else {
print(str_c(c, ": ", i, ', ', j))
temp_df <- train2[,c(i,j,16)]
temp_names <- names(temp_df)
names(temp_df) <- c('x', 'y', 'suicide')
print(temp_names[1:2])
p <- ggplot(temp_df, aes(x = x, y = y, color = suicide)) +
geom_jitter(alpha = 0.7) +
labs(x = temp_names[1],
y = temp_names[2])
print(p)
readline(prompt="Press [enter] to continue")
c <- c + 1
}
}
}
# pair plot, one variable vs the others as facets
for (i in 1:var_count) {
if (i != 16) {
temp_df <- train2[i:var_count]
if (i > 16) {
temp_df <- temp_df %>%
bind_cols(train2[16])
}
temp_ncol <- ncol(temp_df)
fixed_var <- names(temp_df)[1]
names(temp_df)[1] <- 'x'
p <- temp_df %>%
gather(var, val, -x, -suicide) %>%
ggplot(aes(x = x, y = val, color = suicide)) +
geom_jitter(alpha = 0.7) +
facet_wrap(~var, scales = 'free') +
labs(x = fixed_var)
print(p)
readline(prompt="Press [enter] to continue")
}
}
Cluster 2 is less likely to experience abuse and suicide, and more likely to have a history of violence and be charged with disorderly conduct.