Data 622 Homework 4: Mental Health Data Modeling

library(knitr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.5     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret) # For createDataPartition, featurePlot, and classification metrics
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(skimr)    # Used for EDA
#library(corrplot) # For correlation matrix
library(mice) # Multivariate Imputation By Chained Equations
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(micemd)
library(parallel)
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
#library(plyr)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(ggplot2)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(stats)
library(tidymodels)
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom        0.7.9      ✓ rsample      0.1.0 
## ✓ dials        0.0.10     ✓ tune         0.1.6 
## ✓ infer        1.0.0      ✓ workflows    0.2.4 
## ✓ modeldata    0.1.1      ✓ workflowsets 0.1.0 
## ✓ parsnip      0.1.7      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.17
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x foreach::accumulate()    masks purrr::accumulate()
## x scales::discard()        masks purrr::discard()
## x mice::filter()           masks dplyr::filter(), stats::filter()
## x recipes::fixed()         masks stringr::fixed()
## x kableExtra::group_rows() masks dplyr::group_rows()
## x dplyr::lag()             masks stats::lag()
## x caret::lift()            masks purrr::lift()
## x yardstick::precision()   masks caret::precision()
## x recipes::prepare()       masks VIM::prepare()
## x yardstick::recall()      masks caret::recall()
## x yardstick::sensitivity() masks caret::sensitivity()
## x yardstick::spec()        masks readr::spec()
## x yardstick::specificity() masks caret::specificity()
## x recipes::step()          masks stats::step()
## x foreach::when()          masks purrr::when()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:tune':
## 
##     tune
## The following object is masked from 'package:rsample':
## 
##     permutations
library(vcd)

library(ggplot2)
library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
## 
##     impute
## The following object is masked from 'package:parsnip':
## 
##     translate
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(Boruta)
## Global options
# ---------------------------------

options(max.print="108")
opts_knit$set(width=31)

Loading data

# getwd()
setwd("~/Data 622/repos/data622_fa2021/hw4")
adhd_data <- readxl::read_excel('ADHD_data.xlsx', sheet = "Data", .name_repair = "universal", na = "")
## New names:
## * `ADHD Q1` -> ADHD.Q1
## * `ADHD Q2` -> ADHD.Q2
## * `ADHD Q3` -> ADHD.Q3
## * `ADHD Q4` -> ADHD.Q4
## * `ADHD Q5` -> ADHD.Q5
## * ...
# dim(adhd_data)
str(adhd_data)
## tibble [175 × 54] (S3: tbl_df/tbl/data.frame)
##  $ Initial           : chr [1:175] "JA" "LA" "MD" "RD" ...
##  $ Age               : num [1:175] 24 48 51 43 34 39 41 48 44 27 ...
##  $ Sex               : num [1:175] 1 2 2 1 1 2 2 1 2 2 ...
##  $ Race              : num [1:175] 1 1 1 1 1 1 1 1 1 1 ...
##  $ ADHD.Q1           : num [1:175] 1 3 2 3 4 2 2 2 3 2 ...
##  $ ADHD.Q2           : num [1:175] 1 3 1 3 4 3 2 4 3 3 ...
##  $ ADHD.Q3           : num [1:175] 4 4 2 2 2 1 1 3 3 4 ...
##  $ ADHD.Q4           : num [1:175] 2 4 1 2 4 4 3 4 4 4 ...
##  $ ADHD.Q5           : num [1:175] 3 5 3 4 4 3 4 3 4 4 ...
##  $ ADHD.Q6           : num [1:175] 1 2 3 3 2 2 4 3 3 3 ...
##  $ ADHD.Q7           : num [1:175] 1 2 3 2 3 3 2 1 3 4 ...
##  $ ADHD.Q8           : num [1:175] 3 3 2 4 4 4 3 1 4 3 ...
##  $ ADHD.Q9           : num [1:175] 2 2 0 4 4 4 3 4 3 2 ...
##  $ ADHD.Q10          : num [1:175] 4 4 1 2 2 2 4 2 4 4 ...
##  $ ADHD.Q11          : num [1:175] 2 1 2 3 4 4 3 4 3 3 ...
##  $ ADHD.Q12          : num [1:175] 4 4 0 1 1 2 1 1 0 1 ...
##  $ ADHD.Q13          : num [1:175] 1 2 2 3 3 4 4 4 4 3 ...
##  $ ADHD.Q14          : num [1:175] 0 4 2 3 2 4 4 3 3 4 ...
##  $ ADHD.Q15          : num [1:175] 3 4 3 1 1 3 4 0 3 4 ...
##  $ ADHD.Q16          : num [1:175] 1 3 2 2 2 4 4 0 2 4 ...
##  $ ADHD.Q17          : num [1:175] 3 1 1 1 1 3 2 1 4 2 ...
##  $ ADHD.Q18          : num [1:175] 4 4 1 2 1 3 4 1 3 2 ...
##  $ ADHD.Total        : num [1:175] 40 55 31 45 48 55 54 41 56 56 ...
##  $ MD.Q1a            : num [1:175] 1 1 0 1 0 0 1 0 1 1 ...
##  $ MD.Q1b            : num [1:175] 1 1 0 1 1 1 1 0 1 1 ...
##  $ MD.Q1c            : num [1:175] 1 1 0 0 0 0 0 0 0 0 ...
##  $ MD.Q1d            : num [1:175] 1 1 0 0 1 1 0 0 1 0 ...
##  $ MD.Q1e            : num [1:175] 0 1 1 1 0 1 1 0 1 1 ...
##  $ MD.Q1f            : num [1:175] 1 1 1 1 1 1 1 1 1 0 ...
##  $ MD.Q1g            : num [1:175] 1 1 1 1 1 1 0 1 1 1 ...
##  $ MD.Q1h            : num [1:175] 1 1 0 1 0 1 0 0 0 0 ...
##  $ MD.Q1i            : num [1:175] 1 1 0 1 0 1 0 0 0 0 ...
##  $ MD.Q1j            : num [1:175] 1 0 0 0 0 1 0 0 0 1 ...
##  $ MD.Q1k            : num [1:175] 1 0 0 0 0 1 0 0 0 1 ...
##  $ MD.Q1L            : num [1:175] 0 1 0 1 0 1 1 1 1 1 ...
##  $ MD.Q1m            : num [1:175] 1 0 0 1 0 0 0 0 1 1 ...
##  $ MD.Q2             : num [1:175] 1 1 0 1 1 1 1 1 1 1 ...
##  $ MD.Q3             : num [1:175] 3 3 2 3 2 3 3 3 3 2 ...
##  $ MD.TOTAL          : num [1:175] 15 14 5 13 7 14 9 7 12 11 ...
##  $ Alcohol           : num [1:175] 1 0 0 1 1 1 3 0 1 0 ...
##  $ THC               : num [1:175] 1 0 0 1 1 0 3 0 0 3 ...
##  $ Cocaine           : num [1:175] 1 0 0 1 0 0 1 0 0 0 ...
##  $ Stimulants        : num [1:175] 0 0 0 1 0 0 1 0 0 0 ...
##  $ Sedative.hypnotics: num [1:175] 0 0 0 0 0 0 1 0 0 0 ...
##  $ Opioids           : num [1:175] 0 0 0 0 0 0 0 0 1 0 ...
##  $ Court.order       : num [1:175] 1 0 0 0 1 0 0 0 0 0 ...
##  $ Education         : num [1:175] 11 14 12 12 9 11 12 16 12 9 ...
##  $ Hx.of.Violence    : num [1:175] 0 0 0 0 1 0 1 1 1 0 ...
##  $ Disorderly.Conduct: num [1:175] 1 0 0 0 1 1 1 1 1 1 ...
##  $ Suicide           : num [1:175] 1 1 0 1 1 1 0 0 0 0 ...
##  $ Abuse             : num [1:175] 0 4 6 7 0 2 4 0 0 2 ...
##  $ Non.subst.Dx      : num [1:175] 2 1 2 2 2 0 1 2 1 1 ...
##  $ Subst.Dx          : num [1:175] 0 0 0 0 0 0 0 1 0 2 ...
##  $ Psych.meds.       : num [1:175] 2 1 1 2 0 0 1 2 1 0 ...

Data Processing Steps

I employed a couple of strategies to transform the data before applying any models.

  • Impute only when necessary, and apply the method appropriate to each variable type (a quick way to rank columns by missingness is sketched below).

  • Convert the categorical fields to factors.
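
Before choosing imputation methods, it helps to rank the columns by their share of missing values. A minimal base-R check (the VIM::aggr() call below reports the same ranking alongside a plot):

# Proportion of missing values per column, highest first
round(sort(colMeans(is.na(adhd_data)), decreasing = TRUE), 3)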

aggr_plot <- aggr(adhd_data, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(adhd_data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##            Variable      Count
##         Psych.meds. 0.67428571
##            Subst.Dx 0.13142857
##        Non.subst.Dx 0.12571429
##               Abuse 0.08000000
##             Suicide 0.07428571
##      Hx.of.Violence 0.06285714
##  Disorderly.Conduct 0.06285714
##           Education 0.05142857
##         Court.order 0.02857143
##             Alcohol 0.02285714
##                 THC 0.02285714
##             Cocaine 0.02285714
##          Stimulants 0.02285714
##  Sedative.hypnotics 0.02285714
##             Opioids 0.02285714
##             Initial 0.00000000
##                 Age 0.00000000
##                 Sex 0.00000000
##                Race 0.00000000
##             ADHD.Q1 0.00000000
##             ADHD.Q2 0.00000000
##             ADHD.Q3 0.00000000
##             ADHD.Q4 0.00000000
##             ADHD.Q5 0.00000000
##             ADHD.Q6 0.00000000
##             ADHD.Q7 0.00000000
##             ADHD.Q8 0.00000000
##             ADHD.Q9 0.00000000
##            ADHD.Q10 0.00000000
##            ADHD.Q11 0.00000000
##            ADHD.Q12 0.00000000
##            ADHD.Q13 0.00000000
##            ADHD.Q14 0.00000000
##            ADHD.Q15 0.00000000
##            ADHD.Q16 0.00000000
##            ADHD.Q17 0.00000000
##            ADHD.Q18 0.00000000
##          ADHD.Total 0.00000000
##              MD.Q1a 0.00000000
##              MD.Q1b 0.00000000
##              MD.Q1c 0.00000000
##              MD.Q1d 0.00000000
##              MD.Q1e 0.00000000
##              MD.Q1f 0.00000000
##              MD.Q1g 0.00000000
##              MD.Q1h 0.00000000
##              MD.Q1i 0.00000000
##              MD.Q1j 0.00000000
##              MD.Q1k 0.00000000
##              MD.Q1L 0.00000000
##              MD.Q1m 0.00000000
##               MD.Q2 0.00000000
##               MD.Q3 0.00000000
##            MD.TOTAL 0.00000000

Removing Initial as it provides no value to any model. I also removed Psych.meds., since 67% of its values are missing.

adhd_data_pre_imp <- adhd_data %>% select(-c('Initial','Psych.meds.'))
skim(adhd_data_pre_imp) %>% dplyr::filter( n_missing > 0 ) 
Data summary
Name adhd_data_pre_imp
Number of rows 175
Number of columns 52
_______________________
Column type frequency:
numeric 14
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Alcohol 4 0.98 1.35 1.39 0 0 1 3.0 3 ▇▂▁▁▆
THC 4 0.98 0.81 1.27 0 0 0 1.5 3 ▇▁▁▁▃
Cocaine 4 0.98 1.09 1.39 0 0 0 3.0 3 ▇▁▁▁▅
Stimulants 4 0.98 0.12 0.53 0 0 0 0.0 3 ▇▁▁▁▁
Sedative.hypnotics 4 0.98 0.12 0.54 0 0 0 0.0 3 ▇▁▁▁▁
Opioids 4 0.98 0.39 0.99 0 0 0 0.0 3 ▇▁▁▁▁
Court.order 5 0.97 0.09 0.28 0 0 0 0.0 1 ▇▁▁▁▁
Education 9 0.95 11.90 2.17 6 11 12 13.0 19 ▁▅▇▂▁
Hx.of.Violence 11 0.94 0.24 0.43 0 0 0 0.0 1 ▇▁▁▁▂
Disorderly.Conduct 11 0.94 0.73 0.45 0 0 1 1.0 1 ▃▁▁▁▇
Suicide 13 0.93 0.30 0.46 0 0 0 1.0 1 ▇▁▁▁▃
Abuse 14 0.92 1.33 2.12 0 0 0 2.0 7 ▇▂▁▁▁
Non.subst.Dx 22 0.87 0.44 0.68 0 0 0 1.0 2 ▇▁▃▁▁
Subst.Dx 23 0.87 1.14 0.93 0 0 1 2.0 3 ▆▇▁▅▂
adhd_data_pre_imp <- adhd_data_pre_imp %>% mutate(NonSubstDx =  as.factor(Non.subst.Dx)  ,
                                                  SubstDx = as.factor(Subst.Dx) ,
                                                  Abuse = factor(Abuse, ordered = TRUE), 
                                                  Alcohol = as.factor(Alcohol)
)


adhd_data_pre_imp <- adhd_data_pre_imp %>%  mutate_if(is.numeric, factor)
skim(adhd_data_pre_imp)
Data summary
Name adhd_data_pre_imp
Number of rows 175
Number of columns 54
_______________________
Column type frequency:
factor 54
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Age 0 1.00 FALSE 42 44: 12, 51: 9, 26: 8, 33: 7
Sex 0 1.00 FALSE 2 1: 99, 2: 76
Race 0 1.00 FALSE 4 2: 100, 1: 72, 6: 2, 3: 1
ADHD.Q1 0 1.00 FALSE 5 2: 44, 1: 43, 0: 39, 3: 30
ADHD.Q2 0 1.00 FALSE 5 2: 47, 1: 46, 3: 33, 0: 25
ADHD.Q3 0 1.00 FALSE 5 1: 46, 2: 46, 3: 32, 0: 26
ADHD.Q4 0 1.00 FALSE 5 2: 50, 4: 36, 1: 31, 3: 31
ADHD.Q5 0 1.00 FALSE 6 3: 47, 4: 41, 0: 33, 2: 32
ADHD.Q6 0 1.00 FALSE 5 2: 45, 3: 45, 0: 36, 1: 29
ADHD.Q7 0 1.00 FALSE 5 2: 54, 1: 53, 3: 25, 0: 22
ADHD.Q8 0 1.00 FALSE 5 3: 42, 1: 40, 2: 40, 4: 32
ADHD.Q9 0 1.00 FALSE 5 1: 43, 3: 41, 2: 36, 0: 31
ADHD.Q10 0 1.00 FALSE 5 2: 49, 1: 46, 3: 33, 4: 32
ADHD.Q11 0 1.00 FALSE 5 2: 48, 3: 43, 4: 35, 1: 33
ADHD.Q12 0 1.00 FALSE 5 0: 55, 1: 55, 2: 37, 3: 15
ADHD.Q13 0 1.00 FALSE 5 3: 47, 2: 46, 4: 38, 1: 29
ADHD.Q14 0 1.00 FALSE 5 3: 47, 2: 40, 4: 37, 0: 27
ADHD.Q15 0 1.00 FALSE 5 0: 50, 1: 39, 2: 35, 3: 27
ADHD.Q16 0 1.00 FALSE 5 1: 49, 0: 40, 2: 39, 4: 30
ADHD.Q17 0 1.00 FALSE 5 0: 49, 2: 46, 1: 41, 3: 22
ADHD.Q18 0 1.00 FALSE 5 1: 52, 0: 49, 2: 35, 3: 20
ADHD.Total 0 1.00 FALSE 62 17: 8, 31: 7, 32: 7, 24: 6
MD.Q1a 0 1.00 FALSE 2 1: 96, 0: 79
MD.Q1b 0 1.00 FALSE 2 1: 100, 0: 75
MD.Q1c 0 1.00 FALSE 2 1: 95, 0: 80
MD.Q1d 0 1.00 FALSE 2 1: 102, 0: 73
MD.Q1e 0 1.00 FALSE 2 1: 97, 0: 78
MD.Q1f 0 1.00 FALSE 2 1: 122, 0: 53
MD.Q1g 0 1.00 FALSE 2 1: 126, 0: 49
MD.Q1h 0 1.00 FALSE 2 1: 98, 0: 77
MD.Q1i 0 1.00 FALSE 2 1: 103, 0: 72
MD.Q1j 0 1.00 FALSE 2 0: 107, 1: 68
MD.Q1k 0 1.00 FALSE 2 0: 90, 1: 85
MD.Q1L 0 1.00 FALSE 2 1: 102, 0: 73
MD.Q1m 0 1.00 FALSE 2 0: 89, 1: 86
MD.Q2 0 1.00 FALSE 2 1: 126, 0: 49
MD.Q3 0 1.00 FALSE 4 3: 76, 2: 49, 0: 25, 1: 25
MD.TOTAL 0 1.00 FALSE 18 11: 18, 15: 14, 10: 13, 13: 13
Alcohol 4 0.98 FALSE 4 0: 80, 3: 66, 1: 18, 2: 7
THC 4 0.98 FALSE 4 0: 116, 3: 40, 1: 12, 2: 3
Cocaine 4 0.98 FALSE 4 0: 101, 3: 56, 1: 9, 2: 5
Stimulants 4 0.98 FALSE 3 0: 160, 1: 6, 3: 5
Sedative.hypnotics 4 0.98 FALSE 4 0: 161, 3: 5, 1: 4, 2: 1
Opioids 4 0.98 FALSE 3 0: 146, 3: 21, 1: 4
Court.order 5 0.97 FALSE 2 0: 155, 1: 15
Education 9 0.95 FALSE 14 12: 67, 11: 23, 13: 15, 14: 14
Hx.of.Violence 11 0.94 FALSE 2 0: 124, 1: 40
Disorderly.Conduct 11 0.94 FALSE 2 1: 119, 0: 45
Suicide 13 0.93 FALSE 2 0: 113, 1: 49
Abuse 14 0.92 TRUE 8 0: 101, 2: 20, 5: 10, 1: 8
Non.subst.Dx 22 0.87 FALSE 3 0: 102, 1: 35, 2: 16
Subst.Dx 23 0.87 FALSE 4 1: 61, 0: 42, 2: 35, 3: 14
NonSubstDx 22 0.87 FALSE 3 0: 102, 1: 35, 2: 16
SubstDx 23 0.87 FALSE 4 1: 61, 0: 42, 2: 35, 3: 14
adhd_data_pre_imp1 <- adhd_data_pre_imp %>% mutate(Education = as.numeric(Education)  ) # note: as.numeric() on a factor returns the level index (1-14) rather than the original years; as.numeric(as.character(Education)) would recover the original scale
skim(adhd_data_pre_imp1)
Data summary
Name adhd_data_pre_imp1
Number of rows 175
Number of columns 54
_______________________
Column type frequency:
factor 53
numeric 1
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Age 0 1.00 FALSE 42 44: 12, 51: 9, 26: 8, 33: 7
Sex 0 1.00 FALSE 2 1: 99, 2: 76
Race 0 1.00 FALSE 4 2: 100, 1: 72, 6: 2, 3: 1
ADHD.Q1 0 1.00 FALSE 5 2: 44, 1: 43, 0: 39, 3: 30
ADHD.Q2 0 1.00 FALSE 5 2: 47, 1: 46, 3: 33, 0: 25
ADHD.Q3 0 1.00 FALSE 5 1: 46, 2: 46, 3: 32, 0: 26
ADHD.Q4 0 1.00 FALSE 5 2: 50, 4: 36, 1: 31, 3: 31
ADHD.Q5 0 1.00 FALSE 6 3: 47, 4: 41, 0: 33, 2: 32
ADHD.Q6 0 1.00 FALSE 5 2: 45, 3: 45, 0: 36, 1: 29
ADHD.Q7 0 1.00 FALSE 5 2: 54, 1: 53, 3: 25, 0: 22
ADHD.Q8 0 1.00 FALSE 5 3: 42, 1: 40, 2: 40, 4: 32
ADHD.Q9 0 1.00 FALSE 5 1: 43, 3: 41, 2: 36, 0: 31
ADHD.Q10 0 1.00 FALSE 5 2: 49, 1: 46, 3: 33, 4: 32
ADHD.Q11 0 1.00 FALSE 5 2: 48, 3: 43, 4: 35, 1: 33
ADHD.Q12 0 1.00 FALSE 5 0: 55, 1: 55, 2: 37, 3: 15
ADHD.Q13 0 1.00 FALSE 5 3: 47, 2: 46, 4: 38, 1: 29
ADHD.Q14 0 1.00 FALSE 5 3: 47, 2: 40, 4: 37, 0: 27
ADHD.Q15 0 1.00 FALSE 5 0: 50, 1: 39, 2: 35, 3: 27
ADHD.Q16 0 1.00 FALSE 5 1: 49, 0: 40, 2: 39, 4: 30
ADHD.Q17 0 1.00 FALSE 5 0: 49, 2: 46, 1: 41, 3: 22
ADHD.Q18 0 1.00 FALSE 5 1: 52, 0: 49, 2: 35, 3: 20
ADHD.Total 0 1.00 FALSE 62 17: 8, 31: 7, 32: 7, 24: 6
MD.Q1a 0 1.00 FALSE 2 1: 96, 0: 79
MD.Q1b 0 1.00 FALSE 2 1: 100, 0: 75
MD.Q1c 0 1.00 FALSE 2 1: 95, 0: 80
MD.Q1d 0 1.00 FALSE 2 1: 102, 0: 73
MD.Q1e 0 1.00 FALSE 2 1: 97, 0: 78
MD.Q1f 0 1.00 FALSE 2 1: 122, 0: 53
MD.Q1g 0 1.00 FALSE 2 1: 126, 0: 49
MD.Q1h 0 1.00 FALSE 2 1: 98, 0: 77
MD.Q1i 0 1.00 FALSE 2 1: 103, 0: 72
MD.Q1j 0 1.00 FALSE 2 0: 107, 1: 68
MD.Q1k 0 1.00 FALSE 2 0: 90, 1: 85
MD.Q1L 0 1.00 FALSE 2 1: 102, 0: 73
MD.Q1m 0 1.00 FALSE 2 0: 89, 1: 86
MD.Q2 0 1.00 FALSE 2 1: 126, 0: 49
MD.Q3 0 1.00 FALSE 4 3: 76, 2: 49, 0: 25, 1: 25
MD.TOTAL 0 1.00 FALSE 18 11: 18, 15: 14, 10: 13, 13: 13
Alcohol 4 0.98 FALSE 4 0: 80, 3: 66, 1: 18, 2: 7
THC 4 0.98 FALSE 4 0: 116, 3: 40, 1: 12, 2: 3
Cocaine 4 0.98 FALSE 4 0: 101, 3: 56, 1: 9, 2: 5
Stimulants 4 0.98 FALSE 3 0: 160, 1: 6, 3: 5
Sedative.hypnotics 4 0.98 FALSE 4 0: 161, 3: 5, 1: 4, 2: 1
Opioids 4 0.98 FALSE 3 0: 146, 3: 21, 1: 4
Court.order 5 0.97 FALSE 2 0: 155, 1: 15
Hx.of.Violence 11 0.94 FALSE 2 0: 124, 1: 40
Disorderly.Conduct 11 0.94 FALSE 2 1: 119, 0: 45
Suicide 13 0.93 FALSE 2 0: 113, 1: 49
Abuse 14 0.92 TRUE 8 0: 101, 2: 20, 5: 10, 1: 8
Non.subst.Dx 22 0.87 FALSE 3 0: 102, 1: 35, 2: 16
Subst.Dx 23 0.87 FALSE 4 1: 61, 0: 42, 2: 35, 3: 14
NonSubstDx 22 0.87 FALSE 3 0: 102, 1: 35, 2: 16
SubstDx 23 0.87 FALSE 4 1: 61, 0: 42, 2: 35, 3: 14

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Education 9 0.95 6.9 2.17 1 6 7 8 14 ▁▅▇▂▁
# skim(adhd_data_pre_imp)

Imputation

More specifically, I separated the variables with missing values into the following categories:

  • Categorical Variables with more than 2 levels: Subst.Dx, NonSubstDx, Opioids, Sedative.hypnotics, Stimulants, Cocaine, THC, Alcohol - polyreg

  • Categorical Variables with 2 levels: Disorderly.Conduct, Hx.of.Violence, Court.order - logreg

  • Discrete variable (ordered) with more than 2 levels: Abuse - polr

  • Continuous variable: Education - pmm

init <- mice(adhd_data_pre_imp1, maxit=0) 
## Warning: Number of logged events: 2
meth <- init$method
predM <- init$predictorMatrix
meth[c('Education')] <- 'pmm'
meth[c('Disorderly.Conduct', 'Hx.of.Violence', 'Court.order')] <- 'logreg'
meth[c('Subst.Dx', 'NonSubstDx' , 'Opioids', 'Sedative.hypnotics', 'Stimulants', 'Cocaine', 'THC', 'Alcohol')] <- 'polyreg'
meth[c('Abuse')] = 'polr'
meth[c('Age','Sex','Race','ADHD.Q1','ADHD.Q2','ADHD.Q3','ADHD.Q4','ADHD.Q5','ADHD.Q6','ADHD.Q7','ADHD.Q8','ADHD.Q9','ADHD.Q10','ADHD.Q11','ADHD.Q12','ADHD.Q13','ADHD.Q14','ADHD.Q15','ADHD.Q16','ADHD.Q17','ADHD.Q18','ADHD.Total','MD.Q1a','MD.Q1b','MD.Q1c','MD.Q1d','MD.Q1e','MD.Q1f','MD.Q1g','MD.Q1h','MD.Q1i','MD.Q1j','MD.Q1k','MD.Q1L','MD.Q1m','MD.Q2','MD.Q3','MD.TOTAL','Suicide')] = ''

# adhd_data_imp1 <- mice(adhd_data_pre_imp1, method=meth, predictorMatrix=predM, seed=501)
# adhd_data_imp1 <- parlmice(adhd_data_pre_imp1, method=meth, predictorMatrix=predM, cluster.seed=501, m = 4, n.core = 4, n.imp.core = 100)

# no_cores <- detectCores() - 1
# 
# cl<-makePSOCKcluster(no_cores)
# 
# registerDoParallel(cl)
# 
# start.time<-proc.time()
# 
# adhd_data_imp1 <- mice.par(adhd_data_pre_imp1, method=meth, predictorMatrix=predM, seed=301, m = 5)
# 
# stop.time<-proc.time()
# 
# run.time<-stop.time -start.time
# 
# print(run.time)
# 
# stopCluster(cl)
# user  system elapsed 
#   1.826   4.303 264.775 
# abandoned this method as it's not always consistent
# cl<-makePSOCKcluster(no_cores)
# registerDoParallel(cl)
#   
# start.time<-proc.time()
#   
# adhd_data_imp1 <- parlmice(adhd_data_pre_imp1, method=meth, predictorMatrix=predM, cluster.seed=501, m = 4, n.core = 4, n.imp.core = 1)
# 
# stop.time<-proc.time()
# 
# run.time<-stop.time -start.time
# 
# print(run.time)
# 
# stopCluster(cl)

Examining the imputations

Examining the imputations for Education. Since the 50th percentile before imputation was 12, I picked an imputation that preserves it; imputation #4 fits the bill.
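
A hedged sketch for comparing the candidate imputations, assuming the commented-out mice() call above was run and its result stored in adhd_data_imp1 (in a mids object, imp$Education holds one column per imputation):

# Median of the imputed Education values in each of the m imputations
apply(adhd_data_imp1$imp$Education, 2, median)
# Keep the completed dataset from imputation #4
adhd_data_2 <- mice::complete(adhd_data_imp1, 4)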

# str(adhd_data_imp1)
# skim(adhd_data_imp1) %>% dplyr::filter( n_missing > 0 )
# 
# Education
# adhd_data_imp1$imp$Education
# adhd_data_pre_imp1[140:142,]
# 
# 
# # Alcohol
# adhd_data_imp1$imp$Alcohol
# adhd_data_2 <- complete(adhd_data_imp1, 4) # 2nd argument if not provided is defaulted to 1
# skim(adhd_data_2) %>% dplyr::filter( n_missing > 0 )
# # # saving a R data frame

# saveRDS(adhd_data_2, "adhd_data_2.rds")

I ended up dropping 29 records. Although the remaining missing values could have been filled in manually, I don't consider manual fills trustworthy when the imputation algorithm itself left them blank. The models that follow require complete cases (except for the Suicide variable in the clustering and PCA section, where Suicide is not the response variable of interest), which is ultimately why I dropped every record that still had a missing field.
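
A minimal sketch of that filter (mirroring the commented-out lines below), assuming adhd_data_2 is the completed dataset from the imputation step:

# Rows that are complete everywhere except the Suicide column (column 49)
keep <- complete.cases(adhd_data_2[, -49])
sum(!keep)                         # 29 records dropped in this run
adhd_data_3 <- adhd_data_2[keep, ]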

# loading rds object

# adhd_data_2 <- readRDS("adhd_data_2.rds")

# adhd_data_imp1$imp$Suicide
# 41    NA  NA  NA  NA  NA
# 49    NA  NA  NA  NA  NA
# 53    NA  NA  NA  NA  NA
# 67    NA  NA  NA  NA  NA
# 73    NA  NA  NA  NA  NA
# 106   NA  NA  NA  NA  NA
# 117   NA  NA  NA  NA  NA
# 122   NA  NA  NA  NA  NA
# 129   NA  NA  NA  NA  NA
# 131


# adhd_data_pre_imp1[complete.cases(adhd_data_pre_imp1[ , -49]),]

# saved 3 records
# adhd_data_3 <- adhd_data_2[complete.cases(adhd_data_2[ , -49]),]
# Saving the records 
# saveRDS(adhd_data_3, "adhd_data_3.rds")
#Loading the records
adhd_data_3 <- readRDS("adhd_data_3.rds")

Count for each Education / suicide attempt

ggplot(adhd_data_3, aes(x = Education, fill = Suicide)) +
  geom_bar(alpha = 0.8) +
  scale_fill_manual(values = c("darkorange", "purple", "cyan4"),
                    guide = "none") +
  theme_minimal() +
  facet_wrap(~Suicide, ncol = 1) +
  coord_flip()

mosaic(~ Education + Suicide, data = adhd_data_3)

Count for each Subst.Dx / suicide attempt

ggplot(adhd_data_3, aes(x = Subst.Dx, fill = Suicide)) +
  geom_bar(alpha = 0.8) +
  scale_fill_manual(values = c("darkorange", "purple", "cyan4"),
                    guide = "none") +
  theme_minimal() +
  facet_wrap(~Suicide, ncol = 1) +
  coord_flip()

mosaic(~ Subst.Dx + Suicide, data = adhd_data_3)

Count for each Non.subst.Dx / suicide attempt

ggplot(adhd_data_3, aes(x = Non.subst.Dx, fill = Suicide)) +
  geom_bar(alpha = 0.8) +
  scale_fill_manual(values = c("darkorange", "purple", "cyan4"),
                    guide = "none") +
  theme_minimal() +
  facet_wrap(~Suicide, ncol = 1) +
  coord_flip()

mosaic(~ Non.subst.Dx + Suicide, data = adhd_data_3)

Count for each Abuse / Suicide

ggplot(adhd_data_3, aes(x = Abuse, fill = Suicide)) +
  geom_bar(alpha = 0.8) +
  scale_fill_manual(values = c("darkorange", "purple", "cyan4", "black"),
                    guide = "none") +
  theme_minimal() +
  facet_wrap(~Suicide, ncol = 1) +
  coord_flip()

mosaic(~ Abuse + Suicide, data = adhd_data_3)

Clustering Method

Principal Component Analysis

Extreme Gradient Boosting (XGBoost)

Removing the lone record where Suicide is NA.

# need to remove Suicide null 
dim(adhd_data_3)
## [1] 146  54
# skim(adhd_data_3) 
adhd_data_4 <- adhd_data_3[complete.cases(adhd_data_3),]
dim(adhd_data_4)
## [1] 145  54
# Dropping the 2 repeated columns (Non.subst.Dx and Subst.Dx, which duplicate NonSubstDx and SubstDx)
adhd_data_4 <- adhd_data_4[, -c(51:52)]

Splitting the data into train and test sets

set.seed(108)
sample_size <- floor(nrow(adhd_data_4)*0.8)
indices <- sample(1:nrow(adhd_data_4),sample_size)
data_train <- adhd_data_4[c(indices),]
data_test <- adhd_data_4[-c(indices),]

One-hot encode the features, which XGBoost requires. At the end, we want to verify that data_train and data_test end up with the same number of columns.
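
If the two splits were ever encoded with different factor levels (for example after droplevels(), or when the splits are read from separate files), fitting dummyVars() separately on each split could yield mismatched columns. A safer variant, sketched here with the same caret API, fits the encoder once on the training predictors and applies it to both splits:

# Fit the encoder on the training predictors only, then apply it to both splits
# so the dummy columns are guaranteed to line up
dummy_fit <- dummyVars(" ~ . ", data = data_train[, -49])
train_enc <- data.frame(predict(dummy_fit, newdata = data_train[, -49]))
test_enc  <- data.frame(predict(dummy_fit, newdata = data_test[, -49]))
identical(colnames(train_enc), colnames(test_enc))   # should be TRUE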

# do the 1-hot encoding for the data_train
Suicide <- data_train[, 49]
dummy <- dummyVars(" ~ . ", data = data_train [, -49] )
newdata <- data.frame(predict (dummy, newdata = data_train[, -49]))
data_train <- cbind (newdata, Suicide)
options(max.print="309")
colnames(data_train)
##   [1] "Age.18"               "Age.19"               "Age.20"              
##   [4] "Age.21"               "Age.22"               "Age.23"              
##   [7] "Age.24"               "Age.25"               "Age.26"              
##  [10] "Age.27"               "Age.28"               "Age.29"              
##  [13] "Age.30"               "Age.31"               "Age.32"              
##  [16] "Age.33"               "Age.34"               "Age.35"              
##  [19] "Age.36"               "Age.37"               "Age.38"              
##  [22] "Age.39"               "Age.40"               "Age.41"              
##  [25] "Age.42"               "Age.43"               "Age.44"              
##  [28] "Age.45"               "Age.46"               "Age.47"              
##  [31] "Age.48"               "Age.49"               "Age.50"              
##  [34] "Age.51"               "Age.52"               "Age.53"              
##  [37] "Age.54"               "Age.55"               "Age.56"              
##  [40] "Age.57"               "Age.61"               "Age.69"              
##  [43] "Sex.1"                "Sex.2"                "Race.1"              
##  [46] "Race.2"               "Race.3"               "Race.6"              
##  [49] "ADHD.Q1.0"            "ADHD.Q1.1"            "ADHD.Q1.2"           
##  [52] "ADHD.Q1.3"            "ADHD.Q1.4"            "ADHD.Q2.0"           
##  [55] "ADHD.Q2.1"            "ADHD.Q2.2"            "ADHD.Q2.3"           
##  [58] "ADHD.Q2.4"            "ADHD.Q3.0"            "ADHD.Q3.1"           
##  [61] "ADHD.Q3.2"            "ADHD.Q3.3"            "ADHD.Q3.4"           
##  [64] "ADHD.Q4.0"            "ADHD.Q4.1"            "ADHD.Q4.2"           
##  [67] "ADHD.Q4.3"            "ADHD.Q4.4"            "ADHD.Q5.0"           
##  [70] "ADHD.Q5.1"            "ADHD.Q5.2"            "ADHD.Q5.3"           
##  [73] "ADHD.Q5.4"            "ADHD.Q5.5"            "ADHD.Q6.0"           
##  [76] "ADHD.Q6.1"            "ADHD.Q6.2"            "ADHD.Q6.3"           
##  [79] "ADHD.Q6.4"            "ADHD.Q7.0"            "ADHD.Q7.1"           
##  [82] "ADHD.Q7.2"            "ADHD.Q7.3"            "ADHD.Q7.4"           
##  [85] "ADHD.Q8.0"            "ADHD.Q8.1"            "ADHD.Q8.2"           
##  [88] "ADHD.Q8.3"            "ADHD.Q8.4"            "ADHD.Q9.0"           
##  [91] "ADHD.Q9.1"            "ADHD.Q9.2"            "ADHD.Q9.3"           
##  [94] "ADHD.Q9.4"            "ADHD.Q10.0"           "ADHD.Q10.1"          
##  [97] "ADHD.Q10.2"           "ADHD.Q10.3"           "ADHD.Q10.4"          
## [100] "ADHD.Q11.0"           "ADHD.Q11.1"           "ADHD.Q11.2"          
## [103] "ADHD.Q11.3"           "ADHD.Q11.4"           "ADHD.Q12.0"          
## [106] "ADHD.Q12.1"           "ADHD.Q12.2"           "ADHD.Q12.3"          
## [109] "ADHD.Q12.4"           "ADHD.Q13.0"           "ADHD.Q13.1"          
## [112] "ADHD.Q13.2"           "ADHD.Q13.3"           "ADHD.Q13.4"          
## [115] "ADHD.Q14.0"           "ADHD.Q14.1"           "ADHD.Q14.2"          
## [118] "ADHD.Q14.3"           "ADHD.Q14.4"           "ADHD.Q15.0"          
## [121] "ADHD.Q15.1"           "ADHD.Q15.2"           "ADHD.Q15.3"          
## [124] "ADHD.Q15.4"           "ADHD.Q16.0"           "ADHD.Q16.1"          
## [127] "ADHD.Q16.2"           "ADHD.Q16.3"           "ADHD.Q16.4"          
## [130] "ADHD.Q17.0"           "ADHD.Q17.1"           "ADHD.Q17.2"          
## [133] "ADHD.Q17.3"           "ADHD.Q17.4"           "ADHD.Q18.0"          
## [136] "ADHD.Q18.1"           "ADHD.Q18.2"           "ADHD.Q18.3"          
## [139] "ADHD.Q18.4"           "ADHD.Total.0"         "ADHD.Total.1"        
## [142] "ADHD.Total.3"         "ADHD.Total.5"         "ADHD.Total.6"        
## [145] "ADHD.Total.7"         "ADHD.Total.8"         "ADHD.Total.9"        
## [148] "ADHD.Total.10"        "ADHD.Total.11"        "ADHD.Total.12"       
## [151] "ADHD.Total.13"        "ADHD.Total.14"        "ADHD.Total.16"       
## [154] "ADHD.Total.17"        "ADHD.Total.18"        "ADHD.Total.19"       
## [157] "ADHD.Total.20"        "ADHD.Total.21"        "ADHD.Total.23"       
## [160] "ADHD.Total.24"        "ADHD.Total.25"        "ADHD.Total.26"       
## [163] "ADHD.Total.27"        "ADHD.Total.28"        "ADHD.Total.29"       
## [166] "ADHD.Total.30"        "ADHD.Total.31"        "ADHD.Total.32"       
## [169] "ADHD.Total.33"        "ADHD.Total.34"        "ADHD.Total.35"       
## [172] "ADHD.Total.36"        "ADHD.Total.37"        "ADHD.Total.38"       
## [175] "ADHD.Total.39"        "ADHD.Total.40"        "ADHD.Total.41"       
## [178] "ADHD.Total.42"        "ADHD.Total.43"        "ADHD.Total.44"       
## [181] "ADHD.Total.45"        "ADHD.Total.46"        "ADHD.Total.47"       
## [184] "ADHD.Total.48"        "ADHD.Total.49"        "ADHD.Total.50"       
## [187] "ADHD.Total.51"        "ADHD.Total.52"        "ADHD.Total.53"       
## [190] "ADHD.Total.54"        "ADHD.Total.55"        "ADHD.Total.56"       
## [193] "ADHD.Total.57"        "ADHD.Total.58"        "ADHD.Total.62"       
## [196] "ADHD.Total.63"        "ADHD.Total.65"        "ADHD.Total.67"       
## [199] "ADHD.Total.69"        "ADHD.Total.71"        "ADHD.Total.72"       
## [202] "MD.Q1a.0"             "MD.Q1a.1"             "MD.Q1b.0"            
## [205] "MD.Q1b.1"             "MD.Q1c.0"             "MD.Q1c.1"            
## [208] "MD.Q1d.0"             "MD.Q1d.1"             "MD.Q1e.0"            
## [211] "MD.Q1e.1"             "MD.Q1f.0"             "MD.Q1f.1"            
## [214] "MD.Q1g.0"             "MD.Q1g.1"             "MD.Q1h.0"            
## [217] "MD.Q1h.1"             "MD.Q1i.0"             "MD.Q1i.1"            
## [220] "MD.Q1j.0"             "MD.Q1j.1"             "MD.Q1k.0"            
## [223] "MD.Q1k.1"             "MD.Q1L.0"             "MD.Q1L.1"            
## [226] "MD.Q1m.0"             "MD.Q1m.1"             "MD.Q2.0"             
## [229] "MD.Q2.1"              "MD.Q3.0"              "MD.Q3.1"             
## [232] "MD.Q3.2"              "MD.Q3.3"              "MD.TOTAL.0"          
## [235] "MD.TOTAL.1"           "MD.TOTAL.2"           "MD.TOTAL.3"          
## [238] "MD.TOTAL.4"           "MD.TOTAL.5"           "MD.TOTAL.6"          
## [241] "MD.TOTAL.7"           "MD.TOTAL.8"           "MD.TOTAL.9"          
## [244] "MD.TOTAL.10"          "MD.TOTAL.11"          "MD.TOTAL.12"         
## [247] "MD.TOTAL.13"          "MD.TOTAL.14"          "MD.TOTAL.15"         
## [250] "MD.TOTAL.16"          "MD.TOTAL.17"          "Alcohol.0"           
## [253] "Alcohol.1"            "Alcohol.2"            "Alcohol.3"           
## [256] "THC.0"                "THC.1"                "THC.2"               
## [259] "THC.3"                "Cocaine.0"            "Cocaine.1"           
## [262] "Cocaine.2"            "Cocaine.3"            "Stimulants.0"        
## [265] "Stimulants.1"         "Stimulants.3"         "Sedative.hypnotics.0"
## [268] "Sedative.hypnotics.1" "Sedative.hypnotics.2" "Sedative.hypnotics.3"
## [271] "Opioids.0"            "Opioids.1"            "Opioids.3"           
## [274] "Court.order.0"        "Court.order.1"        "Education"           
## [277] "Hx.of.Violence.0"     "Hx.of.Violence.1"     "Disorderly.Conduct.0"
## [280] "Disorderly.Conduct.1" "Abuse.L"              "Abuse.Q"             
## [283] "Abuse.C"              "Abuse.4"              "Abuse.5"             
## [286] "Abuse.6"              "Abuse.7"              "NonSubstDx.0"        
## [289] "NonSubstDx.1"         "NonSubstDx.2"         "SubstDx.0"           
## [292] "SubstDx.1"            "SubstDx.2"            "SubstDx.3"           
## [295] "Suicide"
# repeat the same exercise for the data_test
Suicide1 <- data_test[, 49]
dummy1 <- dummyVars(" ~ . ", data = data_test [, -49] )
newdata1 <- data.frame(predict (dummy1, newdata = data_test[, -49]))
data_test <- cbind (newdata1, Suicide1)
colnames(data_test)
##   [1] "Age.18"               "Age.19"               "Age.20"              
##   [4] "Age.21"               "Age.22"               "Age.23"              
##   [7] "Age.24"               "Age.25"               "Age.26"              
##  [10] "Age.27"               "Age.28"               "Age.29"              
##  [13] "Age.30"               "Age.31"               "Age.32"              
##  [16] "Age.33"               "Age.34"               "Age.35"              
##  [19] "Age.36"               "Age.37"               "Age.38"              
##  [22] "Age.39"               "Age.40"               "Age.41"              
##  [25] "Age.42"               "Age.43"               "Age.44"              
##  [28] "Age.45"               "Age.46"               "Age.47"              
##  [31] "Age.48"               "Age.49"               "Age.50"              
##  [34] "Age.51"               "Age.52"               "Age.53"              
##  [37] "Age.54"               "Age.55"               "Age.56"              
##  [40] "Age.57"               "Age.61"               "Age.69"              
##  [43] "Sex.1"                "Sex.2"                "Race.1"              
##  [46] "Race.2"               "Race.3"               "Race.6"              
##  [49] "ADHD.Q1.0"            "ADHD.Q1.1"            "ADHD.Q1.2"           
##  [52] "ADHD.Q1.3"            "ADHD.Q1.4"            "ADHD.Q2.0"           
##  [55] "ADHD.Q2.1"            "ADHD.Q2.2"            "ADHD.Q2.3"           
##  [58] "ADHD.Q2.4"            "ADHD.Q3.0"            "ADHD.Q3.1"           
##  [61] "ADHD.Q3.2"            "ADHD.Q3.3"            "ADHD.Q3.4"           
##  [64] "ADHD.Q4.0"            "ADHD.Q4.1"            "ADHD.Q4.2"           
##  [67] "ADHD.Q4.3"            "ADHD.Q4.4"            "ADHD.Q5.0"           
##  [70] "ADHD.Q5.1"            "ADHD.Q5.2"            "ADHD.Q5.3"           
##  [73] "ADHD.Q5.4"            "ADHD.Q5.5"            "ADHD.Q6.0"           
##  [76] "ADHD.Q6.1"            "ADHD.Q6.2"            "ADHD.Q6.3"           
##  [79] "ADHD.Q6.4"            "ADHD.Q7.0"            "ADHD.Q7.1"           
##  [82] "ADHD.Q7.2"            "ADHD.Q7.3"            "ADHD.Q7.4"           
##  [85] "ADHD.Q8.0"            "ADHD.Q8.1"            "ADHD.Q8.2"           
##  [88] "ADHD.Q8.3"            "ADHD.Q8.4"            "ADHD.Q9.0"           
##  [91] "ADHD.Q9.1"            "ADHD.Q9.2"            "ADHD.Q9.3"           
##  [94] "ADHD.Q9.4"            "ADHD.Q10.0"           "ADHD.Q10.1"          
##  [97] "ADHD.Q10.2"           "ADHD.Q10.3"           "ADHD.Q10.4"          
## [100] "ADHD.Q11.0"           "ADHD.Q11.1"           "ADHD.Q11.2"          
## [103] "ADHD.Q11.3"           "ADHD.Q11.4"           "ADHD.Q12.0"          
## [106] "ADHD.Q12.1"           "ADHD.Q12.2"           "ADHD.Q12.3"          
## [109] "ADHD.Q12.4"           "ADHD.Q13.0"           "ADHD.Q13.1"          
## [112] "ADHD.Q13.2"           "ADHD.Q13.3"           "ADHD.Q13.4"          
## [115] "ADHD.Q14.0"           "ADHD.Q14.1"           "ADHD.Q14.2"          
## [118] "ADHD.Q14.3"           "ADHD.Q14.4"           "ADHD.Q15.0"          
## [121] "ADHD.Q15.1"           "ADHD.Q15.2"           "ADHD.Q15.3"          
## [124] "ADHD.Q15.4"           "ADHD.Q16.0"           "ADHD.Q16.1"          
## [127] "ADHD.Q16.2"           "ADHD.Q16.3"           "ADHD.Q16.4"          
## [130] "ADHD.Q17.0"           "ADHD.Q17.1"           "ADHD.Q17.2"          
## [133] "ADHD.Q17.3"           "ADHD.Q17.4"           "ADHD.Q18.0"          
## [136] "ADHD.Q18.1"           "ADHD.Q18.2"           "ADHD.Q18.3"          
## [139] "ADHD.Q18.4"           "ADHD.Total.0"         "ADHD.Total.1"        
## [142] "ADHD.Total.3"         "ADHD.Total.5"         "ADHD.Total.6"        
## [145] "ADHD.Total.7"         "ADHD.Total.8"         "ADHD.Total.9"        
## [148] "ADHD.Total.10"        "ADHD.Total.11"        "ADHD.Total.12"       
## [151] "ADHD.Total.13"        "ADHD.Total.14"        "ADHD.Total.16"       
## [154] "ADHD.Total.17"        "ADHD.Total.18"        "ADHD.Total.19"       
## [157] "ADHD.Total.20"        "ADHD.Total.21"        "ADHD.Total.23"       
## [160] "ADHD.Total.24"        "ADHD.Total.25"        "ADHD.Total.26"       
## [163] "ADHD.Total.27"        "ADHD.Total.28"        "ADHD.Total.29"       
## [166] "ADHD.Total.30"        "ADHD.Total.31"        "ADHD.Total.32"       
## [169] "ADHD.Total.33"        "ADHD.Total.34"        "ADHD.Total.35"       
## [172] "ADHD.Total.36"        "ADHD.Total.37"        "ADHD.Total.38"       
## [175] "ADHD.Total.39"        "ADHD.Total.40"        "ADHD.Total.41"       
## [178] "ADHD.Total.42"        "ADHD.Total.43"        "ADHD.Total.44"       
## [181] "ADHD.Total.45"        "ADHD.Total.46"        "ADHD.Total.47"       
## [184] "ADHD.Total.48"        "ADHD.Total.49"        "ADHD.Total.50"       
## [187] "ADHD.Total.51"        "ADHD.Total.52"        "ADHD.Total.53"       
## [190] "ADHD.Total.54"        "ADHD.Total.55"        "ADHD.Total.56"       
## [193] "ADHD.Total.57"        "ADHD.Total.58"        "ADHD.Total.62"       
## [196] "ADHD.Total.63"        "ADHD.Total.65"        "ADHD.Total.67"       
## [199] "ADHD.Total.69"        "ADHD.Total.71"        "ADHD.Total.72"       
## [202] "MD.Q1a.0"             "MD.Q1a.1"             "MD.Q1b.0"            
## [205] "MD.Q1b.1"             "MD.Q1c.0"             "MD.Q1c.1"            
## [208] "MD.Q1d.0"             "MD.Q1d.1"             "MD.Q1e.0"            
## [211] "MD.Q1e.1"             "MD.Q1f.0"             "MD.Q1f.1"            
## [214] "MD.Q1g.0"             "MD.Q1g.1"             "MD.Q1h.0"            
## [217] "MD.Q1h.1"             "MD.Q1i.0"             "MD.Q1i.1"            
## [220] "MD.Q1j.0"             "MD.Q1j.1"             "MD.Q1k.0"            
## [223] "MD.Q1k.1"             "MD.Q1L.0"             "MD.Q1L.1"            
## [226] "MD.Q1m.0"             "MD.Q1m.1"             "MD.Q2.0"             
## [229] "MD.Q2.1"              "MD.Q3.0"              "MD.Q3.1"             
## [232] "MD.Q3.2"              "MD.Q3.3"              "MD.TOTAL.0"          
## [235] "MD.TOTAL.1"           "MD.TOTAL.2"           "MD.TOTAL.3"          
## [238] "MD.TOTAL.4"           "MD.TOTAL.5"           "MD.TOTAL.6"          
## [241] "MD.TOTAL.7"           "MD.TOTAL.8"           "MD.TOTAL.9"          
## [244] "MD.TOTAL.10"          "MD.TOTAL.11"          "MD.TOTAL.12"         
## [247] "MD.TOTAL.13"          "MD.TOTAL.14"          "MD.TOTAL.15"         
## [250] "MD.TOTAL.16"          "MD.TOTAL.17"          "Alcohol.0"           
## [253] "Alcohol.1"            "Alcohol.2"            "Alcohol.3"           
## [256] "THC.0"                "THC.1"                "THC.2"               
## [259] "THC.3"                "Cocaine.0"            "Cocaine.1"           
## [262] "Cocaine.2"            "Cocaine.3"            "Stimulants.0"        
## [265] "Stimulants.1"         "Stimulants.3"         "Sedative.hypnotics.0"
## [268] "Sedative.hypnotics.1" "Sedative.hypnotics.2" "Sedative.hypnotics.3"
## [271] "Opioids.0"            "Opioids.1"            "Opioids.3"           
## [274] "Court.order.0"        "Court.order.1"        "Education"           
## [277] "Hx.of.Violence.0"     "Hx.of.Violence.1"     "Disorderly.Conduct.0"
## [280] "Disorderly.Conduct.1" "Abuse.L"              "Abuse.Q"             
## [283] "Abuse.C"              "Abuse.4"              "Abuse.5"             
## [286] "Abuse.6"              "Abuse.7"              "NonSubstDx.0"        
## [289] "NonSubstDx.1"         "NonSubstDx.2"         "SubstDx.0"           
## [292] "SubstDx.1"            "SubstDx.2"            "SubstDx.3"           
## [295] "Suicide1"
str(data_train)
## 'data.frame':    116 obs. of  295 variables:
##  $ Age.18              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.19              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.20              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.21              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.22              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.23              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.24              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.25              : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Age.26              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.27              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.28              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.29              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.30              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.31              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.32              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.33              : num  0 0 0 0 1 0 0 0 1 0 ...
##  $ Age.34              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.35              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.36              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.37              : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ Age.38              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.39              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.40              : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ Age.41              : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ Age.42              : num  0 1 0 0 0 0 0 1 0 0 ...
##  $ Age.43              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.44              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.45              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.46              : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ Age.47              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.48              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.49              : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Age.50              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.51              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.52              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.53              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.54              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.55              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.56              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.57              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.61              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.69              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sex.1               : num  1 1 0 0 1 1 1 0 0 1 ...
##  $ Sex.2               : num  0 0 1 1 0 0 0 1 1 0 ...
##  $ Race.1              : num  0 0 1 1 1 1 0 0 0 1 ...
##  $ Race.2              : num  1 1 0 0 0 0 1 1 1 0 ...
##  $ Race.3              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Race.6              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q1.0           : num  1 0 0 0 0 0 1 0 0 0 ...
##  $ ADHD.Q1.1           : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q1.2           : num  0 0 0 1 0 1 0 0 0 1 ...
##  $ ADHD.Q1.3           : num  0 0 0 0 0 0 0 1 1 0 ...
##  $ ADHD.Q1.4           : num  0 0 1 0 1 0 0 0 0 0 ...
##  $ ADHD.Q2.0           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q2.1           : num  1 0 0 1 0 0 0 1 0 0 ...
##  $ ADHD.Q2.2           : num  0 1 0 0 0 1 1 0 1 1 ...
##  $ ADHD.Q2.3           : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ ADHD.Q2.4           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ ADHD.Q3.0           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q3.1           : num  1 0 1 1 0 1 0 0 0 0 ...
##  $ ADHD.Q3.2           : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ ADHD.Q3.3           : num  0 0 0 0 1 0 0 0 1 1 ...
##  $ ADHD.Q3.4           : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q4.0           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q4.1           : num  0 0 0 1 0 0 0 1 0 0 ...
##  $ ADHD.Q4.2           : num  1 0 0 0 0 1 1 0 1 0 ...
##  $ ADHD.Q4.3           : num  0 1 0 0 1 0 0 0 0 1 ...
##  $ ADHD.Q4.4           : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ ADHD.Q5.0           : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ ADHD.Q5.1           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q5.2           : num  0 0 1 0 0 0 0 1 0 0 ...
##  $ ADHD.Q5.3           : num  0 1 0 1 1 1 0 0 1 1 ...
##  $ ADHD.Q5.4           : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q5.5           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q6.0           : num  0 0 1 0 0 0 1 0 0 0 ...
##  $ ADHD.Q6.1           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ ADHD.Q6.2           : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ ADHD.Q6.3           : num  1 1 0 1 0 1 0 1 0 0 ...
##  $ ADHD.Q6.4           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q7.0           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ ADHD.Q7.1           : num  1 0 0 1 0 1 0 0 0 0 ...
##  $ ADHD.Q7.2           : num  0 1 1 0 0 0 1 0 0 0 ...
##  $ ADHD.Q7.3           : num  0 0 0 0 1 0 0 0 0 1 ...
##  $ ADHD.Q7.4           : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ ADHD.Q8.0           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q8.1           : num  0 1 0 0 0 1 0 1 0 0 ...
##  $ ADHD.Q8.2           : num  0 0 0 1 0 0 1 0 0 0 ...
##  $ ADHD.Q8.3           : num  1 0 1 0 1 0 0 0 1 1 ...
##  $ ADHD.Q8.4           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q9.0           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ ADHD.Q9.1           : num  0 0 0 0 0 1 0 0 1 0 ...
##  $ ADHD.Q9.2           : num  0 0 0 1 0 0 1 0 0 0 ...
##  $ ADHD.Q9.3           : num  1 1 1 0 1 0 0 0 0 1 ...
##  $ ADHD.Q9.4           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q10.0          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ADHD.Q10.1          : num  1 0 0 1 0 0 0 1 0 0 ...
##  $ ADHD.Q10.2          : num  0 1 0 0 0 1 1 0 1 0 ...
##  $ ADHD.Q10.3          : num  0 0 1 0 1 0 0 0 0 1 ...
##  $ ADHD.Q10.4          : num  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]

As the table below shows, the target variable in data_train is imbalanced, skewed toward 0 (68% of cases).

table(data_train$Suicide)
## 
##  0  1 
## 79 37
train_upsample <- upSample(x=data_train[, -295], y = data_train$Suicide)
# upSample() stores the outcome in a new column named Class, which is why the Suicide column can be dropped from x
table(train_upsample$Class)
## 
##  0  1 
## 79 79
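
For reference, the 68% figure quoted above can be checked directly, and the same check on the upsampled set confirms the 50/50 balance:

# Class proportions before and after upsampling
round(prop.table(table(data_train$Suicide)), 2)     # 0.68 / 0.32
round(prop.table(table(train_upsample$Class)), 2)   # 0.50 / 0.50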

Preparing the grid for XGBoost

grid_tune <- expand.grid(
  nrounds = c(81),          # number of boosting rounds (trees); a typical range is 1500-3000, depending on the number of records
  max_depth = c(2, 4, 6), 
  eta = c(0.01, 0.1, 0.3),  # learning rate
  gamma = 0,                # pruning penalty; normal range [0, 1]
  colsample_bytree = 1,     # subsample ratio of columns per tree
  min_child_weight = 1,     # larger values make the model more conservative; can act as a stopping rule
  subsample = 1             # fraction of training rows sampled per tree, used to prevent overfitting
)

The following defines the trainControl and trains the XGBoost model.

trainControl <- trainControl (method = "cv", 
                              number = 3, 
                              verboseIter = T, 
                              allowParallel = T
                             )

xgb_tune <- train(x = train_upsample[, -295], 
                  y = train_upsample$Class,
                  trControl = trainControl, 
                  tuneGrid = grid_tune, 
                  method = "xgbTree", 
                  verbose = T
                  )
## + Fold1: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold1: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold1: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold1: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold1: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold1: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold1: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold1: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold1: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold1: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold1: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold1: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold1: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold1: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold1: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold1: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold1: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold1: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold2: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold2: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold2: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold2: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold2: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold2: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold2: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold2: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold2: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold2: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold2: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold2: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold2: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold2: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold2: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold2: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold2: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold2: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold3: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold3: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold3: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold3: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold3: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold3: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold3: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold3: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold3: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold3: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold3: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold3: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold3: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold3: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold3: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold3: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## + Fold3: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## - Fold3: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81 
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 81, max_depth = 4, eta = 0.3, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 on full training set
# Best tune
xgb_tune$bestTune
##   nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 8      81         4 0.3     0                1                1         1
# Creating the best model 
train_Control <- trainControl (method = "none", 
                              verboseIter = T, 
                              allowParallel = T
                             )

final_grid <- expand.grid(nrounds = xgb_tune$bestTune$nrounds, 
                          eta = xgb_tune$bestTune$eta,
                          max_depth = xgb_tune$bestTune$max_depth,
                          gamma = xgb_tune$bestTune$gamma,
                          colsample_bytree = xgb_tune$bestTune$colsample_bytree,
                          min_child_weight = xgb_tune$bestTune$min_child_weight,
                          subsample = xgb_tune$bestTune$subsample
)

xgb_model <- train(x= train_upsample[, -295], 
                   y = train_upsample$Class,
                   trControl = train_Control, 
                   tuneGrid = final_grid, 
                   method = "xgbTree", 
                   verbose = F
)                  
## Fitting nrounds = 81, eta = 0.3, max_depth = 4, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 on full training set
# Generate predictions on the test set
xgb.pred <- predict(xgb_model, data_test)

After creating xgb.pred, we compute the confusion matrix to get the performance metrics.

confusionMatrix(xgb.pred, data_test$Suicide1) # the outcome column was named Suicide1 when data_test was rebuilt above
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 19  4
##          1  2  4
##                                           
##                Accuracy : 0.7931          
##                  95% CI : (0.6028, 0.9201)
##     No Information Rate : 0.7241          
##     P-Value [Acc > NIR] : 0.2735          
##                                           
##                   Kappa : 0.4387          
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.9048          
##             Specificity : 0.5000          
##          Pos Pred Value : 0.8261          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.7241          
##          Detection Rate : 0.6552          
##    Detection Prevalence : 0.7931          
##       Balanced Accuracy : 0.7024          
##                                           
##        'Positive' Class : 0               
## 

Accuracy is 79.31%, while balanced accuracy is only 70.24%.
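
Balanced accuracy is simply the mean of sensitivity and specificity, which is why it sits below the raw accuracy here; it can be reproduced from the confusion-matrix object:

# Balanced accuracy = (sensitivity + specificity) / 2
cm <- confusionMatrix(xgb.pred, data_test$Suicide1)
mean(cm$byClass[c("Sensitivity", "Specificity")])    # (0.9048 + 0.5000) / 2 = 0.7024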

Support Vector Machine (SVM)

SVM helps us find an optimal decision boundary that can then be used to classify our labeled data. We model the response variable Suicide on the adhd_data_4 dataset with both a radial and a linear kernel, since we don't know in advance whether the problem is linear or non-linear.

adhd_data_4_matx = as.matrix(adhd_data_4)

df.corr.p = as.data.frame(rcorr(adhd_data_4_matx)$P) # rcorr() returns a list; $P is the matrix of p-values ($r holds the correlation coefficients)

# extract the column for Suicide and drop the Suicide row itself
correlation_table <- cbind(rownames(df.corr.p), df.corr.p[, 49])[-c(49) , ]
correlation_table %>%
  kbl(caption = "Correlation with Suicide") %>%
  kable_material_dark()
Correlation with Suicide
Age 0.30027942497339
Sex 0.0471220935204864
Race 0.218309067324639
ADHD.Q1 0.0117740269942401
ADHD.Q2 0.11564807090846
ADHD.Q3 0.259154266743925
ADHD.Q4 0.112410726789768
ADHD.Q5 0.216255750653531
ADHD.Q6 0.483512910929568
ADHD.Q7 0.048634884292714
ADHD.Q8 0.151797641933531
ADHD.Q9 0.322485825735442
ADHD.Q10 0.291560841921094
ADHD.Q11 0.515184561690949
ADHD.Q12 0.395887015823113
ADHD.Q13 0.584780116748058
ADHD.Q14 0.249482531618938
ADHD.Q15 0.112873665348362
ADHD.Q16 0.201977696336559
ADHD.Q17 0.872794003748846
ADHD.Q18 0.546374843413306
ADHD.Total 0.102932484662113
MD.Q1a 0.0625907388295026
MD.Q1b 0.00317302701726341
MD.Q1c 0.960575599200187
MD.Q1d 0.0010368039318247
MD.Q1e 0.537896077431999
MD.Q1f 0.00712626183154863
MD.Q1g 0.000120152964638853
MD.Q1h 0.144730606824987
MD.Q1i 0.2592977551472
MD.Q1j 0.267581368524187
MD.Q1k 0.016734195148864
MD.Q1L 0.0156611407664968
MD.Q1m 0.403393642126824
MD.Q2 0.000857782522594164
MD.Q3 0.025029663425552
MD.TOTAL 0.000755726404960466
Alcohol 0.0138128897960139
THC 0.868985202967459
Cocaine 0.158974694425702
Stimulants 0.441880666140152
Sedative.hypnotics 0.0752635208018009
Opioids 0.016937963928505
Court.order 0.0805564613765424
Education 0.230822596373245
Hx.of.Violence 0.149606337250033
Disorderly.Conduct 0.704780193506212
Abuse 0.000396677403584178
NonSubstDx 0.693218272432247
SubstDx 0.0468228684891696

Here are the factors whose value against Suicide in the table above is at least 0.30:

  • Age

  • ADHD.Q6

  • ADHD.Q9

  • ADHD.Q11

  • ADHD.Q12

  • ADHD.Q13

  • ADHD.Q17

  • ADHD.Q18

  • MD.Q1c

  • MD.Q1e

  • MD.Q1m

  • THC

  • Stimulants

  • Disorderly.Conduct

  • NonSubstDx

We further pare down the list to the factors with a value of 0.50 or above.

svm_selected <- as.data.frame(cbind(rownames(df.corr.p[df.corr.p[, 49] >= .5,]), df.corr.p[df.corr.p[, 49]>= .5, 49 ]))
svm_selected <-  svm_selected[complete.cases(svm_selected), ] 
rownames(svm_selected) <- seq(length=nrow(svm_selected))
svm_selected
##                   V1                V2
## 1           ADHD.Q11 0.515184561690949
## 2           ADHD.Q13 0.584780116748058
## 3           ADHD.Q17 0.872794003748846
## 4           ADHD.Q18 0.546374843413306
## 5             MD.Q1c 0.960575599200187
## 6             MD.Q1e 0.537896077431999
## 7                THC 0.868985202967459
## 8 Disorderly.Conduct 0.704780193506212
## 9         NonSubstDx 0.693218272432247
adhd <- adhd_data_4 %>% select(c(svm_selected$V1, 'Suicide' )) 
set.seed(45)
indexes = createDataPartition(adhd$Suicide, p = .85, list = F) # results not in a List
train = adhd[indexes, ]
test = adhd[-indexes, ]
 
wts <- 100 / table(test$Suicide) # class weights = 100 / class counts (note: computed here from the test split)
wts
## 
##         0         1 
##  6.666667 16.666667
radial_svm = svm(Suicide~., data=train, class.weights = wts, C = 13) # note: e1071::svm() takes `cost=`, so C = 13 does not set the cost; the default cost of 1 is used, as the printed model confirms
print(radial_svm)
## 
## Call:
## svm(formula = Suicide ~ ., data = train, class.weights = wts, C = 13)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  112
test$pred = predict(radial_svm, test)
 
confusionMatrix(test$pred, test$Suicide)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 11  5
##          1  4  1
##                                           
##                Accuracy : 0.5714          
##                  95% CI : (0.3402, 0.7818)
##     No Information Rate : 0.7143          
##     P-Value [Acc > NIR] : 0.9501          
##                                           
##                   Kappa : -0.1053         
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.7333          
##             Specificity : 0.1667          
##          Pos Pred Value : 0.6875          
##          Neg Pred Value : 0.2000          
##              Prevalence : 0.7143          
##          Detection Rate : 0.5238          
##    Detection Prevalence : 0.7619          
##       Balanced Accuracy : 0.4500          
##                                           
##        'Positive' Class : 0               
## 
linear_svm = svm(factor(Suicide)~., data=train, kernel = "linear", type = 'C-classification', cost = 10, scale = F, class.weights = wts)
print(linear_svm)
## 
## Call:
## svm(formula = factor(Suicide) ~ ., data = train, kernel = "linear", 
##     type = "C-classification", cost = 10, class.weights = wts, scale = F)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
## 
## Number of Support Vectors:  98
test$pred = predict(linear_svm, test)
 
confusionMatrix(test$pred, test$Suicide)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 12  4
##          1  3  2
##                                           
##                Accuracy : 0.6667          
##                  95% CI : (0.4303, 0.8541)
##     No Information Rate : 0.7143          
##     P-Value [Acc > NIR] : 0.7705          
##                                           
##                   Kappa : 0.1404          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.3333          
##          Pos Pred Value : 0.7500          
##          Neg Pred Value : 0.4000          
##              Prevalence : 0.7143          
##          Detection Rate : 0.5714          
##    Detection Prevalence : 0.7619          
##       Balanced Accuracy : 0.5667          
##                                           
##        'Positive' Class : 0               
## 

The linear SVM performs better, with a higher balanced accuracy (0.567 vs. 0.450 for the radial kernel).
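
Neither kernel above was tuned. As an illustration only (not part of the original analysis), cost and gamma for the radial kernel could be chosen by cross-validation with e1071's tune(), keeping the same class weights; the grid values here are arbitrary:

# Cross-validated grid search over cost and gamma for the radial kernel (10-fold by default)
set.seed(45)
tuned <- e1071::tune(svm, Suicide ~ ., data = train,
                     kernel = "radial", class.weights = wts,
                     ranges = list(cost = c(1, 5, 10, 13), gamma = c(0.01, 0.1, 0.5)))
tuned$best.parameters
# tune() keeps the best model refit on the full training split; score it on the held-out test split
confusionMatrix(predict(tuned$best.model, test), test$Suicide)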

