Data 622 Homework 4: Mental Health Data Modeling
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(skimr) # Used for EDA
#library(corrplot) # For correlation matrix
library(mice) # Multivariate Imputation By Chained Equations##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom 0.7.9 ✓ rsample 0.1.0
## ✓ dials 0.0.10 ✓ tune 0.1.6
## ✓ infer 1.0.0 ✓ workflows 0.2.4
## ✓ modeldata 0.1.1 ✓ workflowsets 0.1.0
## ✓ parsnip 0.1.7 ✓ yardstick 0.0.8
## ✓ recipes 0.1.17
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x foreach::accumulate() masks purrr::accumulate()
## x scales::discard() masks purrr::discard()
## x mice::filter() masks dplyr::filter(), stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x kableExtra::group_rows() masks dplyr::group_rows()
## x dplyr::lag() masks stats::lag()
## x caret::lift() masks purrr::lift()
## x yardstick::precision() masks caret::precision()
## x recipes::prepare() masks VIM::prepare()
## x yardstick::recall() masks caret::recall()
## x yardstick::sensitivity() masks caret::sensitivity()
## x yardstick::spec() masks readr::spec()
## x yardstick::specificity() masks caret::specificity()
## x recipes::step() masks stats::step()
## x foreach::when() masks purrr::when()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
##
## Attaching package: 'e1071'
## The following object is masked from 'package:tune':
##
## tune
## The following object is masked from 'package:rsample':
##
## permutations
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:parsnip':
##
## translate
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(Boruta)
## Global options
# ---------------------------------
options(max.print="108")
opts_knit$set(width=31)Loading data
# getwd()
setwd("~/Data 622/repos/data622_fa2021/hw4")
adhd_data <- readxl::read_excel('ADHD_data.xlsx', sheet = "Data", .name_repair = "universal", na = "")## New names:
## * `ADHD Q1` -> ADHD.Q1
## * `ADHD Q2` -> ADHD.Q2
## * `ADHD Q3` -> ADHD.Q3
## * `ADHD Q4` -> ADHD.Q4
## * `ADHD Q5` -> ADHD.Q5
## * ...
## tibble [175 × 54] (S3: tbl_df/tbl/data.frame)
## $ Initial : chr [1:175] "JA" "LA" "MD" "RD" ...
## $ Age : num [1:175] 24 48 51 43 34 39 41 48 44 27 ...
## $ Sex : num [1:175] 1 2 2 1 1 2 2 1 2 2 ...
## $ Race : num [1:175] 1 1 1 1 1 1 1 1 1 1 ...
## $ ADHD.Q1 : num [1:175] 1 3 2 3 4 2 2 2 3 2 ...
## $ ADHD.Q2 : num [1:175] 1 3 1 3 4 3 2 4 3 3 ...
## $ ADHD.Q3 : num [1:175] 4 4 2 2 2 1 1 3 3 4 ...
## $ ADHD.Q4 : num [1:175] 2 4 1 2 4 4 3 4 4 4 ...
## $ ADHD.Q5 : num [1:175] 3 5 3 4 4 3 4 3 4 4 ...
## $ ADHD.Q6 : num [1:175] 1 2 3 3 2 2 4 3 3 3 ...
## $ ADHD.Q7 : num [1:175] 1 2 3 2 3 3 2 1 3 4 ...
## $ ADHD.Q8 : num [1:175] 3 3 2 4 4 4 3 1 4 3 ...
## $ ADHD.Q9 : num [1:175] 2 2 0 4 4 4 3 4 3 2 ...
## $ ADHD.Q10 : num [1:175] 4 4 1 2 2 2 4 2 4 4 ...
## $ ADHD.Q11 : num [1:175] 2 1 2 3 4 4 3 4 3 3 ...
## $ ADHD.Q12 : num [1:175] 4 4 0 1 1 2 1 1 0 1 ...
## $ ADHD.Q13 : num [1:175] 1 2 2 3 3 4 4 4 4 3 ...
## $ ADHD.Q14 : num [1:175] 0 4 2 3 2 4 4 3 3 4 ...
## $ ADHD.Q15 : num [1:175] 3 4 3 1 1 3 4 0 3 4 ...
## $ ADHD.Q16 : num [1:175] 1 3 2 2 2 4 4 0 2 4 ...
## $ ADHD.Q17 : num [1:175] 3 1 1 1 1 3 2 1 4 2 ...
## $ ADHD.Q18 : num [1:175] 4 4 1 2 1 3 4 1 3 2 ...
## $ ADHD.Total : num [1:175] 40 55 31 45 48 55 54 41 56 56 ...
## $ MD.Q1a : num [1:175] 1 1 0 1 0 0 1 0 1 1 ...
## $ MD.Q1b : num [1:175] 1 1 0 1 1 1 1 0 1 1 ...
## $ MD.Q1c : num [1:175] 1 1 0 0 0 0 0 0 0 0 ...
## $ MD.Q1d : num [1:175] 1 1 0 0 1 1 0 0 1 0 ...
## $ MD.Q1e : num [1:175] 0 1 1 1 0 1 1 0 1 1 ...
## $ MD.Q1f : num [1:175] 1 1 1 1 1 1 1 1 1 0 ...
## $ MD.Q1g : num [1:175] 1 1 1 1 1 1 0 1 1 1 ...
## $ MD.Q1h : num [1:175] 1 1 0 1 0 1 0 0 0 0 ...
## $ MD.Q1i : num [1:175] 1 1 0 1 0 1 0 0 0 0 ...
## $ MD.Q1j : num [1:175] 1 0 0 0 0 1 0 0 0 1 ...
## $ MD.Q1k : num [1:175] 1 0 0 0 0 1 0 0 0 1 ...
## $ MD.Q1L : num [1:175] 0 1 0 1 0 1 1 1 1 1 ...
## $ MD.Q1m : num [1:175] 1 0 0 1 0 0 0 0 1 1 ...
## $ MD.Q2 : num [1:175] 1 1 0 1 1 1 1 1 1 1 ...
## $ MD.Q3 : num [1:175] 3 3 2 3 2 3 3 3 3 2 ...
## $ MD.TOTAL : num [1:175] 15 14 5 13 7 14 9 7 12 11 ...
## $ Alcohol : num [1:175] 1 0 0 1 1 1 3 0 1 0 ...
## $ THC : num [1:175] 1 0 0 1 1 0 3 0 0 3 ...
## $ Cocaine : num [1:175] 1 0 0 1 0 0 1 0 0 0 ...
## $ Stimulants : num [1:175] 0 0 0 1 0 0 1 0 0 0 ...
## $ Sedative.hypnotics: num [1:175] 0 0 0 0 0 0 1 0 0 0 ...
## $ Opioids : num [1:175] 0 0 0 0 0 0 0 0 1 0 ...
## $ Court.order : num [1:175] 1 0 0 0 1 0 0 0 0 0 ...
## $ Education : num [1:175] 11 14 12 12 9 11 12 16 12 9 ...
## $ Hx.of.Violence : num [1:175] 0 0 0 0 1 0 1 1 1 0 ...
## $ Disorderly.Conduct: num [1:175] 1 0 0 0 1 1 1 1 1 1 ...
## $ Suicide : num [1:175] 1 1 0 1 1 1 0 0 0 0 ...
## $ Abuse : num [1:175] 0 4 6 7 0 2 4 0 0 2 ...
## $ Non.subst.Dx : num [1:175] 2 1 2 2 2 0 1 2 1 1 ...
## $ Subst.Dx : num [1:175] 0 0 0 0 0 0 0 1 0 2 ...
## $ Psych.meds. : num [1:175] 2 1 1 2 0 0 1 2 1 0 ...
# loan_raw <- read.csv('https://raw.githubusercontent.com/metis-macys-66898/data622_fa2021/main/hw3/data/Loan_approval.csv', header = TRUE, na.strings = " ")
# loan_raw[loan_raw==""] <- NA
# loan_raw <- loan_raw %>% mutate_if(is.character, factor)
# loan <- loan_rawData Processing Steps
I employed a couple strategies into transforming the data before we can apply models.
Imputing only when necessary. Imputing by applying the right method for the right variable type.
Make sure to make the fields into factors when they are categorical variables.
aggr_plot <- aggr(adhd_data, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))##
## Variables sorted by number of missings:
## Variable Count
## Psych.meds. 0.67428571
## Subst.Dx 0.13142857
## Non.subst.Dx 0.12571429
## Abuse 0.08000000
## Suicide 0.07428571
## Hx.of.Violence 0.06285714
## Disorderly.Conduct 0.06285714
## Education 0.05142857
## Court.order 0.02857143
## Alcohol 0.02285714
## THC 0.02285714
## Cocaine 0.02285714
## Stimulants 0.02285714
## Sedative.hypnotics 0.02285714
## Opioids 0.02285714
## Initial 0.00000000
## Age 0.00000000
## Sex 0.00000000
## Race 0.00000000
## ADHD.Q1 0.00000000
## ADHD.Q2 0.00000000
## ADHD.Q3 0.00000000
## ADHD.Q4 0.00000000
## ADHD.Q5 0.00000000
## ADHD.Q6 0.00000000
## ADHD.Q7 0.00000000
## ADHD.Q8 0.00000000
## ADHD.Q9 0.00000000
## ADHD.Q10 0.00000000
## ADHD.Q11 0.00000000
## ADHD.Q12 0.00000000
## ADHD.Q13 0.00000000
## ADHD.Q14 0.00000000
## ADHD.Q15 0.00000000
## ADHD.Q16 0.00000000
## ADHD.Q17 0.00000000
## ADHD.Q18 0.00000000
## ADHD.Total 0.00000000
## MD.Q1a 0.00000000
## MD.Q1b 0.00000000
## MD.Q1c 0.00000000
## MD.Q1d 0.00000000
## MD.Q1e 0.00000000
## MD.Q1f 0.00000000
## MD.Q1g 0.00000000
## MD.Q1h 0.00000000
## MD.Q1i 0.00000000
## MD.Q1j 0.00000000
## MD.Q1k 0.00000000
## MD.Q1L 0.00000000
## MD.Q1m 0.00000000
## MD.Q2 0.00000000
## MD.Q3 0.00000000
## MD.TOTAL 0.00000000
Removing Initial as it provides no values to any models. Also, I removed Psych.meds. as it has 67% missing.
adhd_data_pre_imp <- adhd_data %>% select(-c('Initial','Psych.meds.'))
skim(adhd_data_pre_imp) %>% dplyr::filter( n_missing > 0 ) | Name | adhd_data_pre_imp |
| Number of rows | 175 |
| Number of columns | 52 |
| _______________________ | |
| Column type frequency: | |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Alcohol | 4 | 0.98 | 1.35 | 1.39 | 0 | 0 | 1 | 3.0 | 3 | ▇▂▁▁▆ |
| THC | 4 | 0.98 | 0.81 | 1.27 | 0 | 0 | 0 | 1.5 | 3 | ▇▁▁▁▃ |
| Cocaine | 4 | 0.98 | 1.09 | 1.39 | 0 | 0 | 0 | 3.0 | 3 | ▇▁▁▁▅ |
| Stimulants | 4 | 0.98 | 0.12 | 0.53 | 0 | 0 | 0 | 0.0 | 3 | ▇▁▁▁▁ |
| Sedative.hypnotics | 4 | 0.98 | 0.12 | 0.54 | 0 | 0 | 0 | 0.0 | 3 | ▇▁▁▁▁ |
| Opioids | 4 | 0.98 | 0.39 | 0.99 | 0 | 0 | 0 | 0.0 | 3 | ▇▁▁▁▁ |
| Court.order | 5 | 0.97 | 0.09 | 0.28 | 0 | 0 | 0 | 0.0 | 1 | ▇▁▁▁▁ |
| Education | 9 | 0.95 | 11.90 | 2.17 | 6 | 11 | 12 | 13.0 | 19 | ▁▅▇▂▁ |
| Hx.of.Violence | 11 | 0.94 | 0.24 | 0.43 | 0 | 0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
| Disorderly.Conduct | 11 | 0.94 | 0.73 | 0.45 | 0 | 0 | 1 | 1.0 | 1 | ▃▁▁▁▇ |
| Suicide | 13 | 0.93 | 0.30 | 0.46 | 0 | 0 | 0 | 1.0 | 1 | ▇▁▁▁▃ |
| Abuse | 14 | 0.92 | 1.33 | 2.12 | 0 | 0 | 0 | 2.0 | 7 | ▇▂▁▁▁ |
| Non.subst.Dx | 22 | 0.87 | 0.44 | 0.68 | 0 | 0 | 0 | 1.0 | 2 | ▇▁▃▁▁ |
| Subst.Dx | 23 | 0.87 | 1.14 | 0.93 | 0 | 0 | 1 | 2.0 | 3 | ▆▇▁▅▂ |
adhd_data_pre_imp <- adhd_data_pre_imp %>% mutate(NonSubstDx = as.factor(Non.subst.Dx) ,
SubstDx = as.factor(Subst.Dx) ,
Abuse = factor(Abuse, ordered = TRUE),
Alcohol = as.factor(Alcohol)
)
adhd_data_pre_imp <- adhd_data_pre_imp %>% mutate_if(is.numeric, factor)
skim(adhd_data_pre_imp)| Name | adhd_data_pre_imp |
| Number of rows | 175 |
| Number of columns | 54 |
| _______________________ | |
| Column type frequency: | |
| factor | 54 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Age | 0 | 1.00 | FALSE | 42 | 44: 12, 51: 9, 26: 8, 33: 7 |
| Sex | 0 | 1.00 | FALSE | 2 | 1: 99, 2: 76 |
| Race | 0 | 1.00 | FALSE | 4 | 2: 100, 1: 72, 6: 2, 3: 1 |
| ADHD.Q1 | 0 | 1.00 | FALSE | 5 | 2: 44, 1: 43, 0: 39, 3: 30 |
| ADHD.Q2 | 0 | 1.00 | FALSE | 5 | 2: 47, 1: 46, 3: 33, 0: 25 |
| ADHD.Q3 | 0 | 1.00 | FALSE | 5 | 1: 46, 2: 46, 3: 32, 0: 26 |
| ADHD.Q4 | 0 | 1.00 | FALSE | 5 | 2: 50, 4: 36, 1: 31, 3: 31 |
| ADHD.Q5 | 0 | 1.00 | FALSE | 6 | 3: 47, 4: 41, 0: 33, 2: 32 |
| ADHD.Q6 | 0 | 1.00 | FALSE | 5 | 2: 45, 3: 45, 0: 36, 1: 29 |
| ADHD.Q7 | 0 | 1.00 | FALSE | 5 | 2: 54, 1: 53, 3: 25, 0: 22 |
| ADHD.Q8 | 0 | 1.00 | FALSE | 5 | 3: 42, 1: 40, 2: 40, 4: 32 |
| ADHD.Q9 | 0 | 1.00 | FALSE | 5 | 1: 43, 3: 41, 2: 36, 0: 31 |
| ADHD.Q10 | 0 | 1.00 | FALSE | 5 | 2: 49, 1: 46, 3: 33, 4: 32 |
| ADHD.Q11 | 0 | 1.00 | FALSE | 5 | 2: 48, 3: 43, 4: 35, 1: 33 |
| ADHD.Q12 | 0 | 1.00 | FALSE | 5 | 0: 55, 1: 55, 2: 37, 3: 15 |
| ADHD.Q13 | 0 | 1.00 | FALSE | 5 | 3: 47, 2: 46, 4: 38, 1: 29 |
| ADHD.Q14 | 0 | 1.00 | FALSE | 5 | 3: 47, 2: 40, 4: 37, 0: 27 |
| ADHD.Q15 | 0 | 1.00 | FALSE | 5 | 0: 50, 1: 39, 2: 35, 3: 27 |
| ADHD.Q16 | 0 | 1.00 | FALSE | 5 | 1: 49, 0: 40, 2: 39, 4: 30 |
| ADHD.Q17 | 0 | 1.00 | FALSE | 5 | 0: 49, 2: 46, 1: 41, 3: 22 |
| ADHD.Q18 | 0 | 1.00 | FALSE | 5 | 1: 52, 0: 49, 2: 35, 3: 20 |
| ADHD.Total | 0 | 1.00 | FALSE | 62 | 17: 8, 31: 7, 32: 7, 24: 6 |
| MD.Q1a | 0 | 1.00 | FALSE | 2 | 1: 96, 0: 79 |
| MD.Q1b | 0 | 1.00 | FALSE | 2 | 1: 100, 0: 75 |
| MD.Q1c | 0 | 1.00 | FALSE | 2 | 1: 95, 0: 80 |
| MD.Q1d | 0 | 1.00 | FALSE | 2 | 1: 102, 0: 73 |
| MD.Q1e | 0 | 1.00 | FALSE | 2 | 1: 97, 0: 78 |
| MD.Q1f | 0 | 1.00 | FALSE | 2 | 1: 122, 0: 53 |
| MD.Q1g | 0 | 1.00 | FALSE | 2 | 1: 126, 0: 49 |
| MD.Q1h | 0 | 1.00 | FALSE | 2 | 1: 98, 0: 77 |
| MD.Q1i | 0 | 1.00 | FALSE | 2 | 1: 103, 0: 72 |
| MD.Q1j | 0 | 1.00 | FALSE | 2 | 0: 107, 1: 68 |
| MD.Q1k | 0 | 1.00 | FALSE | 2 | 0: 90, 1: 85 |
| MD.Q1L | 0 | 1.00 | FALSE | 2 | 1: 102, 0: 73 |
| MD.Q1m | 0 | 1.00 | FALSE | 2 | 0: 89, 1: 86 |
| MD.Q2 | 0 | 1.00 | FALSE | 2 | 1: 126, 0: 49 |
| MD.Q3 | 0 | 1.00 | FALSE | 4 | 3: 76, 2: 49, 0: 25, 1: 25 |
| MD.TOTAL | 0 | 1.00 | FALSE | 18 | 11: 18, 15: 14, 10: 13, 13: 13 |
| Alcohol | 4 | 0.98 | FALSE | 4 | 0: 80, 3: 66, 1: 18, 2: 7 |
| THC | 4 | 0.98 | FALSE | 4 | 0: 116, 3: 40, 1: 12, 2: 3 |
| Cocaine | 4 | 0.98 | FALSE | 4 | 0: 101, 3: 56, 1: 9, 2: 5 |
| Stimulants | 4 | 0.98 | FALSE | 3 | 0: 160, 1: 6, 3: 5 |
| Sedative.hypnotics | 4 | 0.98 | FALSE | 4 | 0: 161, 3: 5, 1: 4, 2: 1 |
| Opioids | 4 | 0.98 | FALSE | 3 | 0: 146, 3: 21, 1: 4 |
| Court.order | 5 | 0.97 | FALSE | 2 | 0: 155, 1: 15 |
| Education | 9 | 0.95 | FALSE | 14 | 12: 67, 11: 23, 13: 15, 14: 14 |
| Hx.of.Violence | 11 | 0.94 | FALSE | 2 | 0: 124, 1: 40 |
| Disorderly.Conduct | 11 | 0.94 | FALSE | 2 | 1: 119, 0: 45 |
| Suicide | 13 | 0.93 | FALSE | 2 | 0: 113, 1: 49 |
| Abuse | 14 | 0.92 | TRUE | 8 | 0: 101, 2: 20, 5: 10, 1: 8 |
| Non.subst.Dx | 22 | 0.87 | FALSE | 3 | 0: 102, 1: 35, 2: 16 |
| Subst.Dx | 23 | 0.87 | FALSE | 4 | 1: 61, 0: 42, 2: 35, 3: 14 |
| NonSubstDx | 22 | 0.87 | FALSE | 3 | 0: 102, 1: 35, 2: 16 |
| SubstDx | 23 | 0.87 | FALSE | 4 | 1: 61, 0: 42, 2: 35, 3: 14 |
adhd_data_pre_imp1 <- adhd_data_pre_imp %>% mutate(Education = as.numeric(Education) )
skim(adhd_data_pre_imp1)| Name | adhd_data_pre_imp1 |
| Number of rows | 175 |
| Number of columns | 54 |
| _______________________ | |
| Column type frequency: | |
| factor | 53 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Age | 0 | 1.00 | FALSE | 42 | 44: 12, 51: 9, 26: 8, 33: 7 |
| Sex | 0 | 1.00 | FALSE | 2 | 1: 99, 2: 76 |
| Race | 0 | 1.00 | FALSE | 4 | 2: 100, 1: 72, 6: 2, 3: 1 |
| ADHD.Q1 | 0 | 1.00 | FALSE | 5 | 2: 44, 1: 43, 0: 39, 3: 30 |
| ADHD.Q2 | 0 | 1.00 | FALSE | 5 | 2: 47, 1: 46, 3: 33, 0: 25 |
| ADHD.Q3 | 0 | 1.00 | FALSE | 5 | 1: 46, 2: 46, 3: 32, 0: 26 |
| ADHD.Q4 | 0 | 1.00 | FALSE | 5 | 2: 50, 4: 36, 1: 31, 3: 31 |
| ADHD.Q5 | 0 | 1.00 | FALSE | 6 | 3: 47, 4: 41, 0: 33, 2: 32 |
| ADHD.Q6 | 0 | 1.00 | FALSE | 5 | 2: 45, 3: 45, 0: 36, 1: 29 |
| ADHD.Q7 | 0 | 1.00 | FALSE | 5 | 2: 54, 1: 53, 3: 25, 0: 22 |
| ADHD.Q8 | 0 | 1.00 | FALSE | 5 | 3: 42, 1: 40, 2: 40, 4: 32 |
| ADHD.Q9 | 0 | 1.00 | FALSE | 5 | 1: 43, 3: 41, 2: 36, 0: 31 |
| ADHD.Q10 | 0 | 1.00 | FALSE | 5 | 2: 49, 1: 46, 3: 33, 4: 32 |
| ADHD.Q11 | 0 | 1.00 | FALSE | 5 | 2: 48, 3: 43, 4: 35, 1: 33 |
| ADHD.Q12 | 0 | 1.00 | FALSE | 5 | 0: 55, 1: 55, 2: 37, 3: 15 |
| ADHD.Q13 | 0 | 1.00 | FALSE | 5 | 3: 47, 2: 46, 4: 38, 1: 29 |
| ADHD.Q14 | 0 | 1.00 | FALSE | 5 | 3: 47, 2: 40, 4: 37, 0: 27 |
| ADHD.Q15 | 0 | 1.00 | FALSE | 5 | 0: 50, 1: 39, 2: 35, 3: 27 |
| ADHD.Q16 | 0 | 1.00 | FALSE | 5 | 1: 49, 0: 40, 2: 39, 4: 30 |
| ADHD.Q17 | 0 | 1.00 | FALSE | 5 | 0: 49, 2: 46, 1: 41, 3: 22 |
| ADHD.Q18 | 0 | 1.00 | FALSE | 5 | 1: 52, 0: 49, 2: 35, 3: 20 |
| ADHD.Total | 0 | 1.00 | FALSE | 62 | 17: 8, 31: 7, 32: 7, 24: 6 |
| MD.Q1a | 0 | 1.00 | FALSE | 2 | 1: 96, 0: 79 |
| MD.Q1b | 0 | 1.00 | FALSE | 2 | 1: 100, 0: 75 |
| MD.Q1c | 0 | 1.00 | FALSE | 2 | 1: 95, 0: 80 |
| MD.Q1d | 0 | 1.00 | FALSE | 2 | 1: 102, 0: 73 |
| MD.Q1e | 0 | 1.00 | FALSE | 2 | 1: 97, 0: 78 |
| MD.Q1f | 0 | 1.00 | FALSE | 2 | 1: 122, 0: 53 |
| MD.Q1g | 0 | 1.00 | FALSE | 2 | 1: 126, 0: 49 |
| MD.Q1h | 0 | 1.00 | FALSE | 2 | 1: 98, 0: 77 |
| MD.Q1i | 0 | 1.00 | FALSE | 2 | 1: 103, 0: 72 |
| MD.Q1j | 0 | 1.00 | FALSE | 2 | 0: 107, 1: 68 |
| MD.Q1k | 0 | 1.00 | FALSE | 2 | 0: 90, 1: 85 |
| MD.Q1L | 0 | 1.00 | FALSE | 2 | 1: 102, 0: 73 |
| MD.Q1m | 0 | 1.00 | FALSE | 2 | 0: 89, 1: 86 |
| MD.Q2 | 0 | 1.00 | FALSE | 2 | 1: 126, 0: 49 |
| MD.Q3 | 0 | 1.00 | FALSE | 4 | 3: 76, 2: 49, 0: 25, 1: 25 |
| MD.TOTAL | 0 | 1.00 | FALSE | 18 | 11: 18, 15: 14, 10: 13, 13: 13 |
| Alcohol | 4 | 0.98 | FALSE | 4 | 0: 80, 3: 66, 1: 18, 2: 7 |
| THC | 4 | 0.98 | FALSE | 4 | 0: 116, 3: 40, 1: 12, 2: 3 |
| Cocaine | 4 | 0.98 | FALSE | 4 | 0: 101, 3: 56, 1: 9, 2: 5 |
| Stimulants | 4 | 0.98 | FALSE | 3 | 0: 160, 1: 6, 3: 5 |
| Sedative.hypnotics | 4 | 0.98 | FALSE | 4 | 0: 161, 3: 5, 1: 4, 2: 1 |
| Opioids | 4 | 0.98 | FALSE | 3 | 0: 146, 3: 21, 1: 4 |
| Court.order | 5 | 0.97 | FALSE | 2 | 0: 155, 1: 15 |
| Hx.of.Violence | 11 | 0.94 | FALSE | 2 | 0: 124, 1: 40 |
| Disorderly.Conduct | 11 | 0.94 | FALSE | 2 | 1: 119, 0: 45 |
| Suicide | 13 | 0.93 | FALSE | 2 | 0: 113, 1: 49 |
| Abuse | 14 | 0.92 | TRUE | 8 | 0: 101, 2: 20, 5: 10, 1: 8 |
| Non.subst.Dx | 22 | 0.87 | FALSE | 3 | 0: 102, 1: 35, 2: 16 |
| Subst.Dx | 23 | 0.87 | FALSE | 4 | 1: 61, 0: 42, 2: 35, 3: 14 |
| NonSubstDx | 22 | 0.87 | FALSE | 3 | 0: 102, 1: 35, 2: 16 |
| SubstDx | 23 | 0.87 | FALSE | 4 | 1: 61, 0: 42, 2: 35, 3: 14 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Education | 9 | 0.95 | 6.9 | 2.17 | 1 | 6 | 7 | 8 | 14 | ▁▅▇▂▁ |
Imputation
More specifically, I separated the variables with missing values into the following categories:
Categorical Variables with more than 2 levels: Subst.Dx, NonSubstDx, Opioids, Sedative.hypnotics, Stimulants, Cocaine, THC, Alcohol - polyreg
Categorical Variables with 2 levels: Disorderly.Conduct, Hx.of.Violence, Court.order - logreg
Discrete variable (ordered) with more than 2 levels: Abuse - polr
Continuous Variables: Education -
pmm-> norm
## Warning: Number of logged events: 2
meth <- init$method
predM <- init$predictorMatrix
meth[c('Education')] <- 'pmm'
meth[c('Disorderly.Conduct', 'Hx.of.Violence', 'Court.order')] <- 'logreg'
meth[c('Subst.Dx', 'NonSubstDx' , 'Opioids', 'Sedative.hypnotics', 'Stimulants', 'Cocaine', 'THC', 'Alcohol')] <- 'polyreg'
meth[c('Abuse')] = 'polr'
meth[c('Age','Sex','Race','ADHD.Q1','ADHD.Q2','ADHD.Q3','ADHD.Q4','ADHD.Q5','ADHD.Q6','ADHD.Q7','ADHD.Q8','ADHD.Q9','ADHD.Q10','ADHD.Q11','ADHD.Q12','ADHD.Q13','ADHD.Q14','ADHD.Q15','ADHD.Q16','ADHD.Q17','ADHD.Q18','ADHD.Total','MD.Q1a','MD.Q1b','MD.Q1c','MD.Q1d','MD.Q1e','MD.Q1f','MD.Q1g','MD.Q1h','MD.Q1i','MD.Q1j','MD.Q1k','MD.Q1L','MD.Q1m','MD.Q2','MD.Q3','MD.TOTAL','Suicide')] = ''
# adhd_data_imp1 <- mice(adhd_data_pre_imp1, method=meth, predictorMatrix=predM, seed=501)
# adhd_data_imp1 <- parlmice(adhd_data_pre_imp1, method=meth, predictorMatrix=predM, cluster.seed=501, m = 4, n.core = 4, n.imp.core = 100)
# no_cores <- detectCores() - 1
#
# cl<-makePSOCKcluster(no_cores)
#
# registerDoParallel(cl)
#
# start.time<-proc.time()
#
# adhd_data_imp1 <- mice.par(adhd_data_pre_imp1, method=meth, predictorMatrix=predM, seed=301, m = 5)
#
# stop.time<-proc.time()
#
# run.time<-stop.time -start.time
#
# print(run.time)
#
# stopCluster(cl)
# user system elapsed
# 1.826 4.303 264.775 # abandoned this method as it's not always consistent
# cl<-makePSOCKcluster(no_cores)
# registerDoParallel(cl)
#
# start.time<-proc.time()
#
# adhd_data_imp1 <- parlmice(adhd_data_pre_imp1, method=meth, predictorMatrix=predM, cluster.seed=501, m = 4, n.core = 4, n.imp.core = 1)
#
# stop.time<-proc.time()
#
# run.time<-stop.time -start.time
#
# print(run.time)
#
# stopCluster(cl)Examining the imputations
Examining the imputations for Education. As the 50-th percentile before was 12, we picked an impute that is exactly that. Impute #4 fits the bill.
# str(adhd_data_imp1)
# skim(adhd_data_imp1) %>% dplyr::filter( n_missing > 0 )
#
#Eduation
# adhd_data_imp1$imp$Education
# adhd_data_pre_imp1[140:142,]
#
#
# #Age
# adhd_data_imp1$imp$Alcohol# skim(adhd_data_2) %>% dplyr::filter( n_missing > 0 )
# # # saving a R data frame
# saveRDS(adhd_data_2, "adhd_data_2.rds")Ended up dropping 29 records as even there are other means of imputing the missing records manually. I just don’t think it’s trustworthy when advanced algorithms did not end up imputing them. The need to have complete cases, except for the suicide variable for section of clustering and PCA where suicide variable is not the response variable, or variable of interest, is the ultimate reason why I had to drop any records with even a missing field from the dataset.
# loading rds object
# adhd_data_2 <- readRDS("adhd_data_2.rds")
# adhd_data_imp1$imp$Suicide
# 41 NA NA NA NA NA
# 49 NA NA NA NA NA
# 53 NA NA NA NA NA
# 67 NA NA NA NA NA
# 73 NA NA NA NA NA
# 106 NA NA NA NA NA
# 117 NA NA NA NA NA
# 122 NA NA NA NA NA
# 129 NA NA NA NA NA
# 131
# adhd_data_pre_imp1[complete.cases(adhd_data_pre_imp1[ , -49]),]
# saved 3 records
# adhd_data_3 <- adhd_data_2[complete.cases(adhd_data_2[ , -49]),]Count for each Education / suicide attempt
ggplot(adhd_data_3, aes(x = Education, fill = Suicide)) +
geom_bar(alpha = 0.8) +
scale_fill_manual(values = c("darkorange", "purple", "cyan4"),
guide = F) +
theme_minimal() +
facet_wrap(~Suicide, ncol = 1) +
coord_flip()## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
Count for each Subst.Dx / suicide attempt
ggplot(adhd_data_3, aes(x = Subst.Dx, fill = Suicide)) +
geom_bar(alpha = 0.8) +
scale_fill_manual(values = c("darkorange", "purple", "cyan4"),
guide = F) +
theme_minimal() +
facet_wrap(~Suicide, ncol = 1) +
coord_flip()## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
Count for each Non.subst.Dx / suicide attempt
ggplot(adhd_data_3, aes(x = Non.subst.Dx, fill = Suicide)) +
geom_bar(alpha = 0.8) +
scale_fill_manual(values = c("darkorange", "purple", "cyan4"),
guide = F) +
theme_minimal() +
facet_wrap(~Suicide, ncol = 1) +
coord_flip()## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
Count for each Abuse / Suicide
ggplot(adhd_data_3, aes(x = Abuse, fill = Suicide)) +
geom_bar(alpha = 0.8) +
scale_fill_manual(values = c("darkorange", "purple", "cyan4", "black"),
guide = F) +
theme_minimal() +
facet_wrap(~Suicide, ncol = 1) +
coord_flip()## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
Clustering Method
Principal Component Analysis
Xtreme Gradient Boosting (XGBoost)
Removing the lone record that Suicide is NA.
## [1] 146 54
## [1] 145 54
Splitting the train and test dataset
set.seed(108)
sample_size <- floor(nrow(adhd_data_4)*0.8)
indices <- sample(1:nrow(adhd_data_4),sample_size)
data_train <- adhd_data_4[c(indices),]
data_test <- adhd_data_4[-c(indices),]One-hot encoding of features, which is a requirement for XGBoost. At the end, you wanted to make sure the number of columns in data_train and data_test are the same.
# do the 1-hot encoding for the data_train
Suicide <- data_train[, 49]
dummy <- dummyVars(" ~ . ", data = data_train [, -49] )
newdata <- data.frame(predict (dummy, newdata = data_train[, -49]))
data_train <- cbind (newdata, Suicide)
options(max.print="309")
colnames(data_train)## [1] "Age.18" "Age.19" "Age.20"
## [4] "Age.21" "Age.22" "Age.23"
## [7] "Age.24" "Age.25" "Age.26"
## [10] "Age.27" "Age.28" "Age.29"
## [13] "Age.30" "Age.31" "Age.32"
## [16] "Age.33" "Age.34" "Age.35"
## [19] "Age.36" "Age.37" "Age.38"
## [22] "Age.39" "Age.40" "Age.41"
## [25] "Age.42" "Age.43" "Age.44"
## [28] "Age.45" "Age.46" "Age.47"
## [31] "Age.48" "Age.49" "Age.50"
## [34] "Age.51" "Age.52" "Age.53"
## [37] "Age.54" "Age.55" "Age.56"
## [40] "Age.57" "Age.61" "Age.69"
## [43] "Sex.1" "Sex.2" "Race.1"
## [46] "Race.2" "Race.3" "Race.6"
## [49] "ADHD.Q1.0" "ADHD.Q1.1" "ADHD.Q1.2"
## [52] "ADHD.Q1.3" "ADHD.Q1.4" "ADHD.Q2.0"
## [55] "ADHD.Q2.1" "ADHD.Q2.2" "ADHD.Q2.3"
## [58] "ADHD.Q2.4" "ADHD.Q3.0" "ADHD.Q3.1"
## [61] "ADHD.Q3.2" "ADHD.Q3.3" "ADHD.Q3.4"
## [64] "ADHD.Q4.0" "ADHD.Q4.1" "ADHD.Q4.2"
## [67] "ADHD.Q4.3" "ADHD.Q4.4" "ADHD.Q5.0"
## [70] "ADHD.Q5.1" "ADHD.Q5.2" "ADHD.Q5.3"
## [73] "ADHD.Q5.4" "ADHD.Q5.5" "ADHD.Q6.0"
## [76] "ADHD.Q6.1" "ADHD.Q6.2" "ADHD.Q6.3"
## [79] "ADHD.Q6.4" "ADHD.Q7.0" "ADHD.Q7.1"
## [82] "ADHD.Q7.2" "ADHD.Q7.3" "ADHD.Q7.4"
## [85] "ADHD.Q8.0" "ADHD.Q8.1" "ADHD.Q8.2"
## [88] "ADHD.Q8.3" "ADHD.Q8.4" "ADHD.Q9.0"
## [91] "ADHD.Q9.1" "ADHD.Q9.2" "ADHD.Q9.3"
## [94] "ADHD.Q9.4" "ADHD.Q10.0" "ADHD.Q10.1"
## [97] "ADHD.Q10.2" "ADHD.Q10.3" "ADHD.Q10.4"
## [100] "ADHD.Q11.0" "ADHD.Q11.1" "ADHD.Q11.2"
## [103] "ADHD.Q11.3" "ADHD.Q11.4" "ADHD.Q12.0"
## [106] "ADHD.Q12.1" "ADHD.Q12.2" "ADHD.Q12.3"
## [109] "ADHD.Q12.4" "ADHD.Q13.0" "ADHD.Q13.1"
## [112] "ADHD.Q13.2" "ADHD.Q13.3" "ADHD.Q13.4"
## [115] "ADHD.Q14.0" "ADHD.Q14.1" "ADHD.Q14.2"
## [118] "ADHD.Q14.3" "ADHD.Q14.4" "ADHD.Q15.0"
## [121] "ADHD.Q15.1" "ADHD.Q15.2" "ADHD.Q15.3"
## [124] "ADHD.Q15.4" "ADHD.Q16.0" "ADHD.Q16.1"
## [127] "ADHD.Q16.2" "ADHD.Q16.3" "ADHD.Q16.4"
## [130] "ADHD.Q17.0" "ADHD.Q17.1" "ADHD.Q17.2"
## [133] "ADHD.Q17.3" "ADHD.Q17.4" "ADHD.Q18.0"
## [136] "ADHD.Q18.1" "ADHD.Q18.2" "ADHD.Q18.3"
## [139] "ADHD.Q18.4" "ADHD.Total.0" "ADHD.Total.1"
## [142] "ADHD.Total.3" "ADHD.Total.5" "ADHD.Total.6"
## [145] "ADHD.Total.7" "ADHD.Total.8" "ADHD.Total.9"
## [148] "ADHD.Total.10" "ADHD.Total.11" "ADHD.Total.12"
## [151] "ADHD.Total.13" "ADHD.Total.14" "ADHD.Total.16"
## [154] "ADHD.Total.17" "ADHD.Total.18" "ADHD.Total.19"
## [157] "ADHD.Total.20" "ADHD.Total.21" "ADHD.Total.23"
## [160] "ADHD.Total.24" "ADHD.Total.25" "ADHD.Total.26"
## [163] "ADHD.Total.27" "ADHD.Total.28" "ADHD.Total.29"
## [166] "ADHD.Total.30" "ADHD.Total.31" "ADHD.Total.32"
## [169] "ADHD.Total.33" "ADHD.Total.34" "ADHD.Total.35"
## [172] "ADHD.Total.36" "ADHD.Total.37" "ADHD.Total.38"
## [175] "ADHD.Total.39" "ADHD.Total.40" "ADHD.Total.41"
## [178] "ADHD.Total.42" "ADHD.Total.43" "ADHD.Total.44"
## [181] "ADHD.Total.45" "ADHD.Total.46" "ADHD.Total.47"
## [184] "ADHD.Total.48" "ADHD.Total.49" "ADHD.Total.50"
## [187] "ADHD.Total.51" "ADHD.Total.52" "ADHD.Total.53"
## [190] "ADHD.Total.54" "ADHD.Total.55" "ADHD.Total.56"
## [193] "ADHD.Total.57" "ADHD.Total.58" "ADHD.Total.62"
## [196] "ADHD.Total.63" "ADHD.Total.65" "ADHD.Total.67"
## [199] "ADHD.Total.69" "ADHD.Total.71" "ADHD.Total.72"
## [202] "MD.Q1a.0" "MD.Q1a.1" "MD.Q1b.0"
## [205] "MD.Q1b.1" "MD.Q1c.0" "MD.Q1c.1"
## [208] "MD.Q1d.0" "MD.Q1d.1" "MD.Q1e.0"
## [211] "MD.Q1e.1" "MD.Q1f.0" "MD.Q1f.1"
## [214] "MD.Q1g.0" "MD.Q1g.1" "MD.Q1h.0"
## [217] "MD.Q1h.1" "MD.Q1i.0" "MD.Q1i.1"
## [220] "MD.Q1j.0" "MD.Q1j.1" "MD.Q1k.0"
## [223] "MD.Q1k.1" "MD.Q1L.0" "MD.Q1L.1"
## [226] "MD.Q1m.0" "MD.Q1m.1" "MD.Q2.0"
## [229] "MD.Q2.1" "MD.Q3.0" "MD.Q3.1"
## [232] "MD.Q3.2" "MD.Q3.3" "MD.TOTAL.0"
## [235] "MD.TOTAL.1" "MD.TOTAL.2" "MD.TOTAL.3"
## [238] "MD.TOTAL.4" "MD.TOTAL.5" "MD.TOTAL.6"
## [241] "MD.TOTAL.7" "MD.TOTAL.8" "MD.TOTAL.9"
## [244] "MD.TOTAL.10" "MD.TOTAL.11" "MD.TOTAL.12"
## [247] "MD.TOTAL.13" "MD.TOTAL.14" "MD.TOTAL.15"
## [250] "MD.TOTAL.16" "MD.TOTAL.17" "Alcohol.0"
## [253] "Alcohol.1" "Alcohol.2" "Alcohol.3"
## [256] "THC.0" "THC.1" "THC.2"
## [259] "THC.3" "Cocaine.0" "Cocaine.1"
## [262] "Cocaine.2" "Cocaine.3" "Stimulants.0"
## [265] "Stimulants.1" "Stimulants.3" "Sedative.hypnotics.0"
## [268] "Sedative.hypnotics.1" "Sedative.hypnotics.2" "Sedative.hypnotics.3"
## [271] "Opioids.0" "Opioids.1" "Opioids.3"
## [274] "Court.order.0" "Court.order.1" "Education"
## [277] "Hx.of.Violence.0" "Hx.of.Violence.1" "Disorderly.Conduct.0"
## [280] "Disorderly.Conduct.1" "Abuse.L" "Abuse.Q"
## [283] "Abuse.C" "Abuse.4" "Abuse.5"
## [286] "Abuse.6" "Abuse.7" "NonSubstDx.0"
## [289] "NonSubstDx.1" "NonSubstDx.2" "SubstDx.0"
## [292] "SubstDx.1" "SubstDx.2" "SubstDx.3"
## [295] "Suicide"
# repeat the same exercise for the data_test
Suicide1 <- data_test[, 49]
dummy1 <- dummyVars(" ~ . ", data = data_test [, -49] )
newdata1 <- data.frame(predict (dummy1, newdata = data_test[, -49]))
data_test <- cbind (newdata1, Suicide1)
colnames(data_test)## [1] "Age.18" "Age.19" "Age.20"
## [4] "Age.21" "Age.22" "Age.23"
## [7] "Age.24" "Age.25" "Age.26"
## [10] "Age.27" "Age.28" "Age.29"
## [13] "Age.30" "Age.31" "Age.32"
## [16] "Age.33" "Age.34" "Age.35"
## [19] "Age.36" "Age.37" "Age.38"
## [22] "Age.39" "Age.40" "Age.41"
## [25] "Age.42" "Age.43" "Age.44"
## [28] "Age.45" "Age.46" "Age.47"
## [31] "Age.48" "Age.49" "Age.50"
## [34] "Age.51" "Age.52" "Age.53"
## [37] "Age.54" "Age.55" "Age.56"
## [40] "Age.57" "Age.61" "Age.69"
## [43] "Sex.1" "Sex.2" "Race.1"
## [46] "Race.2" "Race.3" "Race.6"
## [49] "ADHD.Q1.0" "ADHD.Q1.1" "ADHD.Q1.2"
## [52] "ADHD.Q1.3" "ADHD.Q1.4" "ADHD.Q2.0"
## [55] "ADHD.Q2.1" "ADHD.Q2.2" "ADHD.Q2.3"
## [58] "ADHD.Q2.4" "ADHD.Q3.0" "ADHD.Q3.1"
## [61] "ADHD.Q3.2" "ADHD.Q3.3" "ADHD.Q3.4"
## [64] "ADHD.Q4.0" "ADHD.Q4.1" "ADHD.Q4.2"
## [67] "ADHD.Q4.3" "ADHD.Q4.4" "ADHD.Q5.0"
## [70] "ADHD.Q5.1" "ADHD.Q5.2" "ADHD.Q5.3"
## [73] "ADHD.Q5.4" "ADHD.Q5.5" "ADHD.Q6.0"
## [76] "ADHD.Q6.1" "ADHD.Q6.2" "ADHD.Q6.3"
## [79] "ADHD.Q6.4" "ADHD.Q7.0" "ADHD.Q7.1"
## [82] "ADHD.Q7.2" "ADHD.Q7.3" "ADHD.Q7.4"
## [85] "ADHD.Q8.0" "ADHD.Q8.1" "ADHD.Q8.2"
## [88] "ADHD.Q8.3" "ADHD.Q8.4" "ADHD.Q9.0"
## [91] "ADHD.Q9.1" "ADHD.Q9.2" "ADHD.Q9.3"
## [94] "ADHD.Q9.4" "ADHD.Q10.0" "ADHD.Q10.1"
## [97] "ADHD.Q10.2" "ADHD.Q10.3" "ADHD.Q10.4"
## [100] "ADHD.Q11.0" "ADHD.Q11.1" "ADHD.Q11.2"
## [103] "ADHD.Q11.3" "ADHD.Q11.4" "ADHD.Q12.0"
## [106] "ADHD.Q12.1" "ADHD.Q12.2" "ADHD.Q12.3"
## [109] "ADHD.Q12.4" "ADHD.Q13.0" "ADHD.Q13.1"
## [112] "ADHD.Q13.2" "ADHD.Q13.3" "ADHD.Q13.4"
## [115] "ADHD.Q14.0" "ADHD.Q14.1" "ADHD.Q14.2"
## [118] "ADHD.Q14.3" "ADHD.Q14.4" "ADHD.Q15.0"
## [121] "ADHD.Q15.1" "ADHD.Q15.2" "ADHD.Q15.3"
## [124] "ADHD.Q15.4" "ADHD.Q16.0" "ADHD.Q16.1"
## [127] "ADHD.Q16.2" "ADHD.Q16.3" "ADHD.Q16.4"
## [130] "ADHD.Q17.0" "ADHD.Q17.1" "ADHD.Q17.2"
## [133] "ADHD.Q17.3" "ADHD.Q17.4" "ADHD.Q18.0"
## [136] "ADHD.Q18.1" "ADHD.Q18.2" "ADHD.Q18.3"
## [139] "ADHD.Q18.4" "ADHD.Total.0" "ADHD.Total.1"
## [142] "ADHD.Total.3" "ADHD.Total.5" "ADHD.Total.6"
## [145] "ADHD.Total.7" "ADHD.Total.8" "ADHD.Total.9"
## [148] "ADHD.Total.10" "ADHD.Total.11" "ADHD.Total.12"
## [151] "ADHD.Total.13" "ADHD.Total.14" "ADHD.Total.16"
## [154] "ADHD.Total.17" "ADHD.Total.18" "ADHD.Total.19"
## [157] "ADHD.Total.20" "ADHD.Total.21" "ADHD.Total.23"
## [160] "ADHD.Total.24" "ADHD.Total.25" "ADHD.Total.26"
## [163] "ADHD.Total.27" "ADHD.Total.28" "ADHD.Total.29"
## [166] "ADHD.Total.30" "ADHD.Total.31" "ADHD.Total.32"
## [169] "ADHD.Total.33" "ADHD.Total.34" "ADHD.Total.35"
## [172] "ADHD.Total.36" "ADHD.Total.37" "ADHD.Total.38"
## [175] "ADHD.Total.39" "ADHD.Total.40" "ADHD.Total.41"
## [178] "ADHD.Total.42" "ADHD.Total.43" "ADHD.Total.44"
## [181] "ADHD.Total.45" "ADHD.Total.46" "ADHD.Total.47"
## [184] "ADHD.Total.48" "ADHD.Total.49" "ADHD.Total.50"
## [187] "ADHD.Total.51" "ADHD.Total.52" "ADHD.Total.53"
## [190] "ADHD.Total.54" "ADHD.Total.55" "ADHD.Total.56"
## [193] "ADHD.Total.57" "ADHD.Total.58" "ADHD.Total.62"
## [196] "ADHD.Total.63" "ADHD.Total.65" "ADHD.Total.67"
## [199] "ADHD.Total.69" "ADHD.Total.71" "ADHD.Total.72"
## [202] "MD.Q1a.0" "MD.Q1a.1" "MD.Q1b.0"
## [205] "MD.Q1b.1" "MD.Q1c.0" "MD.Q1c.1"
## [208] "MD.Q1d.0" "MD.Q1d.1" "MD.Q1e.0"
## [211] "MD.Q1e.1" "MD.Q1f.0" "MD.Q1f.1"
## [214] "MD.Q1g.0" "MD.Q1g.1" "MD.Q1h.0"
## [217] "MD.Q1h.1" "MD.Q1i.0" "MD.Q1i.1"
## [220] "MD.Q1j.0" "MD.Q1j.1" "MD.Q1k.0"
## [223] "MD.Q1k.1" "MD.Q1L.0" "MD.Q1L.1"
## [226] "MD.Q1m.0" "MD.Q1m.1" "MD.Q2.0"
## [229] "MD.Q2.1" "MD.Q3.0" "MD.Q3.1"
## [232] "MD.Q3.2" "MD.Q3.3" "MD.TOTAL.0"
## [235] "MD.TOTAL.1" "MD.TOTAL.2" "MD.TOTAL.3"
## [238] "MD.TOTAL.4" "MD.TOTAL.5" "MD.TOTAL.6"
## [241] "MD.TOTAL.7" "MD.TOTAL.8" "MD.TOTAL.9"
## [244] "MD.TOTAL.10" "MD.TOTAL.11" "MD.TOTAL.12"
## [247] "MD.TOTAL.13" "MD.TOTAL.14" "MD.TOTAL.15"
## [250] "MD.TOTAL.16" "MD.TOTAL.17" "Alcohol.0"
## [253] "Alcohol.1" "Alcohol.2" "Alcohol.3"
## [256] "THC.0" "THC.1" "THC.2"
## [259] "THC.3" "Cocaine.0" "Cocaine.1"
## [262] "Cocaine.2" "Cocaine.3" "Stimulants.0"
## [265] "Stimulants.1" "Stimulants.3" "Sedative.hypnotics.0"
## [268] "Sedative.hypnotics.1" "Sedative.hypnotics.2" "Sedative.hypnotics.3"
## [271] "Opioids.0" "Opioids.1" "Opioids.3"
## [274] "Court.order.0" "Court.order.1" "Education"
## [277] "Hx.of.Violence.0" "Hx.of.Violence.1" "Disorderly.Conduct.0"
## [280] "Disorderly.Conduct.1" "Abuse.L" "Abuse.Q"
## [283] "Abuse.C" "Abuse.4" "Abuse.5"
## [286] "Abuse.6" "Abuse.7" "NonSubstDx.0"
## [289] "NonSubstDx.1" "NonSubstDx.2" "SubstDx.0"
## [292] "SubstDx.1" "SubstDx.2" "SubstDx.3"
## [295] "Suicide1"
## 'data.frame': 116 obs. of 295 variables:
## $ Age.18 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.19 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.20 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.21 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.22 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.23 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.24 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.25 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Age.26 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.27 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.28 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.29 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.30 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.31 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.32 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.33 : num 0 0 0 0 1 0 0 0 1 0 ...
## $ Age.34 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.35 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.36 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.37 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Age.38 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.39 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.40 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ Age.41 : num 0 0 0 0 0 1 0 0 0 0 ...
## $ Age.42 : num 0 1 0 0 0 0 0 1 0 0 ...
## $ Age.43 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.44 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.45 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.46 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Age.47 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.48 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.49 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Age.50 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.51 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.53 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.54 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.55 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.56 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.57 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.61 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.69 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sex.1 : num 1 1 0 0 1 1 1 0 0 1 ...
## $ Sex.2 : num 0 0 1 1 0 0 0 1 1 0 ...
## $ Race.1 : num 0 0 1 1 1 1 0 0 0 1 ...
## $ Race.2 : num 1 1 0 0 0 0 1 1 1 0 ...
## $ Race.3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Race.6 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q1.0 : num 1 0 0 0 0 0 1 0 0 0 ...
## $ ADHD.Q1.1 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q1.2 : num 0 0 0 1 0 1 0 0 0 1 ...
## $ ADHD.Q1.3 : num 0 0 0 0 0 0 0 1 1 0 ...
## $ ADHD.Q1.4 : num 0 0 1 0 1 0 0 0 0 0 ...
## $ ADHD.Q2.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q2.1 : num 1 0 0 1 0 0 0 1 0 0 ...
## $ ADHD.Q2.2 : num 0 1 0 0 0 1 1 0 1 1 ...
## $ ADHD.Q2.3 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ ADHD.Q2.4 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ ADHD.Q3.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q3.1 : num 1 0 1 1 0 1 0 0 0 0 ...
## $ ADHD.Q3.2 : num 0 0 0 0 0 0 1 1 0 0 ...
## $ ADHD.Q3.3 : num 0 0 0 0 1 0 0 0 1 1 ...
## $ ADHD.Q3.4 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q4.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q4.1 : num 0 0 0 1 0 0 0 1 0 0 ...
## $ ADHD.Q4.2 : num 1 0 0 0 0 1 1 0 1 0 ...
## $ ADHD.Q4.3 : num 0 1 0 0 1 0 0 0 0 1 ...
## $ ADHD.Q4.4 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ ADHD.Q5.0 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ ADHD.Q5.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q5.2 : num 0 0 1 0 0 0 0 1 0 0 ...
## $ ADHD.Q5.3 : num 0 1 0 1 1 1 0 0 1 1 ...
## $ ADHD.Q5.4 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q5.5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q6.0 : num 0 0 1 0 0 0 1 0 0 0 ...
## $ ADHD.Q6.1 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ ADHD.Q6.2 : num 0 0 0 0 0 0 0 0 1 1 ...
## $ ADHD.Q6.3 : num 1 1 0 1 0 1 0 1 0 0 ...
## $ ADHD.Q6.4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q7.0 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ ADHD.Q7.1 : num 1 0 0 1 0 1 0 0 0 0 ...
## $ ADHD.Q7.2 : num 0 1 1 0 0 0 1 0 0 0 ...
## $ ADHD.Q7.3 : num 0 0 0 0 1 0 0 0 0 1 ...
## $ ADHD.Q7.4 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ ADHD.Q8.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q8.1 : num 0 1 0 0 0 1 0 1 0 0 ...
## $ ADHD.Q8.2 : num 0 0 0 1 0 0 1 0 0 0 ...
## $ ADHD.Q8.3 : num 1 0 1 0 1 0 0 0 1 1 ...
## $ ADHD.Q8.4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q9.0 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ ADHD.Q9.1 : num 0 0 0 0 0 1 0 0 1 0 ...
## $ ADHD.Q9.2 : num 0 0 0 1 0 0 1 0 0 0 ...
## $ ADHD.Q9.3 : num 1 1 1 0 1 0 0 0 0 1 ...
## $ ADHD.Q9.4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q10.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q10.1 : num 1 0 0 1 0 0 0 1 0 0 ...
## $ ADHD.Q10.2 : num 0 1 0 0 0 1 1 0 1 0 ...
## $ ADHD.Q10.3 : num 0 0 1 0 1 0 0 0 0 1 ...
## $ ADHD.Q10.4 : num 0 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
As you can tell below, the target variable in the data_train is unbalanced and is skewed toward 0 (68% 0s).
##
## 0 1
## 79 37
train_upsample <- upSample(x=data_train[, -295], y = data_train$Suicide)
# There is a new variable created called Class so that's why I can drop the Suicide column
table(train_upsample$Class)##
## 0 1
## 79 79
Preparing the grid for XGBoost
grid_tune <- expand.grid(
nrounds = c(81), # of trees normal range is 1500 - 3000 depends on # of records
max_depth = c(2, 4, 6),
eta = c(0.01, 0.1, 0.3), # Learning rate
gamma = 0, # pruning normal range [0, 1]
colsample_bytree = 1, # subsample ratio for columns for tree
min_child_weight = 1, # the larger the more conservative the model is, can be used as a stop
subsample = 1 # used to prevent overfitting by sampling x% training dataset
)The following defines the trainControl and actually train the XGBoost
trainControl <- trainControl (method = "cv",
number = 3,
verboseIter = T,
allowParallel = T
)
xgb_tune <- train(x = train_upsample[, -295],
y = train_upsample$Class,
trControl = trainControl,
tuneGrid = grid_tune,
method = "xgbTree",
verbose = T
)## + Fold1: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 81, max_depth = 4, eta = 0.3, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 on full training set
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 8 81 4 0.3 0 1 1 1
# Creating the best model
train_Control <- trainControl (method = "none",
verboseIter = T,
allowParallel = T
)
final_grid <- expand.grid(nrounds = xgb_tune$bestTune$nrounds,
eta = xgb_tune$bestTune$eta,
max_depth = xgb_tune$bestTune$max_depth,
gamma = xgb_tune$bestTune$gamma,
colsample_bytree = xgb_tune$bestTune$colsample_bytree,
min_child_weight = xgb_tune$bestTune$min_child_weight,
subsample = xgb_tune$bestTune$subsample
)
xgb_model <- train(x= train_upsample[, -295],
y = train_upsample$Class,
trControl = train_Control,
tuneGrid = final_grid,
method = "xgbTree",
verbose = F
) ## Fitting nrounds = 81, eta = 0.3, max_depth = 4, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 on full training set
After creating xgb.pred, we run the Confusion Matrix to get the performance metrics.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19 4
## 1 2 4
##
## Accuracy : 0.7931
## 95% CI : (0.6028, 0.9201)
## No Information Rate : 0.7241
## P-Value [Acc > NIR] : 0.2735
##
## Kappa : 0.4387
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9048
## Specificity : 0.5000
## Pos Pred Value : 0.8261
## Neg Pred Value : 0.6667
## Prevalence : 0.7241
## Detection Rate : 0.6552
## Detection Prevalence : 0.7931
## Balanced Accuracy : 0.7024
##
## 'Positive' Class : 0
##
Accuracy is 79.31% while Balanced Accuracy is only 70.24%
Support Vector Machine (SVM)
SVM will help us decide on optimal decision boundary which can then help classify our labeled data. We’re going to model for the response variable suicide attempts with the adhd_data_3 dataset with both a radial and a linear kernel to try to find the decision boundary as we don’t know whether this is a non-linear problem or linear problem.
adhd_data_4_matx = as.matrix(adhd_data_4)
df.corr.p = as.data.frame(rcorr(adhd_data_4_matx)$P)
# removing Suicide, and repeated columns
correlation_table <- cbind(rownames(df.corr.p), df.corr.p[, 49])[-c(49) , ]
correlation_table %>%
kbl(caption = "Correlation with Suicide") %>%
kable_material_dark()| Age | 0.30027942497339 |
| Sex | 0.0471220935204864 |
| Race | 0.218309067324639 |
| ADHD.Q1 | 0.0117740269942401 |
| ADHD.Q2 | 0.11564807090846 |
| ADHD.Q3 | 0.259154266743925 |
| ADHD.Q4 | 0.112410726789768 |
| ADHD.Q5 | 0.216255750653531 |
| ADHD.Q6 | 0.483512910929568 |
| ADHD.Q7 | 0.048634884292714 |
| ADHD.Q8 | 0.151797641933531 |
| ADHD.Q9 | 0.322485825735442 |
| ADHD.Q10 | 0.291560841921094 |
| ADHD.Q11 | 0.515184561690949 |
| ADHD.Q12 | 0.395887015823113 |
| ADHD.Q13 | 0.584780116748058 |
| ADHD.Q14 | 0.249482531618938 |
| ADHD.Q15 | 0.112873665348362 |
| ADHD.Q16 | 0.201977696336559 |
| ADHD.Q17 | 0.872794003748846 |
| ADHD.Q18 | 0.546374843413306 |
| ADHD.Total | 0.102932484662113 |
| MD.Q1a | 0.0625907388295026 |
| MD.Q1b | 0.00317302701726341 |
| MD.Q1c | 0.960575599200187 |
| MD.Q1d | 0.0010368039318247 |
| MD.Q1e | 0.537896077431999 |
| MD.Q1f | 0.00712626183154863 |
| MD.Q1g | 0.000120152964638853 |
| MD.Q1h | 0.144730606824987 |
| MD.Q1i | 0.2592977551472 |
| MD.Q1j | 0.267581368524187 |
| MD.Q1k | 0.016734195148864 |
| MD.Q1L | 0.0156611407664968 |
| MD.Q1m | 0.403393642126824 |
| MD.Q2 | 0.000857782522594164 |
| MD.Q3 | 0.025029663425552 |
| MD.TOTAL | 0.000755726404960466 |
| Alcohol | 0.0138128897960139 |
| THC | 0.868985202967459 |
| Cocaine | 0.158974694425702 |
| Stimulants | 0.441880666140152 |
| Sedative.hypnotics | 0.0752635208018009 |
| Opioids | 0.016937963928505 |
| Court.order | 0.0805564613765424 |
| Education | 0.230822596373245 |
| Hx.of.Violence | 0.149606337250033 |
| Disorderly.Conduct | 0.704780193506212 |
| Abuse | 0.000396677403584178 |
| NonSubstDx | 0.693218272432247 |
| SubstDx | 0.0468228684891696 |
Here are the factors that has at least a 30% correlation with Suicide:
Age
ADHD.Q6
ADHD.Q9
ADHD.Q11
ADHD.Q12
ADHD.Q13
ADHD.Q17
ADHD.Q18
MD.Q1c
MD.Q1e
MD.Q1m
THC
Stimulants
Disorderly.Conduct
NonSubstDx
We further pear down the list of factors to that of correlation coefficient .500 or above.
svm_selected <- as.data.frame(cbind(rownames(df.corr.p[df.corr.p[, 49] >= .5,]), df.corr.p[df.corr.p[, 49]>= .5, 49 ]))
svm_selected <- svm_selected[complete.cases(svm_selected), ]
rownames(svm_selected) <- seq(length=nrow(svm_selected))
svm_selected## V1 V2
## 1 ADHD.Q11 0.515184561690949
## 2 ADHD.Q13 0.584780116748058
## 3 ADHD.Q17 0.872794003748846
## 4 ADHD.Q18 0.546374843413306
## 5 MD.Q1c 0.960575599200187
## 6 MD.Q1e 0.537896077431999
## 7 THC 0.868985202967459
## 8 Disorderly.Conduct 0.704780193506212
## 9 NonSubstDx 0.693218272432247
set.seed(45)
indexes = createDataPartition(adhd$Suicide, p = .85, list = F) # results not in a List
train = adhd[indexes, ]
test = adhd[-indexes, ]
wts <- 100 / table(test$Suicide)
wts##
## 0 1
## 6.666667 16.666667
##
## Call:
## svm(formula = Suicide ~ ., data = train, class.weights = wts, C = 13)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 112
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11 5
## 1 4 1
##
## Accuracy : 0.5714
## 95% CI : (0.3402, 0.7818)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.9501
##
## Kappa : -0.1053
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.7333
## Specificity : 0.1667
## Pos Pred Value : 0.6875
## Neg Pred Value : 0.2000
## Prevalence : 0.7143
## Detection Rate : 0.5238
## Detection Prevalence : 0.7619
## Balanced Accuracy : 0.4500
##
## 'Positive' Class : 0
##
linear_svm = svm(factor(Suicide)~., data=train, kernel = "linear", type = 'C-classification', cost = 10, scale = F, class.weights = wts)
print(linear_svm)##
## Call:
## svm(formula = factor(Suicide) ~ ., data = train, kernel = "linear",
## type = "C-classification", cost = 10, class.weights = wts, scale = F)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
##
## Number of Support Vectors: 98
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 12 4
## 1 3 2
##
## Accuracy : 0.6667
## 95% CI : (0.4303, 0.8541)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.7705
##
## Kappa : 0.1404
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.8000
## Specificity : 0.3333
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.4000
## Prevalence : 0.7143
## Detection Rate : 0.5714
## Detection Prevalence : 0.7619
## Balanced Accuracy : 0.5667
##
## 'Positive' Class : 0
##
Linear SVM performs better with a higher Balanced Accuracy.
# set.seed(688)
# # recoding Loan_Status back to categorical variable
# loan_knn2$Loan_Status <- as.factor(loan_knn2$Loan_Status)
# str(loan_knn2)
#
#
# # Data Partitioning
# trainIndex <- createDataPartition(loan_knn2$Loan_Status, p = .8, list = FALSE, times = 1)
# knn_train <- loan_knn2[trainIndex,]
# knn_test <- loan_knn2[-trainIndex,]
#
#
#
# str(knn_train)