# Saving the records
# saveRDS(adhd_data_3, "adhd_data_3.rds")
#Loading the records
adhd_data_3 <- readRDS("adhd_data_3.rds")ggplot(adhd_data_3, aes(x = Education, fill = Suicide)) +
geom_bar(alpha = 0.8) +
scale_fill_manual(values = c("darkorange", "purple", "cyan4"),
guide = F) +
theme_minimal() +
facet_wrap(~Suicide, ncol = 1) +
coord_flip()ggplot(adhd_data_3, aes(x = Subst.Dx, fill = Suicide)) +
geom_bar(alpha = 0.8) +
scale_fill_manual(values = c("darkorange", "purple", "cyan4"),
guide = F) +
theme_minimal() +
facet_wrap(~Suicide, ncol = 1) +
coord_flip()ggplot(adhd_data_3, aes(x = Non.subst.Dx, fill = Suicide)) +
geom_bar(alpha = 0.8) +
scale_fill_manual(values = c("darkorange", "purple", "cyan4"),
guide = F) +
theme_minimal() +
facet_wrap(~Suicide, ncol = 1) +
coord_flip()ggplot(adhd_data_3, aes(x = Abuse, fill = Suicide)) +
geom_bar(alpha = 0.8) +
scale_fill_manual(values = c("darkorange", "purple", "cyan4", "black"),
guide = F) +
theme_minimal() +
facet_wrap(~Suicide, ncol = 1) +
coord_flip()Here we are going to start our clustering techniques using hierarchical clustering. The benefits of this clustering type is that we don’t need to specify the number of clusters and results are reproducable.
Instead of using all the columns from our data, we decided to only use the column ADHD.Total an MD.Total to the represent the individual columns combined together. This will help our results to be easier to interpret.
The output from hierarchical clustering is a dendogram, which is a graph that looks like a tree. We can see from our dendogram below there are 4 distinct clusters.
hier_cluster_subset <- adhd_data_3 %>% select(c('Age','Sex','Race','ADHD.Total','MD.TOTAL','Alcohol','THC','Cocaine','Stimulants','Sedative.hypnotics','Opioids','Court.order','Education','Hx.of.Violence','Disorderly.Conduct','Suicide','Abuse','Non.subst.Dx','Subst.Dx','NonSubstDx','SubstDx'))
hier_cluster_complete <- hclust(dist(hier_cluster_subset),method="complete")
plot(hier_cluster_complete, hang = -1, cex = 0.6)
abline(h = 43.5, col = "red") From the above dendogram, we can see there are 4 distinct clusters so we will cut our tree at this level using the function cutree(). This function returns a vector of cluster assignments, which can be see here:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 2 3 1 1 1 1 2 1 1 1 3 3 1 4 1 2 2 4 2
## 21 24 25 26 27 28 29 33 34 35 36 38 39 40 42 43 44 45 46 47
## 2 2 2 2 2 1 2 4 1 3 4 1 1 1 2 4 1 3 4 1
## 48 50 51 52 54 55 56 57 58 59 60 61 62 63 64 65 66 68 69 70
## 4 3 3 4 2 2 1 4 1 4 1 1 2 1 1 4 3 2 1 4
## 71 72 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
## 1 2 1 4 2 2 3 2 2 3 2 1 2 4 4 2 3 1 1 1
## 92 93 94 96 97 98 100 101 109 110 111 113 114 115 116 118 119 120 121 123
## 3 1 4 1 1 1 1 1 3 3 1 4 4 4 4 3 3 4 4 1
## 124 125 126 127 128 129 130 133
## 1 4 3 4 3 1 1 4
## [ reached getOption("max.print") -- omitted 38 entries ]
The above clustering included quite a few columns. Now we are going to make a simpler clustering dendogram using only age,ADHD.Total,MD.TOTAL. These columns represent our numeric data.
hier_cluster_subset_simple <- adhd_data_3 %>% select(c('Age','ADHD.Total','MD.TOTAL'))
hier_cluster_complete_simple <- hclust(dist(hier_cluster_subset_simple),method="complete")
plot(hier_cluster_complete_simple, hang = -1, cex = 0.6)
abline(h = 50, col = "red") Based on the above dendogram, we can see there are 3 distinct clusters. Again, we will cut our tree and view the output cluster assignments.
num_cut = cutree(hier_cluster_complete_simple,3)
hier_cluster_subset_simple$cluster_pred <- num_cut
num_cut## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 2 1 1 2 2 2 1 2 2 1 1 1 1 1 1 1 2 1 2
## 21 24 25 26 27 28 29 33 34 35 36 38 39 40 42 43 44 45 46 47
## 2 2 2 2 1 2 2 1 2 1 1 1 1 2 2 1 1 1 3 1
## 48 50 51 52 54 55 56 57 58 59 60 61 62 63 64 65 66 68 69 70
## 1 1 1 3 2 2 1 1 1 1 1 2 2 1 1 3 1 1 2 1
## 71 72 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
## 1 2 1 3 1 2 1 2 2 1 2 2 1 3 3 2 1 1 2 1
## 92 93 94 96 97 98 100 101 109 110 111 113 114 115 116 118 119 120 121 123
## 1 2 1 1 1 2 2 1 1 1 1 1 3 1 3 1 1 3 3 1
## 124 125 126 127 128 129 130 133
## 2 3 1 1 1 2 1 3
## [ reached getOption("max.print") -- omitted 38 entries ]
Here we can see that it appears that cluster 2 had a median and average age of 38. This cluster’s age fits right between cluster 1 & 3. Cluster 2 had the highest individual scores for reporting ADHD and reporting mood disorder. It would be interesting to see if there is any data to investigate if this age has any correlation with a mid-life crisis. Younger participants self reported the lower amount of ADHD and mood disorder.
results_simple <- hier_cluster_subset_simple %>% dplyr::group_by(cluster_pred)
results_simple$Age <- as.numeric(as.character(results_simple$Age))
results_simple$ADHD.Total <- as.numeric(as.character(results_simple$ADHD.Total))
results_simple$MD.TOTAL <- as.numeric(as.character(results_simple$MD.TOTAL))
results_simple %>% dplyr::summarize(Avg_Age = mean(Age,na.rm=TRUE),
Median_Age = median(Age,na.rm=TRUE),
Avg_ADHD.Total = mean(ADHD.Total,na.rm=TRUE),
Median_ADHD.Total = median(ADHD.Total,na.rm=TRUE),
Avg_MD.TOTAL = mean(MD.TOTAL,na.rm=TRUE),
Median_MD.TOTAL = median(MD.TOTAL,na.rm=TRUE))# function that returns principal components up to the threshold variance
rank.pca = function(data,threshold){
rank = 1
cum_var = 0
while (cum_var <= threshold){
pca = prcomp(data, rank. = rank)
cum_var = summary(pca)$importance[rank*3]
rank = rank + 1
}
return(pca)
}data = adhd_data_3
# arbitrary threshold of .85
pca.thresh = .85
# change types back to int so we can use prcomp()
for (name in names(data)){
data[name] = c(sapply(data[name], as.integer))
}is.adhd_question = str_detect(names(data), '^ADHD\\.Q')
adhd_questions = data[is.adhd_question]
adhd.pca = rank.pca(adhd_questions,pca.thresh)
summary(adhd.pca)## Importance of first k=9 (out of 18) components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.9567 1.58025 1.35806 1.21656 1.10910 1.04123 0.97067
## Proportion of Variance 0.5095 0.08127 0.06002 0.04817 0.04003 0.03528 0.03066
## Cumulative Proportion 0.5095 0.59077 0.65080 0.69897 0.73900 0.77429 0.80495
## PC8 PC9
## Standard deviation 0.91367 0.88988
## Proportion of Variance 0.02717 0.02577
## Cumulative Proportion 0.83212 0.85789
## Standard deviations (1, .., p=18):
## [1] 3.9566536 1.5802489 1.3580553 1.2165616 1.1091016 1.0412286 0.9706667
## [8] 0.9136671 0.8898769 0.8500617 0.8083534 0.7991683 0.7318146 0.7095402
## [15] 0.6497302 0.5745030 0.5517886 0.5060870
##
## Rotation (n x k) = (18 x 9):
## PC1 PC2 PC3 PC4 PC5
## ADHD.Q1 0.2241604 -0.11850805 0.36711647 -0.305577054 0.40191812
## ADHD.Q2 0.2330749 -0.23317338 0.18537730 -0.181861210 0.37182989
## ADHD.Q3 0.2120604 -0.27150539 0.16297362 -0.175680568 -0.47947987
## ADHD.Q4 0.2583036 -0.25992257 0.22458338 -0.058665778 -0.01721544
## ADHD.Q5 0.2707830 -0.14874925 -0.48941003 -0.073054193 -0.30876278
## ADHD.Q6 0.2204925 0.05883647 -0.31113536 -0.422341320 0.08186676
## ADHD.Q7 0.2182837 -0.14805909 0.12990186 -0.087132649 -0.12346514
## ADHD.Q8 0.2678672 -0.14068584 0.17910137 0.232797556 -0.17814563
## ADHD.Q9 0.2671436 -0.06874345 0.11409744 0.290180080 -0.08141198
## ADHD.Q10 0.2516825 -0.07340875 0.07149119 0.120828234 -0.03173831
## ADHD.Q11 0.2317082 -0.12254454 -0.16989734 0.318381020 0.14585970
## ADHD.Q12 0.1960853 0.16854950 0.08940426 0.347522212 -0.10312727
## PC6 PC7 PC8 PC9
## ADHD.Q1 -0.113874775 0.09540069 -0.003783810 0.09079274
## ADHD.Q2 0.022970805 0.16767868 -0.024921227 0.05767276
## ADHD.Q3 0.030251125 0.31979641 -0.128954904 0.19928143
## ADHD.Q4 0.319833635 -0.22186035 -0.136238741 -0.04636797
## ADHD.Q5 -0.155266291 0.13097415 -0.213138796 -0.25882147
## ADHD.Q6 -0.515789910 -0.05485572 -0.017701053 0.30533854
## ADHD.Q7 -0.245563722 -0.47859916 -0.008230004 -0.22352587
## ADHD.Q8 -0.107495813 -0.35289985 0.087928401 -0.27553377
## ADHD.Q9 -0.041060556 0.06638797 0.488949353 0.32135227
## ADHD.Q10 0.135058805 0.29338930 -0.297591145 -0.19444432
## ADHD.Q11 -0.024138078 0.05752253 0.327773990 -0.03823873
## ADHD.Q12 -0.136111549 0.19145471 -0.031731274 0.29776948
## [ reached getOption("max.print") -- omitted 6 rows ]
We were able to use PCA to reduce the results from 18 ADHD questions to 9 principal components while keeping 85% of the variance. For the first principal component (the component that explains the variance the most) we see that each question holds roughly the same amount of weight. The first component has a positive relationship with all 18 questions
is.mood_disorder = str_detect(names(data), '^MD\\.Q')
mood_disorder = data[is.mood_disorder]
md.pca = rank.pca(mood_disorder, pca.thresh)
md.pca%>%
summary()## Importance of first k=9 (out of 15) components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.2869 0.8901 0.54560 0.48373 0.4785 0.4338 0.41024
## Proportion of Variance 0.3775 0.1806 0.06785 0.05334 0.0522 0.0429 0.03836
## Cumulative Proportion 0.3775 0.5581 0.62593 0.67927 0.7315 0.7744 0.81272
## PC8 PC9
## Standard deviation 0.4035 0.37338
## Proportion of Variance 0.0371 0.03178
## Cumulative Proportion 0.8498 0.88160
## Standard deviations (1, .., p=15):
## [1] 1.2869221 0.8900716 0.5456005 0.4837312 0.4785414 0.4338412 0.4102361
## [8] 0.4034524 0.3733821 0.3601925 0.3186736 0.2996987 0.2756148 0.2539722
## [15] 0.2405183
##
## Rotation (n x k) = (15 x 9):
## PC1 PC2 PC3 PC4 PC5 PC6
## MD.Q1a -0.2543278 0.147273718 -0.24046079 0.21487840 -0.18611904 0.212349417
## MD.Q1b -0.2664995 0.005885759 -0.26358013 -0.13618813 0.14553549 -0.003622287
## MD.Q1c -0.1213964 0.227754427 0.48408609 -0.06348524 -0.42280394 0.265412768
## MD.Q1d -0.1908988 0.170616678 -0.10195790 -0.39249062 -0.18001273 0.445830601
## MD.Q1e -0.1967763 0.220382086 0.11734202 -0.26568848 -0.31583754 0.016961076
## MD.Q1f -0.2048252 0.101118382 -0.23262634 -0.22701838 -0.11404800 -0.287946321
## MD.Q1g -0.2230530 0.050666663 -0.29802392 -0.15133759 -0.05760700 -0.297307077
## MD.Q1h -0.1598335 0.338358317 0.19262067 -0.05278491 0.20428980 -0.323267874
## MD.Q1i -0.1636045 0.263971910 0.37525806 -0.05413925 0.22144652 -0.393790752
## MD.Q1j -0.1565535 0.297886667 0.18385599 0.06788461 0.31891921 0.136242343
## MD.Q1k -0.1384274 0.314335407 -0.06093941 0.18863556 0.49400037 0.396372134
## MD.Q1L -0.2309599 0.171750504 -0.35738459 0.40855995 -0.04559609 0.038283582
## PC7 PC8 PC9
## MD.Q1a -0.09902008 0.05234109 -0.54052879
## MD.Q1b -0.15206365 0.34294139 0.06523971
## MD.Q1c -0.05315031 0.42303144 -0.16822960
## MD.Q1d -0.38626121 -0.17653820 0.49880418
## MD.Q1e 0.45395747 -0.40255797 -0.18059534
## MD.Q1f 0.06729004 -0.27620910 -0.17278738
## MD.Q1g 0.30698957 0.19576366 0.19358553
## MD.Q1h -0.33874426 -0.29208952 -0.11849495
## MD.Q1i -0.27539112 0.17283502 -0.02038774
## MD.Q1j 0.54060417 0.25387498 0.18457578
## MD.Q1k 0.07471571 -0.27882103 0.02845118
## MD.Q1L -0.14496184 0.12595956 -0.11941519
## [ reached getOption("max.print") -- omitted 3 rows ]
Similar to the PCA done to ADHD questions, we were able to reduce the 15 questions to 9 principal components. For the first component we see that MD.Q3 is weighted less than the other questions which leads us to believe that it is less important than the other questions. However, all of these questions seem to have a negative relationship with the first component. The second component is more varied, but question MD.Q3 is still the least weighted; which adds to the suspicion that it is less important than the others.
drug_use = data[c("Alcohol", "THC", "Cocaine", "Stimulants", "Sedative.hypnotics", "Opioids")]
drug_use = drug_use %>%
drop_na()
drug.pca = rank.pca(drug_use, pca.thresh)
drug.pca%>%
summary()## Importance of first k=4 (out of 6) components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.4779 1.3523 1.2004 0.74239
## Proportion of Variance 0.3399 0.2846 0.2242 0.08577
## Cumulative Proportion 0.3399 0.6245 0.8487 0.93449
## Standard deviations (1, .., p=6):
## [1] 1.4778602 1.3523390 1.2003990 0.7423910 0.5112899 0.3994609
##
## Rotation (n x k) = (6 x 4):
## PC1 PC2 PC3 PC4
## Alcohol 0.59873741 0.79856286 -0.05767729 0.001194011
## THC 0.35399506 -0.20137630 0.90900737 0.069000983
## Cocaine 0.71511220 -0.56504807 -0.40670175 0.058031197
## Stimulants 0.01080701 0.02137797 0.05531554 0.057781334
## Sedative.hypnotics -0.04377422 0.02060986 -0.03044406 0.503455684
## Opioids -0.05275015 0.03979746 -0.03140007 0.857358449
We were able to reduce the 6 drug questions to 4 principal components. The first component (that explains 34% of the variance) gives significant weight to Cocaine, Alcohol and THC (in order of highest - lowest). The other drugs are not as significant. However, in the second component, that explains 28% of the variance, still gives Alcohol a high weighting but has a negative value for Cocaine and THC. In PC3 (22%) THC has high weighting but everything else has low or negative weights. This shows that the importance of variables can vary heavily in this set of questions
## Importance of first k=2 (out of 53) components:
## PC1 PC2
## Standard deviation 16.2562 11.0020
## Proportion of Variance 0.6025 0.2760
## Cumulative Proportion 0.6025 0.8784
## Standard deviations (1, .., p=53):
## [1] 1.625617e+01 1.100201e+01 4.298463e+00 2.259842e+00 2.064982e+00
## [6] 1.694125e+00 1.519569e+00 1.404101e+00 1.312185e+00 1.195824e+00
## [11] 1.135492e+00 1.097761e+00 1.084214e+00 1.068334e+00 9.619584e-01
## [16] 9.123910e-01 8.769350e-01 8.255091e-01 8.129019e-01 7.527663e-01
## [21] 7.384661e-01 7.297601e-01 6.880044e-01 6.612435e-01 6.158500e-01
## [26] 5.817920e-01 5.759519e-01 5.648775e-01 5.239893e-01 4.922702e-01
## [31] 4.549436e-01 4.264624e-01 4.198509e-01 4.015978e-01 3.915262e-01
## [36] 3.666148e-01 3.624494e-01 3.415041e-01 3.238205e-01 3.158331e-01
## [41] 2.979210e-01 2.861042e-01 2.767981e-01 2.716943e-01 2.377636e-01
## [46] 2.320446e-01 2.190537e-01 2.021666e-01 1.889328e-01 1.713275e-01
## [51] 7.219188e-15 1.148027e-15 1.148027e-15
##
## Rotation (n x k) = (53 x 2):
## PC1 PC2
## Age 0.0620647877 0.9949747519
## Sex -0.0072531154 -0.0013139544
## Race 0.0067183502 -0.0019756931
## ADHD.Q1 -0.0538018681 0.0052266022
## ADHD.Q2 -0.0555768024 0.0131437516
## ADHD.Q3 -0.0518974188 -0.0097617974
## ADHD.Q4 -0.0620688656 -0.0009269728
## ADHD.Q5 -0.0656749621 0.0060069875
## ADHD.Q6 -0.0537440595 0.0022319407
## ADHD.Q7 -0.0527610527 0.0089069799
## ADHD.Q8 -0.0644627667 0.0067179816
## ADHD.Q9 -0.0647007865 -0.0016581017
## ADHD.Q10 -0.0607581069 0.0039688935
## ADHD.Q11 -0.0553917002 0.0155686988
## ADHD.Q12 -0.0482533677 -0.0031498488
## ADHD.Q13 -0.0572811797 0.0133136559
## ADHD.Q14 -0.0599404181 0.0179432493
## ADHD.Q15 -0.0556512239 0.0037447369
## ADHD.Q16 -0.0569413377 -0.0038901384
## ADHD.Q17 -0.0473870752 0.0019770153
## ADHD.Q18 -0.0575764945 -0.0046722699
## ADHD.Total -0.9544847405 0.0624423413
## MD.Q1a -0.0117370617 -0.0031268922
## MD.Q1b -0.0149508997 -0.0071006503
## MD.Q1c 0.0003816895 0.0037086057
## MD.Q1d -0.0057049719 0.0045452572
## MD.Q1e -0.0106591207 0.0037284249
## MD.Q1f -0.0108417095 -0.0002406858
## MD.Q1g -0.0169615453 -0.0005350969
## MD.Q1h -0.0041460482 -0.0061396766
## MD.Q1i -0.0029876057 0.0029566964
## MD.Q1j -0.0050305928 -0.0042465388
## MD.Q1k -0.0034854991 -0.0103054981
## MD.Q1L -0.0095488677 -0.0013163299
## MD.Q1m -0.0066222248 0.0046525494
## MD.Q2 -0.0127116545 -0.0015687173
## MD.Q3 -0.0332974875 -0.0044452927
## MD.TOTAL -0.1483035996 -0.0194338451
## Alcohol -0.0099280156 0.0179090034
## THC 0.0022633833 -0.0412599771
## Cocaine 0.0215747464 0.0216090051
## Stimulants -0.0041764724 -0.0060762925
## Sedative.hypnotics -0.0034086180 -0.0069054001
## Opioids -0.0026294216 -0.0053229874
## Court.order -0.0008875149 -0.0051871667
## Education 0.0056120120 0.0061943216
## Hx.of.Violence 0.0009504824 -0.0022935283
## Disorderly.Conduct 0.0050629122 -0.0030232603
## Abuse -0.0276446538 0.0290582511
## Non.subst.Dx -0.0142051845 0.0097436093
## Subst.Dx 0.0064502653 -0.0143900478
## NonSubstDx -0.0142051845 0.0097436093
## SubstDx 0.0064502653 -0.0143900478
While we did not have much success reducing the groups of questions with PCA, we were able to reduce the entire dataset (53 components) to just two components while still explaining at least 85% of the variance
Removing the lone record that Suicide is NA.
## [1] 146 54
## [1] 145 54
Splitting the train and test dataset
set.seed(108)
sample_size <- floor(nrow(adhd_data_4)*0.8)
indices <- sample(1:nrow(adhd_data_4),sample_size)
data_train <- adhd_data_4[c(indices),]
data_test <- adhd_data_4[-c(indices),]One-hot encoding of features, which is a requirement for XGBoost. At the end, you wanted to make sure the number of columns in data_train and data_test are the same.
# do the 1-hot encoding for the data_train
Suicide <- data_train[, 49]
dummy <- dummyVars(" ~ . ", data = data_train [, -49] )
newdata <- data.frame(predict (dummy, newdata = data_train[, -49]))
data_train <- cbind (newdata, Suicide)
options(max.print="309")
colnames(data_train)## [1] "Age.18" "Age.19" "Age.20"
## [4] "Age.21" "Age.22" "Age.23"
## [7] "Age.24" "Age.25" "Age.26"
## [10] "Age.27" "Age.28" "Age.29"
## [13] "Age.30" "Age.31" "Age.32"
## [16] "Age.33" "Age.34" "Age.35"
## [19] "Age.36" "Age.37" "Age.38"
## [22] "Age.39" "Age.40" "Age.41"
## [25] "Age.42" "Age.43" "Age.44"
## [28] "Age.45" "Age.46" "Age.47"
## [31] "Age.48" "Age.49" "Age.50"
## [34] "Age.51" "Age.52" "Age.53"
## [37] "Age.54" "Age.55" "Age.56"
## [40] "Age.57" "Age.61" "Age.69"
## [43] "Sex.1" "Sex.2" "Race.1"
## [46] "Race.2" "Race.3" "Race.6"
## [49] "ADHD.Q1.0" "ADHD.Q1.1" "ADHD.Q1.2"
## [52] "ADHD.Q1.3" "ADHD.Q1.4" "ADHD.Q2.0"
## [55] "ADHD.Q2.1" "ADHD.Q2.2" "ADHD.Q2.3"
## [58] "ADHD.Q2.4" "ADHD.Q3.0" "ADHD.Q3.1"
## [61] "ADHD.Q3.2" "ADHD.Q3.3" "ADHD.Q3.4"
## [64] "ADHD.Q4.0" "ADHD.Q4.1" "ADHD.Q4.2"
## [67] "ADHD.Q4.3" "ADHD.Q4.4" "ADHD.Q5.0"
## [70] "ADHD.Q5.1" "ADHD.Q5.2" "ADHD.Q5.3"
## [73] "ADHD.Q5.4" "ADHD.Q5.5" "ADHD.Q6.0"
## [76] "ADHD.Q6.1" "ADHD.Q6.2" "ADHD.Q6.3"
## [79] "ADHD.Q6.4" "ADHD.Q7.0" "ADHD.Q7.1"
## [82] "ADHD.Q7.2" "ADHD.Q7.3" "ADHD.Q7.4"
## [85] "ADHD.Q8.0" "ADHD.Q8.1" "ADHD.Q8.2"
## [88] "ADHD.Q8.3" "ADHD.Q8.4" "ADHD.Q9.0"
## [91] "ADHD.Q9.1" "ADHD.Q9.2" "ADHD.Q9.3"
## [94] "ADHD.Q9.4" "ADHD.Q10.0" "ADHD.Q10.1"
## [97] "ADHD.Q10.2" "ADHD.Q10.3" "ADHD.Q10.4"
## [100] "ADHD.Q11.0" "ADHD.Q11.1" "ADHD.Q11.2"
## [103] "ADHD.Q11.3" "ADHD.Q11.4" "ADHD.Q12.0"
## [106] "ADHD.Q12.1" "ADHD.Q12.2" "ADHD.Q12.3"
## [109] "ADHD.Q12.4" "ADHD.Q13.0" "ADHD.Q13.1"
## [112] "ADHD.Q13.2" "ADHD.Q13.3" "ADHD.Q13.4"
## [115] "ADHD.Q14.0" "ADHD.Q14.1" "ADHD.Q14.2"
## [118] "ADHD.Q14.3" "ADHD.Q14.4" "ADHD.Q15.0"
## [121] "ADHD.Q15.1" "ADHD.Q15.2" "ADHD.Q15.3"
## [124] "ADHD.Q15.4" "ADHD.Q16.0" "ADHD.Q16.1"
## [127] "ADHD.Q16.2" "ADHD.Q16.3" "ADHD.Q16.4"
## [130] "ADHD.Q17.0" "ADHD.Q17.1" "ADHD.Q17.2"
## [133] "ADHD.Q17.3" "ADHD.Q17.4" "ADHD.Q18.0"
## [136] "ADHD.Q18.1" "ADHD.Q18.2" "ADHD.Q18.3"
## [139] "ADHD.Q18.4" "ADHD.Total.0" "ADHD.Total.1"
## [142] "ADHD.Total.3" "ADHD.Total.5" "ADHD.Total.6"
## [145] "ADHD.Total.7" "ADHD.Total.8" "ADHD.Total.9"
## [148] "ADHD.Total.10" "ADHD.Total.11" "ADHD.Total.12"
## [151] "ADHD.Total.13" "ADHD.Total.14" "ADHD.Total.16"
## [154] "ADHD.Total.17" "ADHD.Total.18" "ADHD.Total.19"
## [157] "ADHD.Total.20" "ADHD.Total.21" "ADHD.Total.23"
## [160] "ADHD.Total.24" "ADHD.Total.25" "ADHD.Total.26"
## [163] "ADHD.Total.27" "ADHD.Total.28" "ADHD.Total.29"
## [166] "ADHD.Total.30" "ADHD.Total.31" "ADHD.Total.32"
## [169] "ADHD.Total.33" "ADHD.Total.34" "ADHD.Total.35"
## [172] "ADHD.Total.36" "ADHD.Total.37" "ADHD.Total.38"
## [175] "ADHD.Total.39" "ADHD.Total.40" "ADHD.Total.41"
## [178] "ADHD.Total.42" "ADHD.Total.43" "ADHD.Total.44"
## [181] "ADHD.Total.45" "ADHD.Total.46" "ADHD.Total.47"
## [184] "ADHD.Total.48" "ADHD.Total.49" "ADHD.Total.50"
## [187] "ADHD.Total.51" "ADHD.Total.52" "ADHD.Total.53"
## [190] "ADHD.Total.54" "ADHD.Total.55" "ADHD.Total.56"
## [193] "ADHD.Total.57" "ADHD.Total.58" "ADHD.Total.62"
## [196] "ADHD.Total.63" "ADHD.Total.65" "ADHD.Total.67"
## [199] "ADHD.Total.69" "ADHD.Total.71" "ADHD.Total.72"
## [202] "MD.Q1a.0" "MD.Q1a.1" "MD.Q1b.0"
## [205] "MD.Q1b.1" "MD.Q1c.0" "MD.Q1c.1"
## [208] "MD.Q1d.0" "MD.Q1d.1" "MD.Q1e.0"
## [211] "MD.Q1e.1" "MD.Q1f.0" "MD.Q1f.1"
## [214] "MD.Q1g.0" "MD.Q1g.1" "MD.Q1h.0"
## [217] "MD.Q1h.1" "MD.Q1i.0" "MD.Q1i.1"
## [220] "MD.Q1j.0" "MD.Q1j.1" "MD.Q1k.0"
## [223] "MD.Q1k.1" "MD.Q1L.0" "MD.Q1L.1"
## [226] "MD.Q1m.0" "MD.Q1m.1" "MD.Q2.0"
## [229] "MD.Q2.1" "MD.Q3.0" "MD.Q3.1"
## [232] "MD.Q3.2" "MD.Q3.3" "MD.TOTAL.0"
## [235] "MD.TOTAL.1" "MD.TOTAL.2" "MD.TOTAL.3"
## [238] "MD.TOTAL.4" "MD.TOTAL.5" "MD.TOTAL.6"
## [241] "MD.TOTAL.7" "MD.TOTAL.8" "MD.TOTAL.9"
## [244] "MD.TOTAL.10" "MD.TOTAL.11" "MD.TOTAL.12"
## [247] "MD.TOTAL.13" "MD.TOTAL.14" "MD.TOTAL.15"
## [250] "MD.TOTAL.16" "MD.TOTAL.17" "Alcohol.0"
## [253] "Alcohol.1" "Alcohol.2" "Alcohol.3"
## [256] "THC.0" "THC.1" "THC.2"
## [259] "THC.3" "Cocaine.0" "Cocaine.1"
## [262] "Cocaine.2" "Cocaine.3" "Stimulants.0"
## [265] "Stimulants.1" "Stimulants.3" "Sedative.hypnotics.0"
## [268] "Sedative.hypnotics.1" "Sedative.hypnotics.2" "Sedative.hypnotics.3"
## [271] "Opioids.0" "Opioids.1" "Opioids.3"
## [274] "Court.order.0" "Court.order.1" "Education"
## [277] "Hx.of.Violence.0" "Hx.of.Violence.1" "Disorderly.Conduct.0"
## [280] "Disorderly.Conduct.1" "Abuse.L" "Abuse.Q"
## [283] "Abuse.C" "Abuse.4" "Abuse.5"
## [286] "Abuse.6" "Abuse.7" "NonSubstDx.0"
## [289] "NonSubstDx.1" "NonSubstDx.2" "SubstDx.0"
## [292] "SubstDx.1" "SubstDx.2" "SubstDx.3"
## [295] "Suicide"
# repeat the same exercise for the data_test
Suicide1 <- data_test[, 49]
dummy1 <- dummyVars(" ~ . ", data = data_test [, -49] )
newdata1 <- data.frame(predict (dummy1, newdata = data_test[, -49]))
data_test <- cbind (newdata1, Suicide1)
colnames(data_test)## [1] "Age.18" "Age.19" "Age.20"
## [4] "Age.21" "Age.22" "Age.23"
## [7] "Age.24" "Age.25" "Age.26"
## [10] "Age.27" "Age.28" "Age.29"
## [13] "Age.30" "Age.31" "Age.32"
## [16] "Age.33" "Age.34" "Age.35"
## [19] "Age.36" "Age.37" "Age.38"
## [22] "Age.39" "Age.40" "Age.41"
## [25] "Age.42" "Age.43" "Age.44"
## [28] "Age.45" "Age.46" "Age.47"
## [31] "Age.48" "Age.49" "Age.50"
## [34] "Age.51" "Age.52" "Age.53"
## [37] "Age.54" "Age.55" "Age.56"
## [40] "Age.57" "Age.61" "Age.69"
## [43] "Sex.1" "Sex.2" "Race.1"
## [46] "Race.2" "Race.3" "Race.6"
## [49] "ADHD.Q1.0" "ADHD.Q1.1" "ADHD.Q1.2"
## [52] "ADHD.Q1.3" "ADHD.Q1.4" "ADHD.Q2.0"
## [55] "ADHD.Q2.1" "ADHD.Q2.2" "ADHD.Q2.3"
## [58] "ADHD.Q2.4" "ADHD.Q3.0" "ADHD.Q3.1"
## [61] "ADHD.Q3.2" "ADHD.Q3.3" "ADHD.Q3.4"
## [64] "ADHD.Q4.0" "ADHD.Q4.1" "ADHD.Q4.2"
## [67] "ADHD.Q4.3" "ADHD.Q4.4" "ADHD.Q5.0"
## [70] "ADHD.Q5.1" "ADHD.Q5.2" "ADHD.Q5.3"
## [73] "ADHD.Q5.4" "ADHD.Q5.5" "ADHD.Q6.0"
## [76] "ADHD.Q6.1" "ADHD.Q6.2" "ADHD.Q6.3"
## [79] "ADHD.Q6.4" "ADHD.Q7.0" "ADHD.Q7.1"
## [82] "ADHD.Q7.2" "ADHD.Q7.3" "ADHD.Q7.4"
## [85] "ADHD.Q8.0" "ADHD.Q8.1" "ADHD.Q8.2"
## [88] "ADHD.Q8.3" "ADHD.Q8.4" "ADHD.Q9.0"
## [91] "ADHD.Q9.1" "ADHD.Q9.2" "ADHD.Q9.3"
## [94] "ADHD.Q9.4" "ADHD.Q10.0" "ADHD.Q10.1"
## [97] "ADHD.Q10.2" "ADHD.Q10.3" "ADHD.Q10.4"
## [100] "ADHD.Q11.0" "ADHD.Q11.1" "ADHD.Q11.2"
## [103] "ADHD.Q11.3" "ADHD.Q11.4" "ADHD.Q12.0"
## [106] "ADHD.Q12.1" "ADHD.Q12.2" "ADHD.Q12.3"
## [109] "ADHD.Q12.4" "ADHD.Q13.0" "ADHD.Q13.1"
## [112] "ADHD.Q13.2" "ADHD.Q13.3" "ADHD.Q13.4"
## [115] "ADHD.Q14.0" "ADHD.Q14.1" "ADHD.Q14.2"
## [118] "ADHD.Q14.3" "ADHD.Q14.4" "ADHD.Q15.0"
## [121] "ADHD.Q15.1" "ADHD.Q15.2" "ADHD.Q15.3"
## [124] "ADHD.Q15.4" "ADHD.Q16.0" "ADHD.Q16.1"
## [127] "ADHD.Q16.2" "ADHD.Q16.3" "ADHD.Q16.4"
## [130] "ADHD.Q17.0" "ADHD.Q17.1" "ADHD.Q17.2"
## [133] "ADHD.Q17.3" "ADHD.Q17.4" "ADHD.Q18.0"
## [136] "ADHD.Q18.1" "ADHD.Q18.2" "ADHD.Q18.3"
## [139] "ADHD.Q18.4" "ADHD.Total.0" "ADHD.Total.1"
## [142] "ADHD.Total.3" "ADHD.Total.5" "ADHD.Total.6"
## [145] "ADHD.Total.7" "ADHD.Total.8" "ADHD.Total.9"
## [148] "ADHD.Total.10" "ADHD.Total.11" "ADHD.Total.12"
## [151] "ADHD.Total.13" "ADHD.Total.14" "ADHD.Total.16"
## [154] "ADHD.Total.17" "ADHD.Total.18" "ADHD.Total.19"
## [157] "ADHD.Total.20" "ADHD.Total.21" "ADHD.Total.23"
## [160] "ADHD.Total.24" "ADHD.Total.25" "ADHD.Total.26"
## [163] "ADHD.Total.27" "ADHD.Total.28" "ADHD.Total.29"
## [166] "ADHD.Total.30" "ADHD.Total.31" "ADHD.Total.32"
## [169] "ADHD.Total.33" "ADHD.Total.34" "ADHD.Total.35"
## [172] "ADHD.Total.36" "ADHD.Total.37" "ADHD.Total.38"
## [175] "ADHD.Total.39" "ADHD.Total.40" "ADHD.Total.41"
## [178] "ADHD.Total.42" "ADHD.Total.43" "ADHD.Total.44"
## [181] "ADHD.Total.45" "ADHD.Total.46" "ADHD.Total.47"
## [184] "ADHD.Total.48" "ADHD.Total.49" "ADHD.Total.50"
## [187] "ADHD.Total.51" "ADHD.Total.52" "ADHD.Total.53"
## [190] "ADHD.Total.54" "ADHD.Total.55" "ADHD.Total.56"
## [193] "ADHD.Total.57" "ADHD.Total.58" "ADHD.Total.62"
## [196] "ADHD.Total.63" "ADHD.Total.65" "ADHD.Total.67"
## [199] "ADHD.Total.69" "ADHD.Total.71" "ADHD.Total.72"
## [202] "MD.Q1a.0" "MD.Q1a.1" "MD.Q1b.0"
## [205] "MD.Q1b.1" "MD.Q1c.0" "MD.Q1c.1"
## [208] "MD.Q1d.0" "MD.Q1d.1" "MD.Q1e.0"
## [211] "MD.Q1e.1" "MD.Q1f.0" "MD.Q1f.1"
## [214] "MD.Q1g.0" "MD.Q1g.1" "MD.Q1h.0"
## [217] "MD.Q1h.1" "MD.Q1i.0" "MD.Q1i.1"
## [220] "MD.Q1j.0" "MD.Q1j.1" "MD.Q1k.0"
## [223] "MD.Q1k.1" "MD.Q1L.0" "MD.Q1L.1"
## [226] "MD.Q1m.0" "MD.Q1m.1" "MD.Q2.0"
## [229] "MD.Q2.1" "MD.Q3.0" "MD.Q3.1"
## [232] "MD.Q3.2" "MD.Q3.3" "MD.TOTAL.0"
## [235] "MD.TOTAL.1" "MD.TOTAL.2" "MD.TOTAL.3"
## [238] "MD.TOTAL.4" "MD.TOTAL.5" "MD.TOTAL.6"
## [241] "MD.TOTAL.7" "MD.TOTAL.8" "MD.TOTAL.9"
## [244] "MD.TOTAL.10" "MD.TOTAL.11" "MD.TOTAL.12"
## [247] "MD.TOTAL.13" "MD.TOTAL.14" "MD.TOTAL.15"
## [250] "MD.TOTAL.16" "MD.TOTAL.17" "Alcohol.0"
## [253] "Alcohol.1" "Alcohol.2" "Alcohol.3"
## [256] "THC.0" "THC.1" "THC.2"
## [259] "THC.3" "Cocaine.0" "Cocaine.1"
## [262] "Cocaine.2" "Cocaine.3" "Stimulants.0"
## [265] "Stimulants.1" "Stimulants.3" "Sedative.hypnotics.0"
## [268] "Sedative.hypnotics.1" "Sedative.hypnotics.2" "Sedative.hypnotics.3"
## [271] "Opioids.0" "Opioids.1" "Opioids.3"
## [274] "Court.order.0" "Court.order.1" "Education"
## [277] "Hx.of.Violence.0" "Hx.of.Violence.1" "Disorderly.Conduct.0"
## [280] "Disorderly.Conduct.1" "Abuse.L" "Abuse.Q"
## [283] "Abuse.C" "Abuse.4" "Abuse.5"
## [286] "Abuse.6" "Abuse.7" "NonSubstDx.0"
## [289] "NonSubstDx.1" "NonSubstDx.2" "SubstDx.0"
## [292] "SubstDx.1" "SubstDx.2" "SubstDx.3"
## [295] "Suicide1"
## 'data.frame': 116 obs. of 295 variables:
## $ Age.18 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.19 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.20 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.21 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.22 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.23 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.24 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.25 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Age.26 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.27 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.28 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.29 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.30 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.31 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.32 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.33 : num 0 0 0 0 1 0 0 0 1 0 ...
## $ Age.34 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.35 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.36 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.37 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Age.38 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.39 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.40 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ Age.41 : num 0 0 0 0 0 1 0 0 0 0 ...
## $ Age.42 : num 0 1 0 0 0 0 0 1 0 0 ...
## $ Age.43 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.44 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.45 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.46 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Age.47 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.48 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.49 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Age.50 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.51 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.53 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.54 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.55 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.56 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.57 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.61 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.69 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sex.1 : num 1 1 0 0 1 1 1 0 0 1 ...
## $ Sex.2 : num 0 0 1 1 0 0 0 1 1 0 ...
## $ Race.1 : num 0 0 1 1 1 1 0 0 0 1 ...
## $ Race.2 : num 1 1 0 0 0 0 1 1 1 0 ...
## $ Race.3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Race.6 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q1.0 : num 1 0 0 0 0 0 1 0 0 0 ...
## $ ADHD.Q1.1 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q1.2 : num 0 0 0 1 0 1 0 0 0 1 ...
## $ ADHD.Q1.3 : num 0 0 0 0 0 0 0 1 1 0 ...
## $ ADHD.Q1.4 : num 0 0 1 0 1 0 0 0 0 0 ...
## $ ADHD.Q2.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q2.1 : num 1 0 0 1 0 0 0 1 0 0 ...
## $ ADHD.Q2.2 : num 0 1 0 0 0 1 1 0 1 1 ...
## $ ADHD.Q2.3 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ ADHD.Q2.4 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ ADHD.Q3.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q3.1 : num 1 0 1 1 0 1 0 0 0 0 ...
## $ ADHD.Q3.2 : num 0 0 0 0 0 0 1 1 0 0 ...
## $ ADHD.Q3.3 : num 0 0 0 0 1 0 0 0 1 1 ...
## $ ADHD.Q3.4 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q4.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q4.1 : num 0 0 0 1 0 0 0 1 0 0 ...
## $ ADHD.Q4.2 : num 1 0 0 0 0 1 1 0 1 0 ...
## $ ADHD.Q4.3 : num 0 1 0 0 1 0 0 0 0 1 ...
## $ ADHD.Q4.4 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ ADHD.Q5.0 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ ADHD.Q5.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q5.2 : num 0 0 1 0 0 0 0 1 0 0 ...
## $ ADHD.Q5.3 : num 0 1 0 1 1 1 0 0 1 1 ...
## $ ADHD.Q5.4 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q5.5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q6.0 : num 0 0 1 0 0 0 1 0 0 0 ...
## $ ADHD.Q6.1 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ ADHD.Q6.2 : num 0 0 0 0 0 0 0 0 1 1 ...
## $ ADHD.Q6.3 : num 1 1 0 1 0 1 0 1 0 0 ...
## $ ADHD.Q6.4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q7.0 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ ADHD.Q7.1 : num 1 0 0 1 0 1 0 0 0 0 ...
## $ ADHD.Q7.2 : num 0 1 1 0 0 0 1 0 0 0 ...
## $ ADHD.Q7.3 : num 0 0 0 0 1 0 0 0 0 1 ...
## $ ADHD.Q7.4 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ ADHD.Q8.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q8.1 : num 0 1 0 0 0 1 0 1 0 0 ...
## $ ADHD.Q8.2 : num 0 0 0 1 0 0 1 0 0 0 ...
## $ ADHD.Q8.3 : num 1 0 1 0 1 0 0 0 1 1 ...
## $ ADHD.Q8.4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q9.0 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ ADHD.Q9.1 : num 0 0 0 0 0 1 0 0 1 0 ...
## $ ADHD.Q9.2 : num 0 0 0 1 0 0 1 0 0 0 ...
## $ ADHD.Q9.3 : num 1 1 1 0 1 0 0 0 0 1 ...
## $ ADHD.Q9.4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q10.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ADHD.Q10.1 : num 1 0 0 1 0 0 0 1 0 0 ...
## $ ADHD.Q10.2 : num 0 1 0 0 0 1 1 0 1 0 ...
## $ ADHD.Q10.3 : num 0 0 1 0 1 0 0 0 0 1 ...
## $ ADHD.Q10.4 : num 0 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
As you can tell below, the target variable in the data_train is unbalanced and is skewed toward 0 (68% 0s).
##
## 0 1
## 79 37
train_upsample <- upSample(x=data_train[, -295], y = data_train$Suicide)
# There is a new variable created called Class so that's why I can drop the Suicide column
table(train_upsample$Class)##
## 0 1
## 79 79
Preparing the grid for XGBoost
grid_tune <- expand.grid(
nrounds = c(81), # of trees normal range is 1500 - 3000 depends on # of records
max_depth = c(2, 4, 6),
eta = c(0.01, 0.1, 0.3), # Learning rate
gamma = 0, # pruning normal range [0, 1]
colsample_bytree = 1, # subsample ratio for columns for tree
min_child_weight = 1, # the larger the more conservative the model is, can be used as a stop
subsample = 1 # used to prevent overfitting by sampling x% training dataset
)The following defines the trainControl and actually train the XGBoost
trainControl <- trainControl (method = "cv",
number = 3,
verboseIter = T,
allowParallel = T
)
xgb_tune <- train(x = train_upsample[, -295],
y = train_upsample$Class,
trControl = trainControl,
tuneGrid = grid_tune,
method = "xgbTree",
verbose = T
)## + Fold1: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold1: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold1: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold2: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold2: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.01, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.01, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.01, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.10, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.10, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.10, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.30, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.30, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## + Fold3: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## - Fold3: eta=0.30, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=81
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 81, max_depth = 4, eta = 0.3, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 on full training set
# Creating the best model
train_Control <- trainControl (method = "none",
verboseIter = T,
allowParallel = T
)
final_grid <- expand.grid(nrounds = xgb_tune$bestTune$nrounds,
eta = xgb_tune$bestTune$eta,
max_depth = xgb_tune$bestTune$max_depth,
gamma = xgb_tune$bestTune$gamma,
colsample_bytree = xgb_tune$bestTune$colsample_bytree,
min_child_weight = xgb_tune$bestTune$min_child_weight,
subsample = xgb_tune$bestTune$subsample
)
xgb_model <- train(x= train_upsample[, -295],
y = train_upsample$Class,
trControl = train_Control,
tuneGrid = final_grid,
method = "xgbTree",
verbose = F
) ## Fitting nrounds = 81, eta = 0.3, max_depth = 4, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 on full training set
After creating xgb.pred, we run the Confusion Matrix to get the performance metrics.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19 4
## 1 2 4
##
## Accuracy : 0.7931
## 95% CI : (0.6028, 0.9201)
## No Information Rate : 0.7241
## P-Value [Acc > NIR] : 0.2735
##
## Kappa : 0.4387
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9048
## Specificity : 0.5000
## Pos Pred Value : 0.8261
## Neg Pred Value : 0.6667
## Prevalence : 0.7241
## Detection Rate : 0.6552
## Detection Prevalence : 0.7931
## Balanced Accuracy : 0.7024
##
## 'Positive' Class : 0
##
Accuracy is 79.31% while Balanced Accuracy is only 70.24%
SVM will help us decide on optimal decision boundary which can then help classify our labeled data. We’re going to model for the response variable suicide attempts with the adhd_data_3 dataset with both a radial and a linear kernel to try to find the decision boundary as we don’t know whether this is a non-linear problem or linear problem.
adhd_data_4_matx = as.matrix(adhd_data_4)
df.corr.p = as.data.frame(rcorr(adhd_data_4_matx)$P)
# removing Suicide, and repeated columns
correlation_table <- cbind(rownames(df.corr.p), df.corr.p[, 49])[-c(49) , ]
correlation_table %>%
kbl(caption = "Correlation with Suicide") %>%
kable_material_dark()| Age | 0.30027942497339 |
| Sex | 0.0471220935204864 |
| Race | 0.218309067324639 |
| ADHD.Q1 | 0.0117740269942401 |
| ADHD.Q2 | 0.11564807090846 |
| ADHD.Q3 | 0.259154266743925 |
| ADHD.Q4 | 0.112410726789768 |
| ADHD.Q5 | 0.216255750653531 |
| ADHD.Q6 | 0.483512910929568 |
| ADHD.Q7 | 0.048634884292714 |
| ADHD.Q8 | 0.151797641933531 |
| ADHD.Q9 | 0.322485825735442 |
| ADHD.Q10 | 0.291560841921094 |
| ADHD.Q11 | 0.515184561690949 |
| ADHD.Q12 | 0.395887015823113 |
| ADHD.Q13 | 0.584780116748058 |
| ADHD.Q14 | 0.249482531618938 |
| ADHD.Q15 | 0.112873665348362 |
| ADHD.Q16 | 0.201977696336559 |
| ADHD.Q17 | 0.872794003748846 |
| ADHD.Q18 | 0.546374843413306 |
| ADHD.Total | 0.102932484662113 |
| MD.Q1a | 0.0625907388295026 |
| MD.Q1b | 0.00317302701726341 |
| MD.Q1c | 0.960575599200187 |
| MD.Q1d | 0.0010368039318247 |
| MD.Q1e | 0.537896077431999 |
| MD.Q1f | 0.00712626183154863 |
| MD.Q1g | 0.000120152964638853 |
| MD.Q1h | 0.144730606824987 |
| MD.Q1i | 0.2592977551472 |
| MD.Q1j | 0.267581368524187 |
| MD.Q1k | 0.016734195148864 |
| MD.Q1L | 0.0156611407664968 |
| MD.Q1m | 0.403393642126824 |
| MD.Q2 | 0.000857782522594164 |
| MD.Q3 | 0.025029663425552 |
| MD.TOTAL | 0.000755726404960466 |
| Alcohol | 0.0138128897960139 |
| THC | 0.868985202967459 |
| Cocaine | 0.158974694425702 |
| Stimulants | 0.441880666140152 |
| Sedative.hypnotics | 0.0752635208018009 |
| Opioids | 0.016937963928505 |
| Court.order | 0.0805564613765424 |
| Education | 0.230822596373245 |
| Hx.of.Violence | 0.149606337250033 |
| Disorderly.Conduct | 0.704780193506212 |
| Abuse | 0.000396677403584178 |
| NonSubstDx | 0.693218272432247 |
| SubstDx | 0.0468228684891696 |
Here are the factors that has at least a 30% correlation with Suicide:
Age
ADHD.Q6
ADHD.Q9
ADHD.Q11
ADHD.Q12
ADHD.Q13
ADHD.Q17
ADHD.Q18
MD.Q1c
MD.Q1e
MD.Q1m
THC
Stimulants
Disorderly.Conduct
NonSubstDx
We further pear down the list of factors to that of correlation coefficient .500 or above.
svm_selected <- as.data.frame(cbind(rownames(df.corr.p[df.corr.p[, 49] >= .5,]), df.corr.p[df.corr.p[, 49]>= .5, 49 ]))
svm_selected <- svm_selected[complete.cases(svm_selected), ]
rownames(svm_selected) <- seq(length=nrow(svm_selected))
svm_selectedset.seed(45)
indexes = createDataPartition(adhd$Suicide, p = .85, list = F) # results not in a List
train = adhd[indexes, ]
test = adhd[-indexes, ]
wts <- 100 / table(test$Suicide)
wts##
## 0 1
## 6.666667 16.666667
##
## Call:
## svm(formula = Suicide ~ ., data = train, class.weights = wts, C = 13)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 112
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11 5
## 1 4 1
##
## Accuracy : 0.5714
## 95% CI : (0.3402, 0.7818)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.9501
##
## Kappa : -0.1053
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.7333
## Specificity : 0.1667
## Pos Pred Value : 0.6875
## Neg Pred Value : 0.2000
## Prevalence : 0.7143
## Detection Rate : 0.5238
## Detection Prevalence : 0.7619
## Balanced Accuracy : 0.4500
##
## 'Positive' Class : 0
##
linear_svm = svm(factor(Suicide)~., data=train, kernel = "linear", type = 'C-classification', cost = 10, scale = F, class.weights = wts)
print(linear_svm)##
## Call:
## svm(formula = factor(Suicide) ~ ., data = train, kernel = "linear",
## type = "C-classification", cost = 10, class.weights = wts, scale = F)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
##
## Number of Support Vectors: 98
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 12 4
## 1 3 2
##
## Accuracy : 0.6667
## 95% CI : (0.4303, 0.8541)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.7705
##
## Kappa : 0.1404
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.8000
## Specificity : 0.3333
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.4000
## Prevalence : 0.7143
## Detection Rate : 0.5714
## Detection Prevalence : 0.7619
## Balanced Accuracy : 0.5667
##
## 'Positive' Class : 0
##
Linear SVM performs better with a higher Balanced Accuracy.