Import Libraries, import data

library("poLCA");library(tidyverse);library(corrr);library(dplyr); library(ggplot2); library(reshape2);library(gtsummary);library(nnet);library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var

BASIC SUMMARY

Clean the data first

#clean gender sex
non_matching_gender <- which(df0$ED_GenderBirthCert != df0$ED_GenderNow)

print("Values at non-matching indices:");
[1] "Values at non-matching indices:"
print(df0[non_matching_gender, c("ED_GenderBirthCert", "ED_GenderNow")])

#There are 18 transgender patients. I'm going to remove them bc we don;t know if they have used HRT
#clean gender
dfb <- df0[df0$ED_GenderBirthCert == df0$ED_GenderNow, ]
dfb <- dfb[,c(7:11,289)]

unique(dfb$ED_GenderNow)
[1] "FEMALE" "MALE"   NA      
#clean marital status
table(dfb$ED_Marital)

                -5                 -7                 -8           Annulled           Divorced            Married Never been married 
                 4                 21                  6                  6                516                936               2950 
         Separated            Widowed 
               207                 80 
#remove and confirm unknown marital status
dfb <- dfb[!(dfb$ED_Marital %in% c(-5, -7, -8)), ]
unique(dfb$ED_Marital)
[1] "Never been married" "Married"            "Divorced"           "Widowed"            "Separated"          "Annulled"          
[7] NA                  
#clean race
unique(dfb$ED_RaceEthCode)
[1] "Hispanic"           "Non-Hispanic Black" "Non-Hispanic White" "Non-Hispanic Other" NA                  
#this is fine
#clean education
#table(dfb$ED_HighestGrade)

dfb <- dfb[!(dfb$ED_HighestGrade %in% c(-5, -7, -8)), ]
table(dfb$ED_HighestGrade)

                                                      10th grade                                                       11th grade 
                                                              78                                                              195 
                                          12th grade, no diploma                                                        1st grade 
                                                             182                                                                6 
                                                       4th grade                                                        7th grade 
                                                               1                                                                4 
                                                       8th grade                                                        9th grade 
                                                              10                                                               46 
                              Associate degree: Academic program Associate degree: Occupational, technical, or vocational program 
                                                             285                                                              339 
                              Bachelor's degree: BA, AB, BS, BBA                                        Doctoral degree: PhD, EdD 
                                                             638                                                               33 
                                               GED or equivalent                                             High school graduate 
                                                             283                                                             1012 
                         Master's degree: MA, MS, MEng, MEd, MBA                                 Never attended/kindergarten only 
                                                             226                                                                3 
                    Professional school degree: MD, DDS, DVM, JD                                          Some college, no degree 
                                                              30                                                             1317 
#this needs to be converted to fewer categories

Summary Tables

#summarize
dfb %>%
  tbl_summary(missing = "no")
Characteristic N = 4,6891
ED_GenderNow
    FEMALE 2,888 (62%)
    MALE 1,800 (38%)
ED_Age 32 (25, 46)
ED_Marital
    Annulled 6 (0.1%)
    Divorced 515 (11%)
    Married 934 (20%)
    Never been married 2,946 (63%)
    Separated 207 (4.4%)
    Widowed 80 (1.7%)
ED_RaceEthCode
    Hispanic 483 (10%)
    Non-Hispanic Black 2,422 (52%)
    Non-Hispanic Other 167 (3.6%)
    Non-Hispanic White 1,609 (34%)
ED_HighestGrade
    10th grade 78 (1.7%)
    11th grade 195 (4.2%)
    12th grade, no diploma 182 (3.9%)
    1st grade 6 (0.1%)
    4th grade 1 (<0.1%)
    7th grade 4 (<0.1%)
    8th grade 10 (0.2%)
    9th grade 46 (1.0%)
    Associate degree: Academic program 285 (6.1%)
    Associate degree: Occupational, technical, or vocational program 339 (7.2%)
    Bachelor's degree: BA, AB, BS, BBA 638 (14%)
    Doctoral degree: PhD, EdD 33 (0.7%)
    GED or equivalent 283 (6.0%)
    High school graduate 1,012 (22%)
    Master's degree: MA, MS, MEng, MEd, MBA 226 (4.8%)
    Never attended/kindergarten only 3 (<0.1%)
    Professional school degree: MD, DDS, DVM, JD 30 (0.6%)
    Some college, no degree 1,317 (28%)
M6_Pain_Widespread_2016 1,092 (40%)
1 n (%); Median (IQR)

dfb %>%
  tbl_summary(by = M6_Pain_Widespread_2016,missing = "no") %>%
  add_p(workspace = 2e8)  %>%
  add_overall() %>%
  modify_header(label ~ "**2016_Pain Outcome**") %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Treatment Received**")
1977 observations missing `M6_Pain_Widespread_2016` have been removed. To include these observations, use `forcats::fct_na_value_to_level()` on `M6_Pain_Widespread_2016` column before passing to `tbl_summary()`.
`...` must be empty.
✖ Problematic argument:
• workspace = 2e+08There was an error in 'add_p()/add_difference()' for variable 'ED_Marital', p-value omitted:
Error in stats::fisher.test(c("Never been married", "Married", "Never been married", : FEXACT error 7(location). LDSTP=18150 is too small for this problem,
  (pastp=312.06, ipn_0:=ipoin[itp=430]=2477, stp[ipn_0]=312.474).
Increase workspace or consider using 'simulate.p.value=TRUE'
There was an error in 'add_p()/add_difference()' for variable 'ED_HighestGrade', p-value omitted:
Error in stats::fisher.test(c("High school graduate", "Some college, no degree", : FEXACT error 7(location). LDSTP=18150 is too small for this problem,
  (pastp=21.325, ipn_0:=ipoin[itp=366]=2819, stp[ipn_0]=14.9495).
Increase workspace or consider using 'simulate.p.value=TRUE'
2016_Pain Outcome Overall, N = 2,7121 Treatment Received p-value2
0, N = 1,6201 1, N = 1,0921
ED_GenderNow


<0.001
    FEMALE 1,800 (66%) 1,035 (64%) 765 (70%)
    MALE 912 (34%) 585 (36%) 327 (30%)
ED_Age 35 (26, 49) 32 (25, 46) 40 (29, 51) <0.001
ED_Marital



    Annulled 1 (<0.1%) 0 (0%) 1 (<0.1%)
    Divorced 326 (12%) 204 (13%) 122 (11%)
    Married 603 (22%) 334 (21%) 269 (25%)
    Never been married 1,608 (59%) 998 (62%) 610 (56%)
    Separated 130 (4.8%) 66 (4.1%) 64 (5.9%)
    Widowed 44 (1.6%) 18 (1.1%) 26 (2.4%)
ED_RaceEthCode


<0.001
    Hispanic 208 (7.7%) 130 (8.0%) 78 (7.2%)
    Non-Hispanic Black 1,471 (54%) 815 (50%) 656 (60%)
    Non-Hispanic Other 76 (2.8%) 56 (3.5%) 20 (1.8%)
    Non-Hispanic White 954 (35%) 618 (38%) 336 (31%)
ED_HighestGrade



    10th grade 43 (1.6%) 21 (1.3%) 22 (2.0%)
    11th grade 114 (4.2%) 54 (3.3%) 60 (5.5%)
    12th grade, no diploma 97 (3.6%) 46 (2.8%) 51 (4.7%)
    1st grade 5 (0.2%) 4 (0.2%) 1 (<0.1%)
    7th grade 3 (0.1%) 0 (0%) 3 (0.3%)
    8th grade 6 (0.2%) 2 (0.1%) 4 (0.4%)
    9th grade 29 (1.1%) 16 (1.0%) 13 (1.2%)
    Associate degree: Academic program 172 (6.3%) 80 (4.9%) 92 (8.4%)
    Associate degree: Occupational, technical, or vocational program 200 (7.4%) 126 (7.8%) 74 (6.8%)
    Bachelor's degree: BA, AB, BS, BBA 420 (15%) 280 (17%) 140 (13%)
    Doctoral degree: PhD, EdD 28 (1.0%) 19 (1.2%) 9 (0.8%)
    GED or equivalent 119 (4.4%) 73 (4.5%) 46 (4.2%)
    High school graduate 523 (19%) 316 (20%) 207 (19%)
    Master's degree: MA, MS, MEng, MEd, MBA 155 (5.7%) 109 (6.7%) 46 (4.2%)
    Never attended/kindergarten only 1 (<0.1%) 1 (<0.1%) 0 (0%)
    Professional school degree: MD, DDS, DVM, JD 19 (0.7%) 12 (0.7%) 7 (0.6%)
    Some college, no degree 778 (29%) 461 (28%) 317 (29%)
1 n (%); Median (IQR)
2 Pearson’s Chi-squared test; Wilcoxon rank sum test

Examining Features

Clean data to look at Pain. Reset Pain Values so it is all integers. Remove -8 value which is “I don’t know”. Increment each integer so the range is 1-11 instead of 0-10 because the LCA model doesn’t classify 0s.

#reassign values to all be integers

df0[, 12:41] <- apply(df0[, 12:41], 2, function(col) ifelse(col == "No pain or tenderness", 0, col));
df0[, 12:41] <- apply(df0[, 12:41], 2, function(col) ifelse(col == "Severe pain or tenderness", 10, col))
#remove -8 i don't know values
#remove NA cases
df <- subset(df0, !apply(df0[, 12:41], 1, function(x) any(x == -8)))
df <- df[complete.cases(df[, c(12:41)]), ]

#increment by 1 for LCA
df[, 12:41] <- lapply(df[, 12:41], as.integer)
df[, 12:41] <- df[, 12:41] + 1

#1 = no pain # 10= severe pain
table(df[,14])

   1    2    3    4    5    6    7    8    9   10   11 
3518  150  115  113   96   93   61   91   79   35   95 

Determine outcome classifier. Can M6 pain be used as our predictor feature? Which M6 feature?

#looking at some m6 outcomes

table(df0$M6_Pain_YN_COUNT) #number of body parts with pain

  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
735  63 115 114 128 139 118 107 113 107  90  64  95  77  49  56  74  88 355 
table(df0$M6_Pain_C) #number of body parts with worsening pain?

  0   1   2   3   4   5   6   7   8   9  10 
708 194 186 272 226 244 251 196 199 106 174 
table(df0$M6_Pain_MdSv_COUNT) #number of body parts with moderate to severe pain? 

   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18 
1144  206  189  143  139  100  114   83   67   98   58   32   44   20   48   39   21   45   97 
table(df0$M6_Pain_Widespread_2016) #unclear - Zinchuk 2016 questionnaire? #col 289

   0    1 
1630 1115 

The 2016 outcome seems good for a binary classifier. Though I’m not sure what it classifies.

#subset to just pain features and 2016 pain classifier
df <- df[,c(12:41,289)]
dfx <- df[complete.cases(df[,]), ] #remove about 2k cases for complete cases

Build a Latent Classifier Model

#sapply(df, typeof)

f2a <- cbind(ED_NowPain_Head, ED_NowPain_Neck, ED_NowPain_Jaw, ED_NowPain_LeftShoulder, ED_NowPain_RightShoulder, ED_NowPain_LeftUpperArm, ED_NowPain_RightUpperArm, ED_NowPain_LeftLowerArm, ED_NowPain_RightLowerArm, ED_NowPain_Chest, ED_NowPain_UpperBack, ED_NowPain_LowerBack, ED_NowPain_Abdomen, ED_NowPain_Genital, ED_NowPain_LeftHipUpperLeg, ED_NowPain_RightHipUpperLeg, ED_NowPain_LeftLowerLeg, ED_NowPain_RightLowerLeg, ED_NowPain_YN_COUNT, ED_Pain_YN_COUNT, ED_NowPain_MdSv_COUNT, ED_Pain_MdSv_COUNT, ED_NowPain_Widespread_1990, ED_NowPain_Widespread_2016, ED_NowPain_Widespread_7, ED_Pain_Widespread_1990, ED_Pain_Widespread_2016, ED_Pain_Widespread_7, ED_Pain_C, ED_NowPain_C)~M6_Pain_Widespread_2016


nes2a <- poLCA(f2a,dfx,nclass=2) 
Conditional item response (column) probabilities,
 by outcome variable, for each class (row) 
 
$ED_NowPain_Head
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.4808 0.0354 0.0658 0.0526 0.0518 0.0490 0.0533 0.0668 0.0510 0.0210 0.0725
class 2:  0.1554 0.0257 0.0877 0.0880 0.0480 0.1108 0.0662 0.1327 0.0938 0.0539 0.1377

$ED_NowPain_Neck
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.4925 0.0318 0.0483 0.0481 0.0552 0.0745 0.0687 0.0669 0.0565 0.0297 0.0278
class 2:  0.0984 0.0390 0.0560 0.0791 0.0723 0.1197 0.0913 0.1067 0.1137 0.0787 0.1452

$ED_NowPain_Jaw
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.8882 0.0137 0.0153 0.0181 0.0104 0.0087 0.0068 0.0087 0.0127 0.0044 0.0130
class 2:  0.6047 0.0859 0.0417 0.0402 0.0354 0.0425 0.0135 0.0424 0.0347 0.0186 0.0405

$ED_NowPain_LeftShoulder
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8) Pr(9) Pr(10) Pr(11)
class 1:  0.7334 0.0123 0.0281 0.0341 0.0311 0.0423 0.0306 0.0266 0.026 0.0154 0.0201
class 2:  0.2442 0.0508 0.0452 0.0704 0.0868 0.0992 0.0762 0.0891 0.087 0.0498 0.1011

$ED_NowPain_RightShoulder
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.7890 0.0099 0.0179 0.0316 0.0185 0.0307 0.0235 0.0212 0.0280 0.0101 0.0196
class 2:  0.2796 0.0714 0.0633 0.0414 0.0613 0.1030 0.0623 0.0909 0.0723 0.0660 0.0885

$ED_NowPain_LeftUpperArm
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.8315 0.0224 0.0164 0.0148 0.0266 0.0237 0.0136 0.0130 0.0142 0.0026 0.0212
class 2:  0.3483 0.0475 0.0813 0.0560 0.0622 0.0972 0.0570 0.0808 0.0684 0.0330 0.0682

$ED_NowPain_RightUpperArm
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.8652 0.0142 0.0161 0.0142 0.0093 0.0248 0.0194 0.0150 0.0087 0.0062 0.0068
class 2:  0.3696 0.0508 0.0694 0.0819 0.0684 0.0631 0.0525 0.0723 0.0455 0.0487 0.0778

$ED_NowPain_LeftLowerArm
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.8706 0.0118 0.0129 0.0245 0.0069 0.0180 0.0099 0.0186 0.0099 0.0045 0.0125
class 2:  0.4570 0.0735 0.0593 0.0658 0.0547 0.0673 0.0570 0.0435 0.0436 0.0226 0.0558

$ED_NowPain_RightLowerArm
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.8635 0.0093 0.0181 0.0206 0.0191 0.0176 0.0101 0.0149 0.0062 0.0051 0.0155
class 2:  0.4626 0.0611 0.0692 0.0702 0.0394 0.0711 0.0443 0.0508 0.0342 0.0319 0.0653

$ED_NowPain_Chest
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.8089 0.0129 0.0197 0.0313 0.0196 0.0211 0.0156 0.0245 0.0199 0.0120 0.0146
class 2:  0.3624 0.0333 0.0738 0.0699 0.0626 0.1056 0.0723 0.0636 0.0620 0.0235 0.0710

$ED_NowPain_UpperBack
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.6754 0.0180 0.0374 0.0487 0.0431 0.0429 0.0194 0.0421 0.0315 0.0111 0.0303
class 2:  0.1445 0.0258 0.0389 0.0759 0.0988 0.1096 0.0939 0.1027 0.0798 0.0777 0.1524

$ED_NowPain_LowerBack
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.5460 0.0234 0.0388 0.0317 0.0433 0.0589 0.0574 0.0502 0.0540 0.0328 0.0636
class 2:  0.1342 0.0334 0.0470 0.0609 0.0528 0.0734 0.0946 0.1211 0.1117 0.0768 0.1941

$ED_NowPain_Abdomen
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.9033 0.0138 0.0111 0.0160 0.0069 0.0074 0.0133 0.0068 0.0081 0.0042 0.0092
class 2:  0.4437 0.0391 0.0468 0.0758 0.0485 0.0892 0.0585 0.0913 0.0290 0.0293 0.0489

$ED_NowPain_Genital
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.9604 0.0006 0.0038 0.0062 0.0037 0.0086 0.0074 0.0006 0.0050 0.0012 0.0025
class 2:  0.6912 0.0270 0.0393 0.0466 0.0145 0.0291 0.0290 0.0311 0.0342 0.0270 0.0311

$ED_NowPain_LeftHipUpperLeg
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.7801 0.0109 0.0203 0.0265 0.0237 0.0353 0.0273 0.0160 0.0284 0.0075  0.024
class 2:  0.3629 0.0409 0.0469 0.0583 0.0556 0.0901 0.0611 0.0872 0.0695 0.0465  0.081

$ED_NowPain_RightHipUpperLeg
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.7907 0.0083 0.0272 0.0312 0.0257 0.0263 0.0194 0.0221 0.0211 0.0063 0.0217
class 2:  0.3566 0.0379 0.0644 0.0534 0.0771 0.0906 0.0515 0.0697 0.0828 0.0361 0.0798

$ED_NowPain_LeftLowerLeg
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.8355 0.0051 0.0199 0.0139 0.0132 0.0273 0.0254 0.0093 0.0223 0.0057 0.0224
class 2:  0.4733 0.0433 0.0723 0.0730 0.0547 0.0589 0.0674 0.0332 0.0404 0.0246 0.0589

$ED_NowPain_RightLowerLeg
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.8555 0.0076 0.0106 0.0235 0.0235 0.0156 0.0111 0.0117 0.0167 0.0080 0.0161
class 2:  0.4958 0.0463 0.0663 0.0664 0.0477 0.0505 0.0549 0.0311 0.0581 0.0342 0.0487

$ED_NowPain_YN_COUNT
           Pr(1) Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11) Pr(12) Pr(13) Pr(14) Pr(15) Pr(16) Pr(17) Pr(18) Pr(19)
class 1:  0.0142 0.112 0.1435 0.1423 0.1559 0.1479 0.1516 0.1296 0.0030 0.0000 0.0000  0.000 0.0000 0.0000 0.0000 0.0000 0.0000  0.000  0.000
class 2:  0.0000 0.000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0452 0.1422 0.1627 0.1223  0.084 0.0954 0.0902 0.0674 0.0332 0.0425  0.058  0.057

$ED_Pain_YN_COUNT
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11) Pr(12) Pr(13) Pr(14) Pr(15) Pr(16) Pr(17) Pr(18) Pr(19)
class 1:  0.5232 0.1002 0.1107 0.0644 0.0635 0.0416 0.0368 0.0158 0.0160 0.0031 0.0025 0.0026 0.0023 0.0031 0.0024 0.0031 0.0037 0.0037 0.0012
class 2:  0.3424 0.0591 0.0695 0.0600 0.0605 0.0288 0.0275 0.0368 0.0385 0.0415 0.0248 0.0174 0.0273 0.0249 0.0240 0.0187 0.0311 0.0197 0.0477

$ED_NowPain_MdSv_COUNT
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11) Pr(12) Pr(13) Pr(14) Pr(15) Pr(16) Pr(17) Pr(18) Pr(19)
class 1:  0.1034 0.1800 0.1954 0.1651 0.1431 0.0994 0.0772 0.0363 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
class 2:  0.0081 0.0136 0.0199 0.0509 0.0525 0.0625 0.0977 0.1009 0.1327 0.1057 0.0933 0.0736 0.0456 0.0477 0.0259 0.0187 0.0218 0.0166 0.0124

$ED_Pain_MdSv_COUNT
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11) Pr(12) Pr(13) Pr(14) Pr(15) Pr(16) Pr(17) Pr(18) Pr(19)
class 1:  0.6601 0.1083 0.0793 0.0506 0.0350 0.0181 0.0194 0.0086 0.0056 0.0025 0.0017 0.0014 0.0006 0.0031 0.0012 0.0043 0.0000 0.0000 0.0000
class 2:  0.5173 0.0911 0.0568 0.0458 0.0418 0.0340 0.0183 0.0239 0.0197 0.0228 0.0240 0.0153 0.0114 0.0207 0.0124 0.0073 0.0155 0.0021 0.0197

$ED_NowPain_Widespread_1990
           Pr(1)  Pr(2)
class 1:  0.8074 0.1926
class 2:  0.1525 0.8475

$ED_NowPain_Widespread_2016
           Pr(1)  Pr(2)
class 1:  0.9262 0.0738
class 2:  0.2841 0.7159

$ED_NowPain_Widespread_7
           Pr(1)  Pr(2)
class 1:  0.9705 0.0295
class 2:  0.0119 0.9881

$ED_Pain_Widespread_1990
           Pr(1)  Pr(2)
class 1:  0.9077 0.0923
class 2:  0.6478 0.3522

$ED_Pain_Widespread_2016
           Pr(1)  Pr(2)
class 1:  0.9541 0.0459
class 2:  0.7266 0.2734

$ED_Pain_Widespread_7
           Pr(1)  Pr(2)
class 1:  0.9483 0.0517
class 2:  0.6699 0.3301

$ED_Pain_C
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:  0.4871 0.0948 0.0658 0.0814 0.0510 0.0615 0.0421 0.0338 0.0340 0.0218 0.0267
class 2:  0.3138 0.0744 0.0960 0.0917 0.0732 0.0793 0.0383 0.0377 0.0819 0.0453 0.0683

$ED_NowPain_C
          Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)  Pr(7)  Pr(8)  Pr(9) Pr(10) Pr(11)
class 1:      0 0.0235 0.0496 0.0741 0.1108 0.1088 0.1482 0.1410 0.1549 0.0657 0.1234
class 2:      0 0.0042 0.0111 0.0262 0.0528 0.0759 0.1228 0.2033 0.1934 0.1222 0.1883

Estimated class population shares 
 0.6262 0.3738 
 
Predicted class memberships (by modal posterior prob.) 
 0.6257 0.3743 
 
========================================================= 
Fit for 2 latent classes: 
========================================================= 
2 / 1 
========================================================= 
number of observations: 2581 
number of estimated parameters: 558 
residual degrees of freedom: 2023 
maximum log-likelihood: -101856.8 
 
AIC(2): 204829.6
BIC(2): 208097.2
X^2(2): 1.107972e+30 (Chi-square goodness of fit) 
 
pidmat <- cbind(1,c(1:7))
exb <- exp(pidmat %*% nes2a$coeff)


matplot(c(1:7),(cbind(1,exb)/(1+rowSums(exb))),ylim=c(0,1),type="l",
main="Pain Reports as a predictor of outcome class",
xlab="Strong recovery (1) to strong chronic pain (7)",
ylab="Probability of latent class membership",lwd=2,col=1)

Plots

Plot Features

gghisto <- list(
  theme(axis.text.x = element_text(size=18, color = "Navyblue"),
          axis.text.y = element_text(face="bold", 
          size=14, angle= 30, color = "Navyblue"),
          axis.title=element_text(size=17),
          plot.title = element_text(size=17,face="bold")))

elements0 <- list(
  ggtitle("Density Distribution of Pain features"),
  ylab("Density"),
  xlab("Self-Reported Pain Level"),
  geom_density(alpha=.07)
)
df.plot0 <- melt(dfx[,1:9])
No id variables; using all as measure variables
#table(df.plot)
df.plot0 <- subset(df.plot0, value != 1)
p0 <- ggplot(aes(x = value, colour = variable, fill = variable), data = df.plot0) +
  elements0 +
  gghisto


df.plot1 <- melt(dfx[,10:18])
No id variables; using all as measure variables
df.plot1 <- subset(df.plot1, value != 1)
p1 <- ggplot(aes(x = value, colour = variable, fill = variable), data = df.plot1) +
  elements0 +
  gghisto

df.plot2 <- melt(dfx[,1:18])
No id variables; using all as measure variables
df.plot2 <- subset(df.plot2, value != 1)
p2 <- ggplot(aes(x = value, colour = variable, fill = variable), data = df.plot2) +
  elements0 +
  gghisto

p0;p1;p2

ggplot(na.omit(dfb), aes(x = ED_Age, fill = factor(`M6_Pain_Widespread_2016`))) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Age by M6 Pain Response",
       x = "Age",
       y = "Density") +
  scale_fill_manual(values = c("hotpink", "lightskyblue"))+
  gghisto


ggplot(na.omit(dfx), aes(x = ED_NowPain_MdSv_COUNT, fill = factor(`M6_Pain_Widespread_2016`))) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Now Pain x M6 Pain ",
       x = "ED NOW PAIN",
       y = "Density") +
  scale_fill_manual(values = c("hotpink", "lightskyblue"))+
  gghisto

SLEEP AND MOOD

lapply(df[, 42:76], function(col) sort(unique(col), decreasing = TRUE, na.last = TRUE))
Error in `[.data.frame`(df, , 42:76) : undefined columns selected

Initial correlations

correlate(df0[,c(2:5,30:50)])
Non-numeric variables removed from input: `ED_NowPain_YN_COUNT`, `ED_Pain_YN_COUNT`, `ED_NowPain_MdSv_COUNT`, `ED_Pain_MdSv_COUNT`, `ED_NowPain_Widespread_1990`, `ED_NowPain_Widespread_2016`, `ED_NowPain_Widespread_7`, `ED_Pain_Widespread_1990`, `ED_Pain_Widespread_2016`, `ED_Pain_Widespread_7`, `ED_Pain_C`, and `ED_NowPain_C`Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'

Ordinal Logistic Regression

dfx$M6_Pain_Widespread_2016 <- as.factor(dfx$M6_Pain_Widespread_2016)

# Check the levels of the factor
levels(dfx$M6_Pain_Widespread_2016)

# Fit logistic regression model
logit_model <- glm(M6_Pain_Widespread_2016 ~ ., data = dfx, family = binomial)

# Summary of the model
summary(logit_model)
#filter significant features
coefficients_table <- summary(logit_model)$coefficients
# Filter coefficients table based on significance level
significant_features <- coefficients_table[coefficients_table[, "Pr(>|z|)"] < 0.01, ]

# View the significant features
print(significant_features)
                            Estimate Std. Error   z value     Pr(>|z|)
(Intercept)              -3.21531944 0.36544644 -8.798333 1.388637e-18
ED_NowPain_Jaw           -0.06318017 0.02128283 -2.968599 2.991608e-03
ED_NowPain_RightShoulder -0.06374031 0.02137345 -2.982220 2.861665e-03
ED_NowPain_LowerBack      0.04231805 0.01573973  2.688614 7.174931e-03
ED_NowPain_MdSv_COUNT     0.14014139 0.04243127  3.302785 9.572967e-04
ED_Pain_MdSv_COUNT       -0.08215910 0.02830421 -2.902716 3.699417e-03
ED_Pain_Widespread_2016   0.92097171 0.25563642  3.602662 3.149745e-04
ED_Pain_C                 0.09180860 0.02004184  4.580847 4.630975e-06
exp(0.92097171)
[1] 2.51173
exp(0.14014139)
[1] 1.150436

The ED_Pain_Widespread_2016 feature is interesting. This means that the likelihood of the 6 month outcome being a 1 instead of a 0 increases by a factor of approximately \(exp(0.92)\) = 2.51x. For ED_NowPain_MdSv_COUNT, the increase is \(exp(0.14)\) = 1.15.

BIC(logit_model)
[1] 3216.773

These values aren’t great. this model needs feature selection.

feature selection

summary(stepwise_model)

Call:
glm(formula = M6_Pain_Widespread_2016 ~ ED_NowPain_Neck + ED_NowPain_Jaw + 
    ED_NowPain_RightShoulder + ED_NowPain_LeftLowerArm + ED_NowPain_LowerBack + 
    ED_NowPain_Abdomen + ED_NowPain_LeftHipUpperLeg + ED_NowPain_RightHipUpperLeg + 
    ED_Pain_YN_COUNT + ED_NowPain_MdSv_COUNT + ED_Pain_MdSv_COUNT + 
    ED_NowPain_Widespread_1990 + ED_NowPain_Widespread_7 + ED_Pain_Widespread_1990 + 
    ED_Pain_Widespread_2016 + ED_Pain_C + ED_NowPain_C, family = binomial, 
    data = dfx)

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -3.35534    0.32506 -10.322  < 2e-16 ***
ED_NowPain_Neck              0.02377    0.01636   1.453 0.146164    
ED_NowPain_Jaw              -0.06458    0.02037  -3.171 0.001522 ** 
ED_NowPain_RightShoulder    -0.06999    0.01790  -3.910 9.24e-05 ***
ED_NowPain_LeftLowerArm     -0.03231    0.01979  -1.633 0.102474    
ED_NowPain_LowerBack         0.04420    0.01442   3.065 0.002180 ** 
ED_NowPain_Abdomen          -0.03991    0.02007  -1.988 0.046769 *  
ED_NowPain_LeftHipUpperLeg  -0.03071    0.01688  -1.820 0.068826 .  
ED_NowPain_RightHipUpperLeg  0.04436    0.01677   2.645 0.008161 ** 
ED_Pain_YN_COUNT             0.04808    0.02770   1.736 0.082528 .  
ED_NowPain_MdSv_COUNT        0.16557    0.03029   5.467 4.58e-08 ***
ED_Pain_MdSv_COUNT          -0.08583    0.02793  -3.073 0.002120 ** 
ED_NowPain_Widespread_1990  -0.31922    0.12322  -2.591 0.009581 ** 
ED_NowPain_Widespread_7      0.30888    0.13990   2.208 0.027252 *  
ED_Pain_Widespread_1990      0.39927    0.20187   1.978 0.047943 *  
ED_Pain_Widespread_2016      0.89649    0.24962   3.591 0.000329 ***
ED_Pain_C                    0.09186    0.01973   4.657 3.21e-06 ***
ED_NowPain_C                 0.05162    0.02381   2.168 0.030192 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3495.2  on 2580  degrees of freedom
Residual deviance: 2978.7  on 2563  degrees of freedom
AIC: 3014.7

Number of Fisher Scoring iterations: 4



---
title: "R Notebook"
output: html_notebook
---

### Import Libraries, import data
```{r}
library("poLCA");library(tidyverse);library(corrr);library(dplyr); library(ggplot2); library(reshape2);library(gtsummary);library(nnet);library(pROC);library(MASS)
df0 <- read.csv("DF_Alice.csv");
```


### BASIC SUMMARY

#### Clean the data first 
```{r}
#clean gender sex
non_matching_gender <- which(df0$ED_GenderBirthCert != df0$ED_GenderNow)

print("Values at non-matching indices:");
print(df0[non_matching_gender, c("ED_GenderBirthCert", "ED_GenderNow")])

#There are 18 transgender patients. I'm going to remove them bc we don;t know if they have used HRT
```


```{r}
#clean gender
dfb <- df0[df0$ED_GenderBirthCert == df0$ED_GenderNow, ]
dfb <- dfb[,c(7:11,289)]

unique(dfb$ED_GenderNow)
```


```{r}
#clean marital status
table(dfb$ED_Marital)

#remove and confirm unknown marital status
dfb <- dfb[!(dfb$ED_Marital %in% c(-5, -7, -8)), ]
unique(dfb$ED_Marital)
```

```{r}
#clean race
unique(dfb$ED_RaceEthCode)
#this is fine
```

```{r}
#clean education
#table(dfb$ED_HighestGrade)

dfb <- dfb[!(dfb$ED_HighestGrade %in% c(-5, -7, -8)), ]
table(dfb$ED_HighestGrade)

#this needs to be converted to fewer categories
```

#### Summary Tables
```{r}
#summarize
dfb %>%
  tbl_summary(missing = "no")

dfb %>%
  tbl_summary(by = M6_Pain_Widespread_2016,missing = "no") %>%
  add_p(workspace = 2e8)  %>%
  add_overall() %>%
  modify_header(label ~ "**2016_Pain Outcome**") %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Treatment Received**")

```



### Examining Features

#### Clean data to look at Pain. Reset Pain Values so it is all integers. Remove -8 value which is "I don't know". Increment each integer so the range is 1-11 instead of 0-10 because the LCA model doesn't classify 0s. 
```{r}
#reassign values to all be integers

df0[, 12:41] <- apply(df0[, 12:41], 2, function(col) ifelse(col == "No pain or tenderness", 0, col));
df0[, 12:41] <- apply(df0[, 12:41], 2, function(col) ifelse(col == "Severe pain or tenderness", 10, col))
```

```{r}
#remove -8 i don't know values
#remove NA cases
df <- subset(df0, !apply(df0[, 12:41], 1, function(x) any(x == -8)))
df <- df[complete.cases(df[, c(12:41)]), ]

#increment by 1 for LCA
df[, 12:41] <- lapply(df[, 12:41], as.integer)
df[, 12:41] <- df[, 12:41] + 1

#1 = no pain # 10= severe pain
table(df[,14])
```


#### Determine outcome classifier. Can M6 pain be used as our predictor feature? Which M6 feature?
```{r}
#looking at some m6 outcomes

table(df0$M6_Pain_YN_COUNT) #number of body parts with pain
table(df0$M6_Pain_C) #number of body parts with worsening pain?
table(df0$M6_Pain_MdSv_COUNT) #number of body parts with moderate to severe pain? 
table(df0$M6_Pain_Widespread_2016) #unclear - Zinchuk 2016 questionnaire? #col 289
```

#### The 2016 outcome seems good for a binary classifier. Though I'm not sure what it classifies. 
```{r}
#subset to just pain features and 2016 pain classifier
df <- df[,c(12:41,289)]
dfx <- df[complete.cases(df[,]), ] #remove about 2k cases for complete cases
```

#### Build a Latent Classifier Model
```{r}
#sapply(df, typeof)

f2a <- cbind(ED_NowPain_Head, ED_NowPain_Neck, ED_NowPain_Jaw, ED_NowPain_LeftShoulder, ED_NowPain_RightShoulder, ED_NowPain_LeftUpperArm, ED_NowPain_RightUpperArm, ED_NowPain_LeftLowerArm, ED_NowPain_RightLowerArm, ED_NowPain_Chest, ED_NowPain_UpperBack, ED_NowPain_LowerBack, ED_NowPain_Abdomen, ED_NowPain_Genital, ED_NowPain_LeftHipUpperLeg, ED_NowPain_RightHipUpperLeg, ED_NowPain_LeftLowerLeg, ED_NowPain_RightLowerLeg, ED_NowPain_YN_COUNT, ED_Pain_YN_COUNT, ED_NowPain_MdSv_COUNT, ED_Pain_MdSv_COUNT, ED_NowPain_Widespread_1990, ED_NowPain_Widespread_2016, ED_NowPain_Widespread_7, ED_Pain_Widespread_1990, ED_Pain_Widespread_2016, ED_Pain_Widespread_7, ED_Pain_C, ED_NowPain_C)~M6_Pain_Widespread_2016


nes2a <- poLCA(f2a,dfx,nclass=2) 
pidmat <- cbind(1,c(1:7))
exb <- exp(pidmat %*% nes2a$coeff)


matplot(c(1:7),(cbind(1,exb)/(1+rowSums(exb))),ylim=c(0,1),type="l",
main="Pain Reports as a predictor of outcome class",
xlab="Strong recovery (1) to strong chronic pain (7)",
ylab="Probability of latent class membership",lwd=2,col=1)

```

### Plots
#### Plot Features
```{r}
gghisto <- list(
  theme(axis.text.x = element_text(size=18, color = "Navyblue"),
          axis.text.y = element_text(face="bold", 
          size=14, angle= 30, color = "Navyblue"),
          axis.title=element_text(size=17),
          plot.title = element_text(size=17,face="bold")))

elements0 <- list(
  ggtitle("Density Distribution of Pain features"),
  ylab("Density"),
  xlab("Self-Reported Pain Level"),
  geom_density(alpha=.07)
)
```


```{r}
df.plot0 <- melt(dfx[,1:9])
#table(df.plot)
df.plot0 <- subset(df.plot0, value != 1)
p0 <- ggplot(aes(x = value, colour = variable, fill = variable), data = df.plot0) +
  elements0 +
  gghisto


df.plot1 <- melt(dfx[,10:18])
df.plot1 <- subset(df.plot1, value != 1)
p1 <- ggplot(aes(x = value, colour = variable, fill = variable), data = df.plot1) +
  elements0 +
  gghisto

df.plot2 <- melt(dfx[,1:18])
df.plot2 <- subset(df.plot2, value != 1)
p2 <- ggplot(aes(x = value, colour = variable, fill = variable), data = df.plot2) +
  elements0 +
  gghisto

p0;p1;p2
```


```{r}
ggplot(na.omit(dfb), aes(x = ED_Age, fill = factor(`M6_Pain_Widespread_2016`))) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Age by M6 Pain Response",
       x = "Age",
       y = "Density") +
  scale_fill_manual(values = c("hotpink", "lightskyblue"))+
  gghisto

ggplot(na.omit(dfx), aes(x = ED_NowPain_MdSv_COUNT, fill = factor(`M6_Pain_Widespread_2016`))) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Now Pain x M6 Pain ",
       x = "ED NOW PAIN",
       y = "Density") +
  scale_fill_manual(values = c("hotpink", "lightskyblue"))+
  gghisto
```





#### SLEEP AND MOOD 
```{r}
#lapply(df[, 42:76], function(col) sort(unique(col), decreasing = TRUE, na.last = TRUE))

```



### Initial correlations
```{r}
correlate(df0[,c(2:5,30:50)])
```
### Ordinal Logistic Regression

```{r}
dfx$M6_Pain_Widespread_2016 <- as.factor(dfx$M6_Pain_Widespread_2016)

# Check the levels of the factor
levels(dfx$M6_Pain_Widespread_2016)

# Fit logistic regression model
logit_model <- glm(M6_Pain_Widespread_2016 ~ ., data = dfx, family = binomial)

# Summary of the model
summary(logit_model)
```


```{r}
#filter significant features
coefficients_table <- summary(logit_model)$coefficients
# Filter coefficients table based on significance level
significant_features <- coefficients_table[coefficients_table[, "Pr(>|z|)"] < 0.01, ]

# View the significant features
print(significant_features)
exp(0.92097171)
exp(0.14014139)

```

The `ED_Pain_Widespread_2016` feature is interesting. This means that the likelihood of the 6 month outcome being a 1 instead of a 0  increases by a factor of approximately $exp(0.92)$ = 2.51x. For `ED_NowPain_MdSv_COUNT`, the increase is $exp(0.14)$ = 1.15. 

```{r}
#testing the GLM

lrt <- anova(logit_model, test = "Chisq")
lrt


# Calculate AIC/ BIC
AIC(logit_model)
BIC(logit_model)
```

#### These values aren't great. this model needs feature selection. 
feature selection
```{r}
#m2 <- glm(M6_Pain_Widespread_2016 ~ ., data = dfx, family = binomial)
# Perform stepwise selection
#stepwise_model <- step(m2)

m3 <- glm(M6_Pain_Widespread_2016 ~ ., data = dfx, family = binomial)
# Perform stepwise selection with AIC
stepwise_model <- stepAIC(m3, direction = "both")

summary(stepwise_model)
```

---
---


---
---


---


---


---


