Data clean up

#library
pacman::p_load(
tidyverse,data.table,tableone,tibble,stringi,readxl,dplyr,scales,ggsci,
skimr,Epi,janitor,summarytools,broom,easystats,car,rms)

#conflict解消
select<-dplyr::select 
#factor var
fac_var<-c("RANDID","SEX", "CURSMOKE", "DIABETES", "BPMEDS","educ","PREVCHD", "PREVAP", "PREVMI", "PREVSTRK", "PREVHYP", "PERIOD", "DEATH", 
           "ANGINA", "HOSPMI", "MI_FCHD", "ANYCHD", "STROKE", "CVD", "HYPERTEN")

#data import
fr<-read_csv("frmgham2.csv") %>% 
  mutate(across(fac_var,as.factor))
## Rows: 11627 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (39): RANDID, SEX, TOTCHOL, AGE, SYSBP, DIABP, CURSMOKE, CIGPDAY, BMI, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(fac_var, as.factor)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(fac_var)
## 
##   # Now:
##   data %>% select(all_of(fac_var))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
fr %>% 
  head(10)
## # A tibble: 10 × 39
##    RANDID SEX   TOTCHOL   AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
##    <fct>  <fct>   <dbl> <dbl> <dbl> <dbl> <fct>      <dbl> <dbl> <fct>    <fct> 
##  1 2448   1         195    39  106   70   0              0  27.0 0        0     
##  2 2448   1         209    52  121   66   0              0  NA   0        0     
##  3 6238   2         250    46  121   81   0              0  28.7 0        0     
##  4 6238   2         260    52  105   69.5 0              0  29.4 0        0     
##  5 6238   2         237    58  108   66   0              0  28.5 0        0     
##  6 9428   1         245    48  128.  80   1             20  25.3 0        0     
##  7 9428   1         283    54  141   89   1             30  25.3 0        0     
##  8 10552  2         225    61  150   95   1             30  28.6 0        0     
##  9 10552  2         232    67  183  109   1             20  30.2 0        0     
## 10 11252  2         285    46  130   84   1             23  23.1 0        0     
## # ℹ 28 more variables: HEARTRTE <dbl>, GLUCOSE <dbl>, educ <fct>,
## #   PREVCHD <fct>, PREVAP <fct>, PREVMI <fct>, PREVSTRK <fct>, PREVHYP <fct>,
## #   TIME <dbl>, PERIOD <fct>, HDLC <dbl>, LDLC <dbl>, DEATH <fct>,
## #   ANGINA <fct>, HOSPMI <fct>, MI_FCHD <fct>, ANYCHD <fct>, STROKE <fct>,
## #   CVD <fct>, HYPERTEN <fct>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## #   TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>, TIMEHYP <dbl>
fr %>% 
  skim()
Data summary
Name Piped data
Number of rows 11627
Number of columns 39
_______________________
Column type frequency:
factor 20
numeric 19
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
RANDID 0 1.00 FALSE 4434 623: 3, 112: 3, 112: 3, 128: 3
SEX 0 1.00 FALSE 2 2: 6605, 1: 5022
CURSMOKE 0 1.00 FALSE 2 0: 6598, 1: 5029
DIABETES 0 1.00 FALSE 2 0: 11097, 1: 530
BPMEDS 593 0.95 FALSE 2 0: 10090, 1: 944
educ 295 0.97 FALSE 4 1: 4690, 2: 3410, 3: 1885, 4: 1347
PREVCHD 0 1.00 FALSE 2 0: 10785, 1: 842
PREVAP 0 1.00 FALSE 2 0: 11000, 1: 627
PREVMI 0 1.00 FALSE 2 0: 11253, 1: 374
PREVSTRK 0 1.00 FALSE 2 0: 11475, 1: 152
PREVHYP 0 1.00 FALSE 2 0: 6283, 1: 5344
PERIOD 0 1.00 FALSE 3 1: 4434, 2: 3930, 3: 3263
DEATH 0 1.00 FALSE 2 0: 8100, 1: 3527
ANGINA 0 1.00 FALSE 2 0: 9725, 1: 1902
HOSPMI 0 1.00 FALSE 2 0: 10473, 1: 1154
MI_FCHD 0 1.00 FALSE 2 0: 9839, 1: 1788
ANYCHD 0 1.00 FALSE 2 0: 8469, 1: 3158
STROKE 0 1.00 FALSE 2 0: 10566, 1: 1061
CVD 0 1.00 FALSE 2 0: 8728, 1: 2899
HYPERTEN 0 1.00 FALSE 2 1: 8642, 0: 2985

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TOTCHOL 409 0.96 241.16 45.37 107.00 210.00 238.00 268.00 696.0 ▅▇▁▁▁
AGE 0 1.00 54.79 9.56 32.00 48.00 54.00 62.00 81.0 ▂▇▇▅▁
SYSBP 0 1.00 136.32 22.80 83.50 120.00 132.00 149.00 295.0 ▆▇▁▁▁
DIABP 0 1.00 83.04 11.66 30.00 75.00 82.00 90.00 150.0 ▁▅▇▁▁
CIGPDAY 79 0.99 8.25 12.19 0.00 0.00 0.00 20.00 90.0 ▇▂▁▁▁
BMI 52 1.00 25.88 4.10 14.43 23.09 25.48 28.07 56.8 ▃▇▁▁▁
HEARTRTE 6 1.00 76.78 12.46 37.00 69.00 75.00 85.00 220.0 ▆▇▁▁▁
GLUCOSE 1440 0.88 84.12 24.99 39.00 72.00 80.00 89.00 478.0 ▇▁▁▁▁
TIME 0 1.00 1957.02 1758.78 0.00 0.00 2156.00 4252.50 4854.0 ▇▁▇▁▆
HDLC 8600 0.26 49.36 15.63 10.00 39.00 48.00 58.00 189.0 ▆▇▁▁▁
LDLC 8601 0.26 176.47 46.86 20.00 145.00 173.00 205.00 565.0 ▂▇▁▁▁
TIMEAP 0 1.00 7241.56 2477.78 0.00 6224.00 8766.00 8766.00 8766.0 ▁▁▁▁▇
TIMEMI 0 1.00 7593.85 2136.73 0.00 7212.00 8766.00 8766.00 8766.0 ▁▁▁▁▇
TIMEMIFC 0 1.00 7543.04 2192.12 0.00 7049.50 8766.00 8766.00 8766.0 ▁▁▁▁▇
TIMECHD 0 1.00 7008.15 2641.34 0.00 5598.50 8766.00 8766.00 8766.0 ▁▁▁▁▇
TIMESTRK 0 1.00 7660.88 2011.08 0.00 7295.00 8766.00 8766.00 8766.0 ▁▁▁▁▇
TIMECVD 0 1.00 7166.08 2541.67 0.00 6004.00 8766.00 8766.00 8766.0 ▁▁▁▁▇
TIMEDTH 0 1.00 7854.10 1788.37 26.00 7797.50 8766.00 8766.00 8766.0 ▁▁▁▁▇
TIMEHYP 0 1.00 3598.96 3464.16 0.00 0.00 2429.00 7329.00 8766.0 ▇▂▂▂▅
#continuous var distribution

fr %>% 
  select(-c(fac_var)) %>% 
  pivot_longer(cols=everything(),
               names_to = "term",
               values_to = "value") %>% 
  ggplot(aes(x=value))+
  geom_histogram()+
  facet_wrap(~term,scales = "free")+
  #x軸の文字を小さくする
  theme(axis.text.x = element_text(size=5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19187 rows containing non-finite values (`stat_bin()`).

Research Questionの模索

outcomeの選定

#stroke
fr %>% 
  distinct(RANDID, .keep_all = T) %>%
  count(STROKE)
## # A tibble: 2 × 2
##   STROKE     n
##   <fct>  <int>
## 1 0       4019
## 2 1        415

Strokeのアウトカム発生数は415あるので、多変量解析で組み込める予測因子の数も十分確保できそう

STROKEと連続変数の関係性(非線形性の検討)

  • 単変量での非線形性をRCSで評価
  • あくまでもtime to eventではなく、binaryのoutcomeとの関係性を見ていることに注意
  • Baselineにおける変数の結果とアウトカムの関連を評価→PERIOD=1
#continuous var

fr %>% 
  select(-c(fac_var)) %>% 
  colnames() %>%
  dput()
## c("TOTCHOL", "AGE", "SYSBP", "DIABP", "CIGPDAY", "BMI", "HEARTRTE", 
## "GLUCOSE", "TIME", "HDLC", "LDLC", "TIMEAP", "TIMEMI", "TIMEMIFC", 
## "TIMECHD", "TIMESTRK", "TIMECVD", "TIMEDTH", "TIMEHYP")
cont_var<-c("TOTCHOL", "AGE", "SYSBP", "DIABP", "CIGPDAY", "BMI", "HEARTRTE", "GLUCOSE")
#事前準備
ddist <- datadist(fr) 
options(datadist='ddist') 


#rcs()で変数の非線形を構築、4でknotsを指定

##plot
rcs_plot<-
  function(x,n=4,d=fr){
    #x:explanatory variable 
    #n:knot number
    #d=data set
    
rcs_formula_str <- paste("STROKE~ rcs(", x, ",",n,")")
rcs<-lrm(as.formula(rcs_formula_str),data=d) 
  Predict(rcs) %>% 
  ggplot()
}

##stat
rcs_stat<-function(x,d=fr){
  #x:explanatory variable
  #d:data set

  for(i in 1:length(x)){
    rcs_formula_str <- paste("STROKE~ rcs(", x[i], ",4)")
    rcs<-lrm(as.formula(rcs_formula_str),data=d) 
    print(rcs %>% 
            anova())
  }
}
library(gridExtra)
## 
##  次のパッケージを付け加えます: 'gridExtra'
##  以下のオブジェクトは 'package:dplyr' からマスクされています:
## 
##     combine
#plot in all exam time
grid.arrange(
  rcs_plot("AGE"),
  rcs_plot("SYSBP"),
  rcs_plot("DIABP"),
  rcs_plot("CIGPDAY"),
  rcs_plot("BMI"),
  rcs_plot("HEARTRTE"),
  rcs_plot("GLUCOSE"),
  rcs_plot("TOTCHOL"),
  ncol=3
)

  • all time examにおける単変量解析で非線形性評価
  • 視覚的に非線形性がありそうなのは、DBP,BMI,HR,TCHO
  • 特にDBP,BMIはU字型で最適な(最もリスクが低い)範囲が有りそう
  • HR,TCHOはある一定の値よりリスク増加の関係性があるように見える
  • かなり強い関連性があるように見えるのは、AGE,SBP,DBP,BMI,HR,GLU
#plot in baseline variable

fr_p1<-fr %>% filter(PERIOD==1)

grid.arrange(
  rcs_plot("AGE",d=fr_p1),
  rcs_plot("SYSBP",d=fr_p1),
  rcs_plot("DIABP",d=fr_p1),
  rcs_plot("CIGPDAY",d=fr_p1),
  rcs_plot("BMI",d=fr_p1),
  rcs_plot("HEARTRTE",d=fr_p1),
  rcs_plot("GLUCOSE",d=fr_p1),
  rcs_plot("TOTCHOL",d=fr_p1),
  ncol=3
)

  • Baseline variableにおける非線形性の評価
  • 非線形性がありそうなのはBMI, HR, GLU
#統計解析でも非線形性を一応評価

#rcs_stat(cont_var)
rcs_stat(cont_var,d=fr_p1)
##                 Wald Statistics          Response: STROKE 
## 
##  Factor     Chi-Square d.f. P     
##  TOTCHOL    5.12       3    0.1632
##   Nonlinear 0.04       2    0.9805
##  TOTAL      5.12       3    0.1632
##                 Wald Statistics          Response: STROKE 
## 
##  Factor     Chi-Square d.f. P     
##  AGE        163.68     3    <.0001
##   Nonlinear   7.49     2    0.0236
##  TOTAL      163.68     3    <.0001
##                 Wald Statistics          Response: STROKE 
## 
##  Factor     Chi-Square d.f. P     
##  SYSBP      162.79     3    <.0001
##   Nonlinear   7.17     2    0.0277
##  TOTAL      162.79     3    <.0001
##                 Wald Statistics          Response: STROKE 
## 
##  Factor     Chi-Square d.f. P     
##  DIABP      120.61     3    <.0001
##   Nonlinear   1.57     2    0.4558
##  TOTAL      120.61     3    <.0001
##                 Wald Statistics          Response: STROKE 
## 
##  Factor     Chi-Square d.f. P     
##  CIGPDAY    1.11       2    0.5740
##   Nonlinear 0.82       1    0.3645
##  TOTAL      1.11       2    0.5740
##                 Wald Statistics          Response: STROKE 
## 
##  Factor     Chi-Square d.f. P     
##  BMI        37.17      3    <.0001
##   Nonlinear  3.15      2    0.2071
##  TOTAL      37.17      3    <.0001
##                 Wald Statistics          Response: STROKE 
## 
##  Factor     Chi-Square d.f. P     
##  HEARTRTE   5.22       3    0.1565
##   Nonlinear 5.00       2    0.0823
##  TOTAL      5.22       3    0.1565
##                 Wald Statistics          Response: STROKE 
## 
##  Factor     Chi-Square d.f. P     
##  GLUCOSE    16.95      3    0.0007
##   Nonlinear  7.16      2    0.0279
##  TOTAL      16.95      3    0.0007

交互作用の検討

  • カテゴリカル変数、連続変数すべての組み合わせで交互作用を検討
  • 使用するvariableを定義

data drivenに選択する方法(望ましくない)

var<-c("SEX", "CURSMOKE", "DIABETES", "BPMEDS", "PREVCHD", "PREVAP", "PREVMI", "PREVSTRK", "PREVHYP", cont_var)

#組み合わせの総数
choose(length(var),2) #136通り
## [1] 136
#交互作用の組み合わせで回帰式作成
inter_formula<-
  combn(x=var,m=2) %>%
  matrix(nrow=136,ncol=2,byrow=T) %>% 
  as.data.frame() %>% 
  mutate(inter=str_c(V1,"*",V2),
         formula=str_c("STROKE~",V1,"+",V2,"+",inter)) 

#上記で作成した回帰式を用いて、尤度比検定を一括で実施
mat<-matrix(nrow=136,ncol=4,
              dimnames=list(NULL,c("var","LR","Df","p")))

for(i in 1:nrow(inter_formula)){
  
  res<-
    glm(as.formula(inter_formula$formula[i]),data=fr_p1,family = binomial(link = "logit")) %>% 
    car::Anova(type=2,test.statistic = "LR") %>% 
    as.data.frame() %>% 
    rownames_to_column() %>% 
    rename("var"="rowname","LR"="LR Chisq","Df"="Df","p"="Pr(>Chisq)") %>% 
    .[3,] %>% 
    as.matrix()
  
  mat[i,]<-res
}

#p<0.05の交互作用項を抽出
mat %>% 
  as.data.frame() %>% 
  filter(p<0.05) 
##                  var       LR Df           p
## 1        SEX:PREVCHD 5.843505  1  0.01563465
## 2         SEX:PREVAP 3.846206  1  0.04985864
## 3         SEX:PREVMI 4.708745  1  0.03000955
## 4        SEX:PREVHYP 8.646947  1 0.003276098
## 5          SEX:DIABP 8.249965  1 0.004075279
## 6  CURSMOKE:HEARTRTE 3.862393  1  0.04937986
## 7   DIABETES:TOTCHOL 4.273419  1  0.03871297
## 8     DIABETES:SYSBP 4.622708  1  0.03155139
## 9     BPMEDS:PREVCHD 4.369406  1  0.03658975
## 10    BPMEDS:TOTCHOL 7.385464  1  0.00657531
## 11        BPMEDS:AGE 4.632506  1  0.03137171
## 12   PREVCHD:GLUCOSE 4.150657  1  0.04161838
## 13       PREVHYP:BMI 4.158136  1  0.04143499
## 14       TOTCHOL:AGE 4.629455  1  0.03142754
## 15  HEARTRTE:GLUCOSE 4.014679  1  0.04510582
  • 尤度比検定で交互作用項がp<0.05になっているものを算出
  • data drivenな交互作用項の選択になっているので、多くの問題をはらんでいる:
    1. 多重比較の問題
    2. testimation bias
  • 実際にはこのようなやり方で交互作用項を検討するのは上記のような問題が有り、特に因果推論の文脈では臨床的な仮定に基づきa prioriに交互作用項を選択するのが望ましい。
  • 予測研究においては、最終的に得られるモデルの予測性能にのみ興味があるため、testimation biasによるoverfittingを対処できるのであれば、変数選択に尤度比検定を用いてもいいのかもしれない。今回はこの分野の臨床的なドメイン知識が乏しいので、尤度比検定で交互作用項を選択する。その代わり、回帰モデルではshrinkageによる対処を行う予定。
#交互作用項として選択するもの

mat %>% 
  as.data.frame() %>% 
  filter(p<0.05) %>% 
  select(var) %>% 
  as.vector() %>% 
  dput()
## list(var = c("SEX:PREVCHD", "SEX:PREVAP", "SEX:PREVMI", "SEX:PREVHYP", 
## "SEX:DIABP", "CURSMOKE:HEARTRTE", "DIABETES:TOTCHOL", "DIABETES:SYSBP", 
## "BPMEDS:PREVCHD", "BPMEDS:TOTCHOL", "BPMEDS:AGE", "PREVCHD:GLUCOSE", 
## "PREVHYP:BMI", "TOTCHOL:AGE", "HEARTRTE:GLUCOSE"))
inter_var<-c("SEX:PREVCHD", "SEX:PREVAP", "SEX:PREVMI", "SEX:PREVHYP", "SEX:DIABP", "CURSMOKE:HEARTRTE","DIABETES:TOTCHOL", "DIABETES:SYSBP",
             "BPMEDS:PREVCHD", "BPMEDS:TOTCHOL", "BPMEDS:AGE", "PREVCHD:GLUCOSE", "PREVHYP:BMI", "TOTCHOL:AGE","HEARTRTE:GLUCOSE")

full modelの構築

#nonlinelity term
nonlin_term<-c("BMI","HEARTRTE","GLUCOSE")

#vecからnonlin_term分の要素を削除
var[!var %in% nonlin_term]
##  [1] "SEX"      "CURSMOKE" "DIABETES" "BPMEDS"   "PREVCHD"  "PREVAP"  
##  [7] "PREVMI"   "PREVSTRK" "PREVHYP"  "TOTCHOL"  "AGE"      "SYSBP"   
## [13] "DIABP"    "CIGPDAY"
fullmodel<-
  paste0("STROKE~",
       str_c(var[!var %in% nonlin_term],collapse = "+"), #非線形以外のvarを+でつなぐ
       "+",
       str_c(paste0("rcs(",nonlin_term,",4)"),collapse = "+"), #非線形のvarを+でつなぐ
       "+",
       str_c(inter_var,collapse = "+")) #交互作用項を+でつなぐ

fullmodel
## [1] "STROKE~SEX+CURSMOKE+DIABETES+BPMEDS+PREVCHD+PREVAP+PREVMI+PREVSTRK+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+rcs(BMI,4)+rcs(HEARTRTE,4)+rcs(GLUCOSE,4)+SEX:PREVCHD+SEX:PREVAP+SEX:PREVMI+SEX:PREVHYP+SEX:DIABP+CURSMOKE:HEARTRTE+DIABETES:TOTCHOL+DIABETES:SYSBP+BPMEDS:PREVCHD+BPMEDS:TOTCHOL+BPMEDS:AGE+PREVCHD:GLUCOSE+PREVHYP:BMI+TOTCHOL:AGE+HEARTRTE:GLUCOSE"

欠測補完→別のRmdで実行