#library
pacman::p_load(
tidyverse,data.table,tableone,tibble,stringi,readxl,dplyr,scales,ggsci,
skimr,Epi,janitor,summarytools,broom,easystats,car,rms)
#conflict解消
select<-dplyr::select
#factor var
fac_var<-c("RANDID","SEX", "CURSMOKE", "DIABETES", "BPMEDS","educ","PREVCHD", "PREVAP", "PREVMI", "PREVSTRK", "PREVHYP", "PERIOD", "DEATH",
"ANGINA", "HOSPMI", "MI_FCHD", "ANYCHD", "STROKE", "CVD", "HYPERTEN")
#data import
fr<-read_csv("frmgham2.csv") %>%
mutate(across(fac_var,as.factor))
## Rows: 11627 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (39): RANDID, SEX, TOTCHOL, AGE, SYSBP, DIABP, CURSMOKE, CIGPDAY, BMI, D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(fac_var, as.factor)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(fac_var)
##
## # Now:
## data %>% select(all_of(fac_var))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
fr %>%
head(10)
## # A tibble: 10 × 39
## RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY BMI DIABETES BPMEDS
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl> <fct> <fct>
## 1 2448 1 195 39 106 70 0 0 27.0 0 0
## 2 2448 1 209 52 121 66 0 0 NA 0 0
## 3 6238 2 250 46 121 81 0 0 28.7 0 0
## 4 6238 2 260 52 105 69.5 0 0 29.4 0 0
## 5 6238 2 237 58 108 66 0 0 28.5 0 0
## 6 9428 1 245 48 128. 80 1 20 25.3 0 0
## 7 9428 1 283 54 141 89 1 30 25.3 0 0
## 8 10552 2 225 61 150 95 1 30 28.6 0 0
## 9 10552 2 232 67 183 109 1 20 30.2 0 0
## 10 11252 2 285 46 130 84 1 23 23.1 0 0
## # ℹ 28 more variables: HEARTRTE <dbl>, GLUCOSE <dbl>, educ <fct>,
## # PREVCHD <fct>, PREVAP <fct>, PREVMI <fct>, PREVSTRK <fct>, PREVHYP <fct>,
## # TIME <dbl>, PERIOD <fct>, HDLC <dbl>, LDLC <dbl>, DEATH <fct>,
## # ANGINA <fct>, HOSPMI <fct>, MI_FCHD <fct>, ANYCHD <fct>, STROKE <fct>,
## # CVD <fct>, HYPERTEN <fct>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## # TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>, TIMEHYP <dbl>
fr %>%
skim()
Name | Piped data |
Number of rows | 11627 |
Number of columns | 39 |
_______________________ | |
Column type frequency: | |
factor | 20 |
numeric | 19 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
RANDID | 0 | 1.00 | FALSE | 4434 | 623: 3, 112: 3, 112: 3, 128: 3 |
SEX | 0 | 1.00 | FALSE | 2 | 2: 6605, 1: 5022 |
CURSMOKE | 0 | 1.00 | FALSE | 2 | 0: 6598, 1: 5029 |
DIABETES | 0 | 1.00 | FALSE | 2 | 0: 11097, 1: 530 |
BPMEDS | 593 | 0.95 | FALSE | 2 | 0: 10090, 1: 944 |
educ | 295 | 0.97 | FALSE | 4 | 1: 4690, 2: 3410, 3: 1885, 4: 1347 |
PREVCHD | 0 | 1.00 | FALSE | 2 | 0: 10785, 1: 842 |
PREVAP | 0 | 1.00 | FALSE | 2 | 0: 11000, 1: 627 |
PREVMI | 0 | 1.00 | FALSE | 2 | 0: 11253, 1: 374 |
PREVSTRK | 0 | 1.00 | FALSE | 2 | 0: 11475, 1: 152 |
PREVHYP | 0 | 1.00 | FALSE | 2 | 0: 6283, 1: 5344 |
PERIOD | 0 | 1.00 | FALSE | 3 | 1: 4434, 2: 3930, 3: 3263 |
DEATH | 0 | 1.00 | FALSE | 2 | 0: 8100, 1: 3527 |
ANGINA | 0 | 1.00 | FALSE | 2 | 0: 9725, 1: 1902 |
HOSPMI | 0 | 1.00 | FALSE | 2 | 0: 10473, 1: 1154 |
MI_FCHD | 0 | 1.00 | FALSE | 2 | 0: 9839, 1: 1788 |
ANYCHD | 0 | 1.00 | FALSE | 2 | 0: 8469, 1: 3158 |
STROKE | 0 | 1.00 | FALSE | 2 | 0: 10566, 1: 1061 |
CVD | 0 | 1.00 | FALSE | 2 | 0: 8728, 1: 2899 |
HYPERTEN | 0 | 1.00 | FALSE | 2 | 1: 8642, 0: 2985 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
TOTCHOL | 409 | 0.96 | 241.16 | 45.37 | 107.00 | 210.00 | 238.00 | 268.00 | 696.0 | ▅▇▁▁▁ |
AGE | 0 | 1.00 | 54.79 | 9.56 | 32.00 | 48.00 | 54.00 | 62.00 | 81.0 | ▂▇▇▅▁ |
SYSBP | 0 | 1.00 | 136.32 | 22.80 | 83.50 | 120.00 | 132.00 | 149.00 | 295.0 | ▆▇▁▁▁ |
DIABP | 0 | 1.00 | 83.04 | 11.66 | 30.00 | 75.00 | 82.00 | 90.00 | 150.0 | ▁▅▇▁▁ |
CIGPDAY | 79 | 0.99 | 8.25 | 12.19 | 0.00 | 0.00 | 0.00 | 20.00 | 90.0 | ▇▂▁▁▁ |
BMI | 52 | 1.00 | 25.88 | 4.10 | 14.43 | 23.09 | 25.48 | 28.07 | 56.8 | ▃▇▁▁▁ |
HEARTRTE | 6 | 1.00 | 76.78 | 12.46 | 37.00 | 69.00 | 75.00 | 85.00 | 220.0 | ▆▇▁▁▁ |
GLUCOSE | 1440 | 0.88 | 84.12 | 24.99 | 39.00 | 72.00 | 80.00 | 89.00 | 478.0 | ▇▁▁▁▁ |
TIME | 0 | 1.00 | 1957.02 | 1758.78 | 0.00 | 0.00 | 2156.00 | 4252.50 | 4854.0 | ▇▁▇▁▆ |
HDLC | 8600 | 0.26 | 49.36 | 15.63 | 10.00 | 39.00 | 48.00 | 58.00 | 189.0 | ▆▇▁▁▁ |
LDLC | 8601 | 0.26 | 176.47 | 46.86 | 20.00 | 145.00 | 173.00 | 205.00 | 565.0 | ▂▇▁▁▁ |
TIMEAP | 0 | 1.00 | 7241.56 | 2477.78 | 0.00 | 6224.00 | 8766.00 | 8766.00 | 8766.0 | ▁▁▁▁▇ |
TIMEMI | 0 | 1.00 | 7593.85 | 2136.73 | 0.00 | 7212.00 | 8766.00 | 8766.00 | 8766.0 | ▁▁▁▁▇ |
TIMEMIFC | 0 | 1.00 | 7543.04 | 2192.12 | 0.00 | 7049.50 | 8766.00 | 8766.00 | 8766.0 | ▁▁▁▁▇ |
TIMECHD | 0 | 1.00 | 7008.15 | 2641.34 | 0.00 | 5598.50 | 8766.00 | 8766.00 | 8766.0 | ▁▁▁▁▇ |
TIMESTRK | 0 | 1.00 | 7660.88 | 2011.08 | 0.00 | 7295.00 | 8766.00 | 8766.00 | 8766.0 | ▁▁▁▁▇ |
TIMECVD | 0 | 1.00 | 7166.08 | 2541.67 | 0.00 | 6004.00 | 8766.00 | 8766.00 | 8766.0 | ▁▁▁▁▇ |
TIMEDTH | 0 | 1.00 | 7854.10 | 1788.37 | 26.00 | 7797.50 | 8766.00 | 8766.00 | 8766.0 | ▁▁▁▁▇ |
TIMEHYP | 0 | 1.00 | 3598.96 | 3464.16 | 0.00 | 0.00 | 2429.00 | 7329.00 | 8766.0 | ▇▂▂▂▅ |
#continuous var distribution
fr %>%
select(-c(fac_var)) %>%
pivot_longer(cols=everything(),
names_to = "term",
values_to = "value") %>%
ggplot(aes(x=value))+
geom_histogram()+
facet_wrap(~term,scales = "free")+
#x軸の文字を小さくする
theme(axis.text.x = element_text(size=5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19187 rows containing non-finite values (`stat_bin()`).
#stroke
fr %>%
distinct(RANDID, .keep_all = T) %>%
count(STROKE)
## # A tibble: 2 × 2
## STROKE n
## <fct> <int>
## 1 0 4019
## 2 1 415
Strokeのアウトカム発生数は415あるので、多変量解析で組み込める予測因子の数も十分確保できそう
#continuous var
fr %>%
select(-c(fac_var)) %>%
colnames() %>%
dput()
## c("TOTCHOL", "AGE", "SYSBP", "DIABP", "CIGPDAY", "BMI", "HEARTRTE",
## "GLUCOSE", "TIME", "HDLC", "LDLC", "TIMEAP", "TIMEMI", "TIMEMIFC",
## "TIMECHD", "TIMESTRK", "TIMECVD", "TIMEDTH", "TIMEHYP")
cont_var<-c("TOTCHOL", "AGE", "SYSBP", "DIABP", "CIGPDAY", "BMI", "HEARTRTE", "GLUCOSE")
#事前準備
ddist <- datadist(fr)
options(datadist='ddist')
#rcs()で変数の非線形を構築、4でknotsを指定
##plot
rcs_plot<-
function(x,n=4,d=fr){
#x:explanatory variable
#n:knot number
#d=data set
rcs_formula_str <- paste("STROKE~ rcs(", x, ",",n,")")
rcs<-lrm(as.formula(rcs_formula_str),data=d)
Predict(rcs) %>%
ggplot()
}
##stat
rcs_stat<-function(x,d=fr){
#x:explanatory variable
#d:data set
for(i in 1:length(x)){
rcs_formula_str <- paste("STROKE~ rcs(", x[i], ",4)")
rcs<-lrm(as.formula(rcs_formula_str),data=d)
print(rcs %>%
anova())
}
}
library(gridExtra)
##
## 次のパッケージを付け加えます: 'gridExtra'
## 以下のオブジェクトは 'package:dplyr' からマスクされています:
##
## combine
#plot in all exam time
grid.arrange(
rcs_plot("AGE"),
rcs_plot("SYSBP"),
rcs_plot("DIABP"),
rcs_plot("CIGPDAY"),
rcs_plot("BMI"),
rcs_plot("HEARTRTE"),
rcs_plot("GLUCOSE"),
rcs_plot("TOTCHOL"),
ncol=3
)
#plot in baseline variable
fr_p1<-fr %>% filter(PERIOD==1)
grid.arrange(
rcs_plot("AGE",d=fr_p1),
rcs_plot("SYSBP",d=fr_p1),
rcs_plot("DIABP",d=fr_p1),
rcs_plot("CIGPDAY",d=fr_p1),
rcs_plot("BMI",d=fr_p1),
rcs_plot("HEARTRTE",d=fr_p1),
rcs_plot("GLUCOSE",d=fr_p1),
rcs_plot("TOTCHOL",d=fr_p1),
ncol=3
)
#統計解析でも非線形性を一応評価
#rcs_stat(cont_var)
rcs_stat(cont_var,d=fr_p1)
## Wald Statistics Response: STROKE
##
## Factor Chi-Square d.f. P
## TOTCHOL 5.12 3 0.1632
## Nonlinear 0.04 2 0.9805
## TOTAL 5.12 3 0.1632
## Wald Statistics Response: STROKE
##
## Factor Chi-Square d.f. P
## AGE 163.68 3 <.0001
## Nonlinear 7.49 2 0.0236
## TOTAL 163.68 3 <.0001
## Wald Statistics Response: STROKE
##
## Factor Chi-Square d.f. P
## SYSBP 162.79 3 <.0001
## Nonlinear 7.17 2 0.0277
## TOTAL 162.79 3 <.0001
## Wald Statistics Response: STROKE
##
## Factor Chi-Square d.f. P
## DIABP 120.61 3 <.0001
## Nonlinear 1.57 2 0.4558
## TOTAL 120.61 3 <.0001
## Wald Statistics Response: STROKE
##
## Factor Chi-Square d.f. P
## CIGPDAY 1.11 2 0.5740
## Nonlinear 0.82 1 0.3645
## TOTAL 1.11 2 0.5740
## Wald Statistics Response: STROKE
##
## Factor Chi-Square d.f. P
## BMI 37.17 3 <.0001
## Nonlinear 3.15 2 0.2071
## TOTAL 37.17 3 <.0001
## Wald Statistics Response: STROKE
##
## Factor Chi-Square d.f. P
## HEARTRTE 5.22 3 0.1565
## Nonlinear 5.00 2 0.0823
## TOTAL 5.22 3 0.1565
## Wald Statistics Response: STROKE
##
## Factor Chi-Square d.f. P
## GLUCOSE 16.95 3 0.0007
## Nonlinear 7.16 2 0.0279
## TOTAL 16.95 3 0.0007
var<-c("SEX", "CURSMOKE", "DIABETES", "BPMEDS", "PREVCHD", "PREVAP", "PREVMI", "PREVSTRK", "PREVHYP", cont_var)
#組み合わせの総数
choose(length(var),2) #136通り
## [1] 136
#交互作用の組み合わせで回帰式作成
inter_formula<-
combn(x=var,m=2) %>%
matrix(nrow=136,ncol=2,byrow=T) %>%
as.data.frame() %>%
mutate(inter=str_c(V1,"*",V2),
formula=str_c("STROKE~",V1,"+",V2,"+",inter))
#上記で作成した回帰式を用いて、尤度比検定を一括で実施
mat<-matrix(nrow=136,ncol=4,
dimnames=list(NULL,c("var","LR","Df","p")))
for(i in 1:nrow(inter_formula)){
res<-
glm(as.formula(inter_formula$formula[i]),data=fr_p1,family = binomial(link = "logit")) %>%
car::Anova(type=2,test.statistic = "LR") %>%
as.data.frame() %>%
rownames_to_column() %>%
rename("var"="rowname","LR"="LR Chisq","Df"="Df","p"="Pr(>Chisq)") %>%
.[3,] %>%
as.matrix()
mat[i,]<-res
}
#p<0.05の交互作用項を抽出
mat %>%
as.data.frame() %>%
filter(p<0.05)
## var LR Df p
## 1 SEX:PREVCHD 5.843505 1 0.01563465
## 2 SEX:PREVAP 3.846206 1 0.04985864
## 3 SEX:PREVMI 4.708745 1 0.03000955
## 4 SEX:PREVHYP 8.646947 1 0.003276098
## 5 SEX:DIABP 8.249965 1 0.004075279
## 6 CURSMOKE:HEARTRTE 3.862393 1 0.04937986
## 7 DIABETES:TOTCHOL 4.273419 1 0.03871297
## 8 DIABETES:SYSBP 4.622708 1 0.03155139
## 9 BPMEDS:PREVCHD 4.369406 1 0.03658975
## 10 BPMEDS:TOTCHOL 7.385464 1 0.00657531
## 11 BPMEDS:AGE 4.632506 1 0.03137171
## 12 PREVCHD:GLUCOSE 4.150657 1 0.04161838
## 13 PREVHYP:BMI 4.158136 1 0.04143499
## 14 TOTCHOL:AGE 4.629455 1 0.03142754
## 15 HEARTRTE:GLUCOSE 4.014679 1 0.04510582
#交互作用項として選択するもの
mat %>%
as.data.frame() %>%
filter(p<0.05) %>%
select(var) %>%
as.vector() %>%
dput()
## list(var = c("SEX:PREVCHD", "SEX:PREVAP", "SEX:PREVMI", "SEX:PREVHYP",
## "SEX:DIABP", "CURSMOKE:HEARTRTE", "DIABETES:TOTCHOL", "DIABETES:SYSBP",
## "BPMEDS:PREVCHD", "BPMEDS:TOTCHOL", "BPMEDS:AGE", "PREVCHD:GLUCOSE",
## "PREVHYP:BMI", "TOTCHOL:AGE", "HEARTRTE:GLUCOSE"))
inter_var<-c("SEX:PREVCHD", "SEX:PREVAP", "SEX:PREVMI", "SEX:PREVHYP", "SEX:DIABP", "CURSMOKE:HEARTRTE","DIABETES:TOTCHOL", "DIABETES:SYSBP",
"BPMEDS:PREVCHD", "BPMEDS:TOTCHOL", "BPMEDS:AGE", "PREVCHD:GLUCOSE", "PREVHYP:BMI", "TOTCHOL:AGE","HEARTRTE:GLUCOSE")
#nonlinelity term
nonlin_term<-c("BMI","HEARTRTE","GLUCOSE")
#vecからnonlin_term分の要素を削除
var[!var %in% nonlin_term]
## [1] "SEX" "CURSMOKE" "DIABETES" "BPMEDS" "PREVCHD" "PREVAP"
## [7] "PREVMI" "PREVSTRK" "PREVHYP" "TOTCHOL" "AGE" "SYSBP"
## [13] "DIABP" "CIGPDAY"
fullmodel<-
paste0("STROKE~",
str_c(var[!var %in% nonlin_term],collapse = "+"), #非線形以外のvarを+でつなぐ
"+",
str_c(paste0("rcs(",nonlin_term,",4)"),collapse = "+"), #非線形のvarを+でつなぐ
"+",
str_c(inter_var,collapse = "+")) #交互作用項を+でつなぐ
fullmodel
## [1] "STROKE~SEX+CURSMOKE+DIABETES+BPMEDS+PREVCHD+PREVAP+PREVMI+PREVSTRK+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+rcs(BMI,4)+rcs(HEARTRTE,4)+rcs(GLUCOSE,4)+SEX:PREVCHD+SEX:PREVAP+SEX:PREVMI+SEX:PREVHYP+SEX:DIABP+CURSMOKE:HEARTRTE+DIABETES:TOTCHOL+DIABETES:SYSBP+BPMEDS:PREVCHD+BPMEDS:TOTCHOL+BPMEDS:AGE+PREVCHD:GLUCOSE+PREVHYP:BMI+TOTCHOL:AGE+HEARTRTE:GLUCOSE"