Đây là một package rất hữu ích trong việc phân tích những mô hình hồi quy (linear regression,logistic regression) cũng những hàm khá nổi tiếng khác như lm,glm.
ob = read.csv("D://obesity data.csv",header = TRUE)
head(ob)
## id gender height weight bmi age WBBMC wbbmd fat lean pcfat
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2
library(finalfit)
library(tidyverse)
x = c("fat","bmi","age","weight","gender")
y = c("pcfat")
ob %>% finalfit(y,x) -> t1
knitr::kable(t1,row.names = FALSE,
align = c("l","l","r","r","r"))
| Dependent: pcfat | Mean (sd) | Coefficient (univariable) | Coefficient (multivariable) | |
|---|---|---|---|---|
| fat | [4277,40825] | 31.6 (7.2) | 0.00 (0.00 to 0.00, p<0.001) | 0.00 (0.00 to 0.00, p<0.001) |
| bmi | [14.5,37.1] | 31.6 (7.2) | 1.04 (0.92 to 1.15, p<0.001) | 0.15 (0.08 to 0.22, p<0.001) |
| age | [13,88] | 31.6 (7.2) | 0.13 (0.11 to 0.15, p<0.001) | 0.01 (0.00 to 0.02, p=0.003) |
| weight | [34,95] | 31.6 (7.2) | 0.04 (0.00 to 0.09, p=0.049) | -0.42 (-0.45 to -0.39, p<0.001) |
| gender | F | 34.7 (5.2) | - | - |
| M | 24.2 (5.8) | -10.52 (-11.18 to -9.85, p<0.001) | -1.74 (-2.12 to -1.36, p<0.001) |
Điểm hay của hàm finalfit là cho ta kết quả của các mô hình đơn cộng với mô hình quy bội.Bây giờ ta sẽ kiểm tra lại bằng hàm lm.
summary(lm(data = ob , pcfat ~ fat))
##
## Call:
## lm(formula = pcfat ~ fat, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8255 -3.1716 0.8029 3.1385 14.0799
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.198e+01 4.041e-01 29.64 <2e-16 ***
## fat 1.135e-03 2.238e-05 50.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.07 on 1215 degrees of freedom
## Multiple R-squared: 0.6792, Adjusted R-squared: 0.679
## F-statistic: 2573 on 1 and 1215 DF, p-value: < 2.2e-16
summary(lm(data = ob ,pcfat ~ bmi))
##
## Call:
## lm(formula = pcfat ~ bmi, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.612 -4.181 1.392 4.690 18.241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.39889 1.36777 6.141 1.11e-09 ***
## bmi 1.03619 0.06051 17.123 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.45 on 1215 degrees of freedom
## Multiple R-squared: 0.1944, Adjusted R-squared: 0.1937
## F-statistic: 293.2 on 1 and 1215 DF, p-value: < 2.2e-16
summary(lm(data = ob,pcfat~age))
##
## Call:
## lm(formula = pcfat ~ age, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.7114 -4.0069 0.8378 4.9514 18.2192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.58408 0.57003 44.88 <2e-16 ***
## age 0.12769 0.01135 11.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.839 on 1215 degrees of freedom
## Multiple R-squared: 0.09431, Adjusted R-squared: 0.09357
## F-statistic: 126.5 on 1 and 1215 DF, p-value: < 2.2e-16
summary(lm(data = ob,pcfat ~ weight))
##
## Call:
## lm(formula = pcfat ~ weight, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.3122 -4.5234 0.8902 5.2695 16.9742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.22295 1.22370 23.881 <2e-16 ***
## weight 0.04319 0.02188 1.975 0.0485 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared: 0.003199, Adjusted R-squared: 0.002378
## F-statistic: 3.899 on 1 and 1215 DF, p-value: 0.04855
summary(lm(data = ob,pcfat~gender))
##
## Call:
## lm(formula = pcfat ~ gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0724 -3.2724 0.1484 3.6276 14.8439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.6724 0.1826 189.9 <2e-16 ***
## genderM -10.5163 0.3381 -31.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4428
## F-statistic: 967.3 on 1 and 1215 DF, p-value: < 2.2e-16
summary(lm(data = ob,pcfat~fat+bmi+age+weight+gender))
##
## Call:
## lm(formula = pcfat ~ fat + bmi + age + weight + gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.8202 -0.7119 0.1881 0.8683 11.3473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.628e+01 4.335e-01 60.626 < 2e-16 ***
## fat 1.452e-03 1.933e-05 75.138 < 2e-16 ***
## bmi 1.492e-01 3.518e-02 4.240 2.4e-05 ***
## age 9.423e-03 3.167e-03 2.975 0.00299 **
## weight -4.183e-01 1.373e-02 -30.455 < 2e-16 ***
## genderM -1.743e+00 1.934e-01 -9.009 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.666 on 1211 degrees of freedom
## Multiple R-squared: 0.9464, Adjusted R-squared: 0.9462
## F-statistic: 4278 on 5 and 1211 DF, p-value: < 2.2e-16
mydata = read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv",header = TRUE)
mydata$admit =as.factor(mydata$admit)
head(mydata)
## admit gre gpa rank
## 1 0 380 3.61 3
## 2 1 660 3.67 3
## 3 1 800 4.00 1
## 4 1 640 3.19 4
## 5 0 520 2.93 4
## 6 1 760 3.00 2
xvars = c("gre","gpa")
yvars = c("admit")
mydata %>% finalfit(yvars,xvars) -> t1
knitr::kable(t1,row.names = FALSE,
align = c("l","l","r","r","r"))
| Dependent: admit | 0 | 1 | OR (univariable) | OR (multivariable) | |
|---|---|---|---|---|---|
| gre | Mean (SD) | 573.2 (115.8) | 618.9 (108.9) | 1.00 (1.00-1.01, p<0.001) | 1.00 (1.00-1.00, p=0.011) |
| gpa | Mean (SD) | 3.3 (0.4) | 3.5 (0.4) | 2.86 (1.61-5.20, p<0.001) | 2.13 (1.14-4.02, p=0.018) |
Bây giờ ta sẽ kiểm tra lại bằng hàm glm
res = glm(data = mydata, formula = admit~gre+gpa,family = binomial)
summary(res)
##
## Call:
## glm(formula = admit ~ gre + gpa, family = binomial, data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2730 -0.8988 -0.7206 1.3013 2.0620
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.949378 1.075093 -4.604 4.15e-06 ***
## gre 0.002691 0.001057 2.544 0.0109 *
## gpa 0.754687 0.319586 2.361 0.0182 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 480.34 on 397 degrees of freedom
## AIC: 486.34
##
## Number of Fisher Scoring iterations: 4
library(epicalc)
logistic.display(res)
##
## Logistic regression predicting admit : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI)
## gre (cont. var.) 1.0036 (1.0017,1.0055) 1.0027 (1.0006,1.0048)
##
## gpa (cont. var.) 2.86 (1.59,5.14) 2.13 (1.14,3.98)
##
## P(Wald's test) P(LR-test)
## gre (cont. var.) 0.011 0.01
##
## gpa (cont. var.) 0.018 0.017
##
## Log-likelihood = -240.172
## No. of observations = 400
## AIC value = 486.344
mydata %>%
or_plot(yvars,xvars)