Đây là một package rất hữu ích trong việc phân tích những mô hình hồi quy (linear regression,logistic regression) cũng những hàm khá nổi tiếng khác như lm,glm.

Load dataset:

ob = read.csv("D://obesity data.csv",header = TRUE)
head(ob)

##   id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat
## 1  1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3
## 2  2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8
## 3  3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0
## 4  4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8
## 5  5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8
## 6  6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2

Phân tích hồi quy tuyến tính với finalfit

library(finalfit)
library(tidyverse)
x = c("fat","bmi","age","weight","gender")
y = c("pcfat")
ob %>% finalfit(y,x) -> t1
knitr::kable(t1,row.names = FALSE,
             align = c("l","l","r","r","r"))

Dependent: pcfat		Mean (sd)	Coefficient (univariable)	Coefficient (multivariable)
fat	[4277,40825]	31.6 (7.2)	0.00 (0.00 to 0.00, p<0.001)	0.00 (0.00 to 0.00, p<0.001)
bmi	[14.5,37.1]	31.6 (7.2)	1.04 (0.92 to 1.15, p<0.001)	0.15 (0.08 to 0.22, p<0.001)
age	[13,88]	31.6 (7.2)	0.13 (0.11 to 0.15, p<0.001)	0.01 (0.00 to 0.02, p=0.003)
weight	[34,95]	31.6 (7.2)	0.04 (0.00 to 0.09, p=0.049)	-0.42 (-0.45 to -0.39, p<0.001)
gender	F	34.7 (5.2)	-	-
	M	24.2 (5.8)	-10.52 (-11.18 to -9.85, p<0.001)	-1.74 (-2.12 to -1.36, p<0.001)

Điểm hay của hàm finalfit là cho ta kết quả của các mô hình đơn cộng với mô hình quy bội.Bây giờ ta sẽ kiểm tra lại bằng hàm lm.

summary(lm(data = ob , pcfat ~ fat))

## 
## Call:
## lm(formula = pcfat ~ fat, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8255  -3.1716   0.8029   3.1385  14.0799 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.198e+01  4.041e-01   29.64   <2e-16 ***
## fat         1.135e-03  2.238e-05   50.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.07 on 1215 degrees of freedom
## Multiple R-squared:  0.6792, Adjusted R-squared:  0.679 
## F-statistic:  2573 on 1 and 1215 DF,  p-value: < 2.2e-16

summary(lm(data = ob ,pcfat ~ bmi))

## 
## Call:
## lm(formula = pcfat ~ bmi, data = ob)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.612  -4.181   1.392   4.690  18.241 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.39889    1.36777   6.141 1.11e-09 ***
## bmi          1.03619    0.06051  17.123  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.45 on 1215 degrees of freedom
## Multiple R-squared:  0.1944, Adjusted R-squared:  0.1937 
## F-statistic: 293.2 on 1 and 1215 DF,  p-value: < 2.2e-16

summary(lm(data = ob,pcfat~age))

## 
## Call:
## lm(formula = pcfat ~ age, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.7114  -4.0069   0.8378   4.9514  18.2192 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25.58408    0.57003   44.88   <2e-16 ***
## age          0.12769    0.01135   11.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.839 on 1215 degrees of freedom
## Multiple R-squared:  0.09431,    Adjusted R-squared:  0.09357 
## F-statistic: 126.5 on 1 and 1215 DF,  p-value: < 2.2e-16

summary(lm(data = ob,pcfat ~ weight))

## 
## Call:
## lm(formula = pcfat ~ weight, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.3122  -4.5234   0.8902   5.2695  16.9742 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.22295    1.22370  23.881   <2e-16 ***
## weight       0.04319    0.02188   1.975   0.0485 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared:  0.003199,   Adjusted R-squared:  0.002378 
## F-statistic: 3.899 on 1 and 1215 DF,  p-value: 0.04855

summary(lm(data = ob,pcfat~gender))

## 
## Call:
## lm(formula = pcfat ~ gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.0724  -3.2724   0.1484   3.6276  14.8439 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.6724     0.1826   189.9   <2e-16 ***
## genderM     -10.5163     0.3381   -31.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared:  0.4432, Adjusted R-squared:  0.4428 
## F-statistic: 967.3 on 1 and 1215 DF,  p-value: < 2.2e-16

summary(lm(data = ob,pcfat~fat+bmi+age+weight+gender))

## 
## Call:
## lm(formula = pcfat ~ fat + bmi + age + weight + gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.8202  -0.7119   0.1881   0.8683  11.3473 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.628e+01  4.335e-01  60.626  < 2e-16 ***
## fat          1.452e-03  1.933e-05  75.138  < 2e-16 ***
## bmi          1.492e-01  3.518e-02   4.240  2.4e-05 ***
## age          9.423e-03  3.167e-03   2.975  0.00299 ** 
## weight      -4.183e-01  1.373e-02 -30.455  < 2e-16 ***
## genderM     -1.743e+00  1.934e-01  -9.009  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.666 on 1211 degrees of freedom
## Multiple R-squared:  0.9464, Adjusted R-squared:  0.9462 
## F-statistic:  4278 on 5 and 1211 DF,  p-value: < 2.2e-16

Phân tích hồi quy logistic với finalfit

mydata = read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv",header = TRUE)
mydata$admit  =as.factor(mydata$admit)
head(mydata)

##   admit gre  gpa rank
## 1     0 380 3.61    3
## 2     1 660 3.67    3
## 3     1 800 4.00    1
## 4     1 640 3.19    4
## 5     0 520 2.93    4
## 6     1 760 3.00    2

xvars = c("gre","gpa")
yvars = c("admit")
mydata %>% finalfit(yvars,xvars) -> t1
knitr::kable(t1,row.names = FALSE,
             align = c("l","l","r","r","r"))

Dependent: admit		0	1	OR (univariable)	OR (multivariable)
gre	Mean (SD)	573.2 (115.8)	618.9 (108.9)	1.00 (1.00-1.01, p<0.001)	1.00 (1.00-1.00, p=0.011)
gpa	Mean (SD)	3.3 (0.4)	3.5 (0.4)	2.86 (1.61-5.20, p<0.001)	2.13 (1.14-4.02, p=0.018)

Bây giờ ta sẽ kiểm tra lại bằng hàm glm

res  = glm(data = mydata, formula = admit~gre+gpa,family = binomial)
summary(res)

## 
## Call:
## glm(formula = admit ~ gre + gpa, family = binomial, data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2730  -0.8988  -0.7206   1.3013   2.0620  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.949378   1.075093  -4.604 4.15e-06 ***
## gre          0.002691   0.001057   2.544   0.0109 *  
## gpa          0.754687   0.319586   2.361   0.0182 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 480.34  on 397  degrees of freedom
## AIC: 486.34
## 
## Number of Fisher Scoring iterations: 4

library(epicalc)
logistic.display(res)

## 
## Logistic regression predicting admit : 1 vs 0 
##  
##                  crude OR(95%CI)         adj. OR(95%CI)         
## gre (cont. var.) 1.0036 (1.0017,1.0055)  1.0027 (1.0006,1.0048) 
##                                                                 
## gpa (cont. var.) 2.86 (1.59,5.14)        2.13 (1.14,3.98)       
##                                                                 
##                  P(Wald's test) P(LR-test)
## gre (cont. var.) 0.011          0.01      
##                                           
## gpa (cont. var.) 0.018          0.017     
##                                           
## Log-likelihood = -240.172
## No. of observations = 400
## AIC value = 486.344

mydata %>% 
  or_plot(yvars,xvars)

Phân tích hồi quy với package finalfit

Đặng Khải Hoàn

2/20/2020

Load dataset:

Phân tích hồi quy tuyến tính với finalfit

Phân tích hồi quy logistic với finalfit