Doc du lieu: loai bo cac o trong trong excel

t = "C:\\Users\\SONY\\Google Drive\\HOC TAP\\CH 22\\Phan tich du lieu\\Workshop GS Tuan\\Phan tich du lieu va ung dung\\Datasets for practice\\Hip fracture data.csv"
hip =  read.csv(t, na.strings = "")
attach(hip)
head(hip)
##   id     dov gender age      dob visit   v1   v2    v3    v4 wt bmi  ht v5
## 1  3 15/6/89   Male  73   8/6/16     1 0.98 0.88 1.079 1.458 98  32 175 NA
## 2  8 17/4/89 Female  67 11/12/21     1 0.85 0.85 0.966 1.325 72  26 166 18
## 3  9 12/6/90   Male  68   8/1/22     1 0.87 0.84 1.013 1.494 87  26 184 36
## 4 10  4/6/90 Female  62  15/5/28     1 0.62 0.71 0.839 1.214 72  24 173 NA
## 5 23  8/8/89   Male  61  22/9/28     1 0.87 0.60 0.811 1.144 72  24 173 44
## 6 24  3/5/89 Female  76   1/8/13     1 0.76 0.58 0.743 0.980 67  28 156 15
##     v6 v7 v8 v9 hipfx timehip
## 1 39.9  1  0  0     0    0.55
## 2 31.0  0  0  0     0   19.68
## 3 28.6  0  0  0     0    5.05
## 4 28.2  1  0  0     0   18.55
## 5 28.9  1  0  0     0   19.37
## 6 33.3  0  0  0     0   12.30

So sanh ty le gay xuong theo gioi tinh

library(DescTools)
options(scipen = 999)
Desc(hipfx ~ gender)
## ------------------------------------------------------------------------- 
## hipfx ~ gender
## 
## 
## Summary: 
## n: 2'788, rows: 2, columns: 2
## 
## Pearson's Chi-squared test (cont. adj):
##   X-squared = 20.296, df = 1, p-value = 0.000006635
## Fisher's exact test p-value = 0.000003505
## McNemar's chi-squared = 725.09, df = 1, p-value < 0.00000000000000022
## 
##                     estimate lwr.ci upr.ci'
##                                           
## odds ratio             0.460  0.328  0.646
## rel. risk (col1)       0.774  0.709  0.846
## rel. risk (col2)       1.682  1.307  2.164
## 
## 
## Phi-Coefficient        0.087
## Contingency Coeff.     0.086
## Cramer's V             0.087
## 
##                                      
##         gender   Female   Male    Sum
## hipfx                                
##                                      
## 0       freq      1'512  1'087  2'599
##         perc      54.2%  39.0%  93.2%
##         p.row     58.2%  41.8%      .
##         p.col     91.4%  95.9%      .
##                                      
## 1       freq        142     47    189
##         perc       5.1%   1.7%   6.8%
##         p.row     75.1%  24.9%      .
##         p.col      8.6%   4.1%      .
##                                      
## Sum     freq      1'654  1'134  2'788
##         perc      59.3%  40.7% 100.0%
##         p.row         .      .      .
##         p.col         .      .      .
##                                      
## 
## ----------
## ' 95% conf. level

Mo hinh hoi quy fx ~ gioi

library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
## 
## Attaching package: 'rrcov'
## The following object is masked from 'package:DescTools':
## 
##     Cov
library(epiDisplay)
## Loading required package: foreign
## Loading required package: MASS
## Loading required package: nnet
m1 = glm(hipfx ~ gender, family =  binomial, data = hip)
logistic.display(m1)
## 
## Logistic regression predicting hipfx 
##  
##                        OR(95%CI)         P(Wald's test) P(LR-test)
## gender: Male vs Female 0.46 (0.33,0.65)  < 0.001        < 0.001   
##                                                                   
## Log-likelihood = -679.98
## No. of observations = 2788
## AIC value = 1363.96

Bieu do tuong quan bmi the hipfx # BMI thap thi nguy co gay xuong cao

boxplot(bmi~ hipfx)

Loai outlier

hh = subset(hip, v2 < 2)
boxplot(hh$v2 ~ hh$hipfx, col = c("blue", "red"))

Tao bien moi v2 (phien giai gia tri mo hinh hoi quy)

library(epiDisplay)
hh$v2.n = hh$v2 / 0.1
m2 = glm(hipfx ~ v2.n, family =  binomial, data = hh)
logistic.display(m2)
## 
## Logistic regression predicting hipfx 
##  
##                   OR(95%CI)         P(Wald's test) P(LR-test)
## v2.n (cont. var.) 0.41 (0.36,0.47)  < 0.001        < 0.001   
##                                                              
## Log-likelihood = -525.8766
## No. of observations = 2722
## AIC value = 1055.7531

Danh gia AUC cua mo hinh fx ~ v2.n

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
hh$predicted = predict(m2, type = "response")
mm = roc(hh$hipfx, hh$predicted)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(mm)
## Area under the curve: 0.8249

Ve bieu do ROC

plot(mm)

Danh gia mo hinh fx ~ bmi

bb = subset(hh, bmi > 0)
m3 = glm(hipfx ~ bmi, family =  binomial, data = bb)
bb$predicted = predict(m3, type = "response")
mm1 = roc(bb$hipfx, bb$predicted)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(mm1)
## Area under the curve: 0.6809

Ve bieu do

plot(mm1)