Ngày 1: Giới thiệu R

##Việc 1 ##Việc 2

# install.packages(c("lessR", "table1", "simpleboot", "boot", "gapminder", "ggfortify", "DescTools", "epiDisplay", "BMA", "ggplot2", "gridExtra", "metafor", "MatchIt", "cobalt"), dependencies = TRUE)

## Việc 3

# Let the user pick the CSV file in a dialog, then load it into `df`.
df <- read.csv(file = file.choose())
# Show the number of rows and columns that were read.
dim(x = df)
## [1] 5110   12

Đọc 6 dòng cuối

# Print the last six rows of the data frame.
tail(df, n = 6)
##         id gender age hypertension heart_disease ever_married     work_type
## 5105 14180 Female  13            0             0           No      children
## 5106 18234 Female  80            1             0          Yes       Private
## 5107 44873 Female  81            0             0          Yes Self-employed
## 5108 19723 Female  35            0             0          Yes Self-employed
## 5109 37544   Male  51            0             0          Yes       Private
## 5110 44679 Female  44            0             0          Yes      Govt_job
##      Residence_type avg_glucose_level  bmi  smoking_status stroke
## 5105          Rural            103.08 18.6         Unknown      0
## 5106          Urban             83.75   NA    never smoked      0
## 5107          Urban            125.20 40.0    never smoked      0
## 5108          Rural             82.99 30.6    never smoked      0
## 5109          Rural            166.29 25.6 formerly smoked      0
## 5110          Urban             85.28 26.2         Unknown      0

Mô tả dữ liệu

# Per-column summary: quantiles and mean for numeric columns, length and
# class for character columns, plus the NA count where values are missing.
summary(object = df)
##        id           gender               age         hypertension    
##  Min.   :   67   Length:5110        Min.   : 0.08   Min.   :0.00000  
##  1st Qu.:17741   Class :character   1st Qu.:25.00   1st Qu.:0.00000  
##  Median :36932   Mode  :character   Median :45.00   Median :0.00000  
##  Mean   :36518                      Mean   :43.23   Mean   :0.09746  
##  3rd Qu.:54682                      3rd Qu.:61.00   3rd Qu.:0.00000  
##  Max.   :72940                      Max.   :82.00   Max.   :1.00000  
##                                                                      
##  heart_disease     ever_married        work_type         Residence_type    
##  Min.   :0.00000   Length:5110        Length:5110        Length:5110       
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05401                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##                                                                            
##  avg_glucose_level      bmi        smoking_status         stroke       
##  Min.   : 55.12    Min.   :10.30   Length:5110        Min.   :0.00000  
##  1st Qu.: 77.25    1st Qu.:23.50   Class :character   1st Qu.:0.00000  
##  Median : 91.89    Median :28.10   Mode  :character   Median :0.00000  
##  Mean   :106.15    Mean   :28.89                      Mean   :0.04873  
##  3rd Qu.:114.09    3rd Qu.:33.10                      3rd Qu.:0.00000  
##  Max.   :271.74    Max.   :97.60                      Max.   :1.00000  
##                    NA's   :201

Việc 5

# Encode gender as a numeric code: Female = 0, Male = 1, Other = 2.
# Any other value (including NA) becomes NA.
# The column is built in a single vectorised lookup instead of three partial
# assignments into a column that does not exist yet, so re-running this step
# always recomputes every row.
df$sex <- c(0, 1, 2)[match(df$gender, c("Female", "Male", "Other"))]
# Optional checks of the recoding:
# head(df)
# table(df$gender, df$sex)

###Mã hoá biến bmi

Nếu bmi < 18.5 thì bmi_cat = “Underweight”
Nếu 18.5 ≤ bmi < 25.0 thì bmi_cat = “Normal”
Nếu 25.0 ≤ bmi < 30.0 thì bmi_cat = “Overweight”
Nếu bmi ≥ 30.0 thì bmi_cat = “Obese”
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Derive the BMI category. case_when() tests the conditions in order and uses
# the first one that is TRUE, so each upper bound alone defines its band.
# Rows with a missing bmi match no condition and get NA.
df <- mutate(
  df,
  bmi_cat = case_when(
    bmi < 18.5 ~ "Underweight",
    bmi < 25   ~ "Normal",
    bmi < 30   ~ "Overweight",
    bmi >= 30  ~ "Obese"
  )
)
# Count the rows in each category (table() leaves out NA by default).
table(df$bmi_cat)
## 
##      Normal       Obese  Overweight Underweight 
##        1243        1920        1409         337

###factor

# Keep the numeric stroke column and add a factor copy, stroke1, for use as a
# grouping variable.
df$stroke1 <- factor(df$stroke)

### Mô tả

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table of patient characteristics, with one column per stroke
# group plus an overall column.
# The column variable (right of `|`) is the factor stroke1 rather than the
# numeric stroke: table1 expects a factor there and warns otherwise. The
# factor levels are still "0" and "1", so the column headers are unchanged.
table1(~ gender + age + hypertension + heart_disease +
         work_type + smoking_status | stroke1,
       data = df)
0
(N=4861)
1
(N=249)
Overall
(N=5110)
gender
Female 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
Male 2007 (41.3%) 108 (43.4%) 2115 (41.4%)
Other 1 (0.0%) 0 (0%) 1 (0.0%)
age
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
heart_disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
work_type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
smoking_status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)
library(table1)
# Recode the 0/1 hypertension indicator as a labelled factor so that table1
# reports counts and percentages for it instead of a mean and SD.
df$hypertension <- factor(df$hypertension,
                          levels = c(0, 1),
                          labels = c("No", "Yes"))
# Stratify by the factor stroke1 rather than the numeric stroke: table1
# expects a factor to the right of `|` and warns otherwise.
table1(~ hypertension + work_type + smoking_status | stroke1,
       data = df)
0
(N=4861)
1
(N=249)
Overall
(N=5110)
hypertension
No 4429 (91.1%) 183 (73.5%) 4612 (90.3%)
Yes 432 (8.9%) 66 (26.5%) 498 (9.7%)
work_type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
smoking_status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)