Ngày 1: Giới thiệu R

Việc 1: Tải R

Việc 2: Cài packages

# One-time setup: uncomment the next line to install the listed packages.
# install.packages(c("lessR",  "table1", "simpleboot", "boot", "gapminder", "ggfortify", "DescTools", "epiDisplay", "BMA", "ggplot2", "gridExtra", "metafor", "MatchIt", "cobalt"), dependencies = TRUE)

Việc 3: đọc dữ liệu

file.choose()
## [1] "/Users/nguyenminhha/Desktop/Du Lieu Thuc Hanh /Stroke Data.csv"
P = file.choose()

# Load the stroke dataset from the CSV file into the data frame `df`.
df <- read.csv(
  file = "/Users/nguyenminhha/Desktop/Du Lieu Thuc Hanh /Stroke Data.csv"
)

Việc 4: Thông tin về dữ liệu

4.1: Quan sát và biến số

# Report the dimensions of df (number of rows, number of columns).
dim(df)
## [1] 5110   12

4.2: Đọc 10 dòng đầu

# Show the first 10 rows of df.
head(df, n = 10)

4.3: Đọc 6 dòng cuối

# Show the last 6 rows of df (6 is tail()'s default, spelled out here).
tail(df, n = 6)

4.4: Mô tả dữ liệu

# Print per-column summary statistics for df.
summary(df)
##        id           gender               age         hypertension    
##  Min.   :   67   Length:5110        Min.   : 0.08   Min.   :0.00000  
##  1st Qu.:17741   Class :character   1st Qu.:25.00   1st Qu.:0.00000  
##  Median :36932   Mode  :character   Median :45.00   Median :0.00000  
##  Mean   :36518                      Mean   :43.23   Mean   :0.09746  
##  3rd Qu.:54682                      3rd Qu.:61.00   3rd Qu.:0.00000  
##  Max.   :72940                      Max.   :82.00   Max.   :1.00000  
##                                                                      
##  heart_disease     ever_married        work_type         Residence_type    
##  Min.   :0.00000   Length:5110        Length:5110        Length:5110       
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05401                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##                                                                            
##  avg_glucose_level      bmi        smoking_status         stroke       
##  Min.   : 55.12    Min.   :10.30   Length:5110        Min.   :0.00000  
##  1st Qu.: 77.25    1st Qu.:23.50   Class :character   1st Qu.:0.00000  
##  Median : 91.89    Median :28.10   Mode  :character   Median :0.00000  
##  Mean   :106.15    Mean   :28.89                      Mean   :0.04873  
##  3rd Qu.:114.09    3rd Qu.:33.10                      3rd Qu.:0.00000  
##  Max.   :271.74    Max.   :97.60                      Max.   :1.00000  
##                    NA's   :201

Việc 5: Biên tập dữ liệu

5.1: Mã hoá biến sex: mã hoá biến gender (Male/Female/Other) thành biến sex với giá trị 0/1/2 (0=Female; 1=Male; 2=Other)

# Recode gender into the numeric variable sex:
# Female -> 0, Male -> 1, Other -> 2; any other value becomes NA.
# match() returns the 1-based position in the lookup vector, hence the "- 1".
df$sex <- match(df$gender, c("Female", "Male", "Other")) - 1

# Preview the first rows to check the new column.
head(df, n = 6)

5.2: Mã hoá biến bmi

# Classify BMI into four categories stored as a character column:
#   bmi < 18.5        -> "Underweight"
#   18.5 <= bmi < 25  -> "Normal"
#   25 <= bmi < 30    -> "Overweight"
#   bmi >= 30         -> "Obese"
# Rows with a missing BMI keep NA in bmi_cat.
# right = FALSE makes each interval closed on the left ([a, b)), and
# include.lowest = TRUE closes the last interval on the right.
df$bmi_cat <- as.character(cut(
  df$bmi,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("Underweight", "Normal", "Overweight", "Obese"),
  right = FALSE,
  include.lowest = TRUE
))

# Count the rows in each BMI category (NA rows are not counted by default).
table(df[["bmi_cat"]])
## 
##      Normal       Obese  Overweight Underweight 
##        1243        1920        1409         337
# Preview the first 6 rows to check the new bmi_cat column.
head(df, n = 6)

5.3: Biến stroke1: chuyển biến stroke từ biến số (con số) thành biến định tính (factor)

# Derive stroke1, a factor version of the numeric stroke column, then
# cross-tabulate it against the original to confirm the conversion.
df[["stroke1"]] <- as.factor(df[["stroke"]])
table(df[["stroke1"]], df[["stroke"]])
##    
##        0    1
##   0 4861    0
##   1    0  249
# Preview the first 6 rows, then summarise every column, including the
# newly created sex, bmi_cat and stroke1.
head(df, n = 6)
summary(df)
##        id           gender               age         hypertension    
##  Min.   :   67   Length:5110        Min.   : 0.08   Min.   :0.00000  
##  1st Qu.:17741   Class :character   1st Qu.:25.00   1st Qu.:0.00000  
##  Median :36932   Mode  :character   Median :45.00   Median :0.00000  
##  Mean   :36518                      Mean   :43.23   Mean   :0.09746  
##  3rd Qu.:54682                      3rd Qu.:61.00   3rd Qu.:0.00000  
##  Max.   :72940                      Max.   :82.00   Max.   :1.00000  
##                                                                      
##  heart_disease     ever_married        work_type         Residence_type    
##  Min.   :0.00000   Length:5110        Length:5110        Length:5110       
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05401                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##                                                                            
##  avg_glucose_level      bmi        smoking_status         stroke       
##  Min.   : 55.12    Min.   :10.30   Length:5110        Min.   :0.00000  
##  1st Qu.: 77.25    1st Qu.:23.50   Class :character   1st Qu.:0.00000  
##  Median : 91.89    Median :28.10   Mode  :character   Median :0.00000  
##  Mean   :106.15    Mean   :28.89                      Mean   :0.04873  
##  3rd Qu.:114.09    3rd Qu.:33.10                      3rd Qu.:0.00000  
##  Max.   :271.74    Max.   :97.60                      Max.   :1.00000  
##                    NA's   :201                                         
##       sex           bmi_cat          stroke1 
##  Min.   :0.0000   Length:5110        0:4861  
##  1st Qu.:0.0000   Class :character   1: 249  
##  Median :0.0000   Mode  :character           
##  Mean   :0.4143                              
##  3rd Qu.:1.0000                              
##  Max.   :2.0000                              
## 

Việc 6. Phân tích mô tả

6.1. Mô tả

Mô tả đặc điểm tuổi, cao huyết áp (CHA), bệnh tim, …

# Attach the table1 package, which provides the table1() function used below.
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table of age, gender, hypertension, heart disease, marital
# status and work type, with one column per stroke status plus an overall
# column.
# Stratify on the factor stroke1 rather than the numeric stroke: table1()
# expects the term to the right of `|` to be a factor and warns otherwise.
table1(~ age + gender + hypertension + heart_disease + ever_married + work_type | stroke1, data = df)
0 (N=4861)   1 (N=249)   Overall (N=5110)
age
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
gender
Female 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
Male 2007 (41.3%) 108 (43.4%) 2115 (41.4%)
Other 1 (0.0%) 0 (0%) 1 (0.0%)
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
heart_disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
ever_married
No 1728 (35.5%) 29 (11.6%) 1757 (34.4%)
Yes 3133 (64.5%) 220 (88.4%) 3353 (65.6%)
work_type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
# Same kind of table, listing hypertension twice: once as a numeric variable
# (mean/SD, median) and once wrapped in as.factor() (counts and percentages),
# to contrast the two summaries.
# Stratify on the factor stroke1 rather than the numeric stroke: table1()
# expects the term to the right of `|` to be a factor and warns otherwise.
table1(~ hypertension + as.factor(hypertension) + heart_disease + ever_married + work_type | stroke1, data = df)
0 (N=4861)   1 (N=249)   Overall (N=5110)
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
as.factor(hypertension)
0 4429 (91.1%) 183 (73.5%) 4612 (90.3%)
1 432 (8.9%) 66 (26.5%) 498 (9.7%)
heart_disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
ever_married
No 1728 (35.5%) 29 (11.6%) 1757 (34.4%)
Yes 3133 (64.5%) 220 (88.4%) 3353 (65.6%)
work_type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)