Ngày 1: Giới thiệu ngôn ngữ R

Việc 1. Cài đặt R và RStudio

https://www.rstudio.com/products/rstudio/download

Việc 2. Cài đặt các gói phân tích (packages)

# One-time setup: uncomment and run the next line to install the packages used in this workshop.
#install.packages(c("readxl", "tidyverse", "dplyr", "ggplot2", "gridExtra", "GGally", "DescTools", "table1", "compareGroups", "simpleboot", "epiDisplay", "Publish"), dependencies = TRUE)

Việc 3. Đọc dữ liệu vào R

# Read the obesity dataset from a local CSV file into the data frame `ob`.
# The absolute Windows path below is machine-specific.
ob <- read.csv(
  file = "C:\\Thach\\VN trips\\2024_2Aug\\Data Analysis workshop (Hospital 108)\\Datasets\\obesity data.csv"
)

Tìm đường dẫn với file.choose()

# Uncomment to pick the data file interactively; file.choose() returns the
# selected path as a string, which can then be passed to read.csv().
# NOTE(review): the variable name `t` shadows the name of base::t(); consider renaming.
#t = file.choose()

Việc 4. Thông tin về dữ liệu ob

4.1 Có bao nhiêu biến số và quan sát

# Dimensions of `ob`: number of rows (observations) followed by number of columns (variables).
dim(ob)

## [1] 1217   13

4.2 Liệt kê 6 quan sát đầu tiên

# Print the first rows of `ob` (head() shows 6 rows by default).
head(ob)

##   id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat hypertension
## 1  1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3            0
## 2  2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8            1
## 3  3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0            1
## 4  4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8            1
## 5  5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8            0
## 6  6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2            1
##   diabetes
## 1        1
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0

4.3 Liệt kê 6 quan sát cuối cùng

# Print the last rows of `ob` (tail() shows 6 rows by default).
tail(ob)

##        id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat
## 1212 1222      F    153     50 21.4  59  1309  0.87 18328 29147  37.6
## 1213 1223      F    150     44 19.6  44  1474  0.95 12906 28534  30.1
## 1214 1224      F    148     51 23.3  58  1522  0.97 14938 33931  29.6
## 1215 1225      F    149     50 22.5  57  1409  0.93 16777 30598  34.4
## 1216 1226      F    144     49 23.6  67  1266  0.90 20094 27272  41.3
## 1217 1227      F    141     45 22.6  58  1228  0.91 14567 28111  33.2
##      hypertension diabetes
## 1212            1        0
## 1213            0        1
## 1214            0        0
## 1215            1        0
## 1216            1        0
## 1217            0        0

4.4 Tóm tắt thống kê mô tả các biến số

# Per-column summary of `ob`: min/quartiles/mean/max for numeric columns,
# length and class for character columns.
summary(ob)

##        id            gender              height          weight     
##  Min.   :   1.0   Length:1217        Min.   :136.0   Min.   :34.00  
##  1st Qu.: 309.0   Class :character   1st Qu.:151.0   1st Qu.:49.00  
##  Median : 615.0   Mode  :character   Median :155.0   Median :54.00  
##  Mean   : 614.5                      Mean   :156.7   Mean   :55.14  
##  3rd Qu.: 921.0                      3rd Qu.:162.0   3rd Qu.:61.00  
##  Max.   :1227.0                      Max.   :185.0   Max.   :95.00  
##       bmi            age            WBBMC          wbbmd            fat       
##  Min.   :14.5   Min.   :13.00   Min.   : 695   Min.   :0.650   Min.   : 4277  
##  1st Qu.:20.2   1st Qu.:35.00   1st Qu.:1481   1st Qu.:0.930   1st Qu.:13768  
##  Median :22.2   Median :48.00   Median :1707   Median :1.010   Median :16955  
##  Mean   :22.4   Mean   :47.15   Mean   :1725   Mean   :1.009   Mean   :17288  
##  3rd Qu.:24.3   3rd Qu.:58.00   3rd Qu.:1945   3rd Qu.:1.090   3rd Qu.:20325  
##  Max.   :37.1   Max.   :88.00   Max.   :3040   Max.   :1.350   Max.   :40825  
##       lean           pcfat       hypertension      diabetes     
##  Min.   :19136   Min.   : 9.2   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:30325   1st Qu.:27.0   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :33577   Median :32.4   Median :1.000   Median :0.0000  
##  Mean   :35463   Mean   :31.6   Mean   :0.507   Mean   :0.1109  
##  3rd Qu.:39761   3rd Qu.:36.8   3rd Qu.:1.000   3rd Qu.:0.0000  
##  Max.   :63059   Max.   :48.4   Max.   :1.000   Max.   :1.0000

Việc 5. Biên tập dữ liệu bằng gói phân tích “tidyverse”

5.1 Mã hóa biến gender

# Base R approach: build the indicator by assigning into row subsets
# (1 = female, 0 = male).
ob$sex[ob$gender == "F"] <- 1
ob$sex[ob$gender == "M"] <- 0

# Same coding in one vectorised call: "F" becomes 1, any other
# non-missing value becomes 0.
ob$sex.b <- ifelse(ob$gender == "F", 1, 0)

# Cross-tabulate the two versions to confirm that they agree.
table(ob$sex, ob$sex.b)

##    
##       0   1
##   0 355   0
##   1   0 862

# tidyverse approach: attach the core tidyverse packages (dplyr, ggplot2, tidyr, ...).
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Attach dplyr, which provides mutate() and case_when().
library(dplyr)

# Recode gender with case_when(): "F" -> 1, "M" -> 0; rows matching neither
# condition get NA.
ob <- mutate(
  ob,
  sex.2 = case_when(
    gender == "F" ~ 1,
    gender == "M" ~ 0
  )
)

# Check the new column against the base R version.
table(ob$sex, ob$sex.2)

##    
##       0   1
##   0 355   0
##   1   0 862

5.2 Mã hóa biến BMI

# Base R approach: label each BMI range by assigning into row subsets.
# Ranges are half-open: [lower, upper).
ob$obese[ob$bmi < 18.5] <- "Underweight"
ob$obese[ob$bmi >= 18.5 & ob$bmi < 25] <- "Normal"
ob$obese[ob$bmi >= 25 & ob$bmi < 30] <- "Overweight"
ob$obese[ob$bmi >= 30] <- "Obese"

# tidyverse approach: the same BMI categories expressed with case_when().
ob <- mutate(
  ob,
  obese.2 = case_when(
    bmi < 18.5 ~ "Underweight",
    bmi >= 18.5 & bmi < 25 ~ "Normal",
    bmi >= 25 & bmi < 30 ~ "Overweight",
    bmi >= 30 ~ "Obese"
  )
)

# Cross-tabulate against the base R version; all counts should fall on the diagonal.
table(ob$obese, ob$obese.2)

##              
##               Normal Obese Overweight Underweight
##   Normal         865     0          0           0
##   Obese            0    15          0           0
##   Overweight       0     0        230           0
##   Underweight      0     0          0         107

5.2 Tạo biến số mới

# Base R approach: express lean mass and fat mass in kilograms.
# `lean` and `fat` hold values such as 28600 and 17802 for a participant whose
# `weight` is 49, i.e. they are in grams, so kilograms are obtained by dividing
# by 1000 (multiplying would inflate the values a million-fold).
ob$lean.kg <- ob$lean / 1000
ob$fat.kg <- ob$fat / 1000

# tidyverse approach: express lean mass and fat mass in kilograms.
# `lean` and `fat` hold values such as 28600 and 17802 for a participant whose
# `weight` is 49, i.e. they are in grams, so kilograms are obtained by dividing
# by 1000 (multiplying would inflate the values a million-fold).
ob <- ob %>%
  mutate(
    lean.kg.2 = lean / 1000,
    fat.kg.2 = fat / 1000
  )

5.3 Tạo tập dữ liệu ‘men.overweight’

# Base R approach: keep the rows for men whose BMI is 25 or higher
# (this covers both the "Overweight" and "Obese" categories).
men.overweight <- subset(ob, subset = gender == "M" & bmi >= 25)

# Rows and columns of the resulting subset.
dim(men.overweight)

## [1] 85 22

# Frequency of each BMI category within the subset.
table(men.overweight$obese)

## 
##      Obese Overweight 
##          4         81

# tidyverse approach: the same row selection with dplyr::filter().
men.overweight.2 <- filter(ob, gender == "M" & bmi >= 25)

# Rows and columns of the resulting subset.
dim(men.overweight.2)

## [1] 85 22

# Frequency of each BMI category within the dplyr-built subset.
table(men.overweight.2$obese)

## 
##      Obese Overweight 
##          4         81

5.4 Tạo tập dữ liệu ‘Subset’

# Base R approach: keep only the six listed columns, using subset(select = ).
Subset <- subset(
  ob,
  select = c(id, age, gender, bmi, lean, fat)
)
head(Subset)

##   id age gender  bmi  lean   fat
## 1  1  53      F 21.8 28600 17802
## 2  2  65      M 19.1 40229  8381
## 3  3  64      F 23.1 36057 19221
## 4  4  56      F 21.8 33094 17472
## 5  5  54      M 19.9 40621  7336
## 6  6  52      F 20.1 30068 14904

# Alternative base R form: select the same columns by name with `[`.
Subset.b <- ob[, c("id", "age", "gender", "bmi", "lean", "fat")]
head(Subset.b)

##   id age gender  bmi  lean   fat
## 1  1  53      F 21.8 28600 17802
## 2  2  65      M 19.1 40229  8381
## 3  3  64      F 23.1 36057 19221
## 4  4  56      F 21.8 33094 17472
## 5  5  54      M 19.9 40621  7336
## 6  6  52      F 20.1 30068 14904

# tidyverse approach: the same column selection with dplyr::select().
Subset.2 <- select(ob, id, age, gender, bmi, lean, fat)
head(Subset.2)

##   id age gender  bmi  lean   fat
## 1  1  53      F 21.8 28600 17802
## 2  2  65      M 19.1 40229  8381
## 3  3  64      F 23.1 36057 19221
## 4  4  56      F 21.8 33094 17472
## 5  5  54      M 19.9 40621  7336
## 6  6  52      F 20.1 30068 14904

Basic Data Analysis Workshop - HN

Thach Tran

2024-08-15

Chương trình tập huấn phân tích dữ liệu bằng ngôn ngữ R - BV 108

Ngày 1: Giới thiệu ngôn ngữ R

Việc 1. Cài đặt R và RStudio

Việc 2. Cài đặt các gói phân tích (packages)

Việc 3. Đọc dữ liệu vào R

Việc 4. Thông tin về dữ liệu ob

4.1 Có bao nhiêu biến số và quan sát

4.2 Liệt kê 6 quan sát đầu tiên

4.3 Liệt kê 6 quan sát cuối cùng

4.4 Tóm tắt thống kê mô tả các biến số

Việc 5. Biên tập dữ liệu bằng gói phân tích “tidyverse”

5.1 Mã hóa biến gender

5.2 Mã hóa biến BMI

5.2 Tạo biến số mới

5.3 Tạo tập dữ liệu ‘men.overweight’

5.4 Tạo tập dữ liệu ‘Subset’

Việc 6. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs