Chương trình tập huấn phân tích dữ liệu bằng ngôn ngữ R - BV SIS Cần Thơ (2-6/1/2025)

Ngày 2: Hiển thị dữ liệu

Việc 1. Đọc dữ liệu vào R

# Read the obesity dataset from the local course folder into the data frame `ob`.
ob <- read.csv(
  file = "C:\\Thach\\VN trips\\2024_4Dec\\SIS Can Tho\\Datasets\\Obesity data.csv"
)

Việc 2. Biểu đồ histogram

2.1 Phân bố tỉ trọng mỡ

library(ggplot2)
library(gridExtra) 

# Shared base layer: pcfat on the x axis.
p <- ggplot(ob, aes(x = pcfat))

# p1: histogram with all ggplot2 defaults.
p1 <- p + geom_histogram()

# p2: same histogram with blue bars, white outlines and axis/title labels.
p2 <- p +
  geom_histogram(fill = "blue", colour = "white") +
  labs(
    x = "Tỉ trọng mỡ (%)",
    y = "Số người",
    title = "Phân bố tỉ trọng mỡ"
  )

# Display the two versions side by side.
grid.arrange(p1, p2, ncol = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2.2 Phân bố tỉ trọng mỡ theo giới tính

# Base layer: pcfat on the x axis, filled by gender.
p <- ggplot(data = ob, aes(x = pcfat, fill = gender))

# p1: histogram, so the y axis is a head count.
p1 <- p +
  geom_histogram(col = "white") +
  labs(x = "Tỉ trọng mỡ", y = "Số người", title = "Phân bố tỉ trọng mỡ")

# p2: semi-transparent density curves per gender. geom_density() plots a
# density estimate, not a count, so the y axis is labelled as density.
p2 <- p +
  geom_density(alpha = 0.5) +
  labs(x = "Tỉ trọng mỡ", y = "Mật độ", title = "Phân bố tỉ trọng mỡ")

# Display histogram and density side by side.
grid.arrange(p1, p2, ncol = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Việc 3. Biểu đồ thanh

3.1 Tạo biến số OB từ biến bmi

# Derive the obesity-status factor OB from bmi using left-closed intervals:
#   [-Inf, 18.5) Underweight, [18.5, 25) Normal, [25, 30) Overweight, [30, Inf] Obese.
# Rows with missing bmi stay NA; factor levels keep the order listed below.
ob$OB <- cut(
  ob$bmi,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("Underweight", "Normal", "Overweight", "Obese"),
  right = FALSE,
  include.lowest = TRUE
)

3.2 Phân bố của tình trạng béo phì

# Bar chart of the number of rows in each OB category; fill and outline
# colour both follow the category.
p <- ggplot(ob, aes(x = OB, fill = OB, colour = OB))
p + geom_bar(position = "dodge")

3.3 Phân bố của tình trạng béo phì theo giới tính

# Distribution of obesity status by gender.
# geom_bar() with its default stat counts the rows in each OB x gender cell,
# so bar height is the number of people. Mapping y = pcfat with
# stat = "identity" would instead draw one overplotted bar per person, with
# heights showing body-fat values rather than the distribution of OB.
p <- ggplot(data = ob, aes(x = OB, fill = gender, group = gender))
p + geom_bar(position = "dodge")

3.4 Thêm tỉ lệ %

Tính tỉ lệ %:

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Percentage of each obesity category within each gender.
#   n   : number of rows per gender x OB cell
#   pct : n as a percentage of that gender's total, rounded to 1 decimal
# ungroup() drops the gender grouping afterwards so later operations on
# `temp` are not silently applied per gender.
temp <- ob %>%
  group_by(gender) %>%
  count(OB) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  ungroup()

Thêm % vào biểu đồ

# Dodged bar chart of the percentages in `temp`, with the value printed above
# each bar.
p <- ggplot(data = temp, aes(x = OB, y = pct, fill = gender, group = gender))
p1 <- p +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(
    # x, y and group are inherited from the base layer; only the label is new.
    aes(label = pct),
    # Bars are dodged over their default width of 0.9; use the same width here
    # so each label is centred on its bar (width = 1 shifts labels outwards).
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    col = "blue"
  ) +
  labs(x = "Obesity status", y = "Percent") +
  # NOTE(review): the legend is hidden, so the figure itself does not say
  # which fill colour is which gender - confirm this is intended.
  theme(legend.position = "none")
p1

Việc 4. Soạn biểu đồ hộp so sánh phân bố của tỉ trọng mỡ theo giới tính

# Base layer: pcfat by gender, outline colour following gender.
p <- ggplot(ob, aes(x = gender, y = pcfat, colour = gender))

# p1: plain box plot, printed on its own.
p1 <- p + geom_boxplot()
p1

# p2: black box plot with faint jittered points, axis labels and a title.
p2 <- p +
  geom_boxplot(colour = "black") +
  geom_jitter(alpha = 0.05) +
  labs(x = "Giới tính", y = "Tỉ trọng mỡ (%)") +
  ggtitle("Tỉ trọng mỡ theo giới tính")
p2

# Finally show both plots side by side.
grid.arrange(p1, p2, ncol = 2)

Việc 5. Soạn biểu đồ tương quan

5.1 Mối liên quan giữa chỉ số khối cơ thể và tỉ trọng mỡ

# Scatter plot of pcfat against bmi, without (p1) and with (p2) a smoother.
p <- ggplot(ob, aes(x = bmi, y = pcfat))
p1 <- p + geom_point()
# p2 reuses the point layer of p1 and adds the default geom_smooth() on top.
p2 <- p1 + geom_smooth()

# Display both side by side.
grid.arrange(p1, p2, ncol = 2)

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

5.2 Mối liên quan giữa chỉ số khối cơ thể và tỉ trọng mỡ theo giới tính

# pcfat against bmi, coloured by gender, with a separate quadratic
# least-squares fit (y ~ x + x^2) per gender.
p <- ggplot(ob, aes(x = bmi, y = pcfat, fill = gender, colour = gender))
p1 <- p +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2)) +
  labs(x = "Chỉ số khối cơ thể (kg/m2)", y = "Tỉ trọng mỡ (%)") +
  ggtitle("Liên quan giữa chỉ số khối cơ thể và tỉ trọng mỡ theo giới tính")
p1

So sánh 2 nhóm - biến liên tục

Việc 6. So sánh mật độ xương cổ xương đùi giữa nam và nữ

6.1 Đọc dữ liệu vào R

# Read the bone dataset from the local course folder into `df`, then report
# its dimensions (rows, columns).
df <- read.csv(
  file = "C:\\Thach\\VN trips\\2024_4Dec\\SIS Can Tho\\Datasets\\Bone data.csv"
)
dim(df)

## [1] 2162    9

# Preview the first six rows of the bone data.
head(df, n = 6L)

##   id    sex age weight height prior.fx fnbmd smoking fx
## 1  1   Male  73     98    175        0  1.08       1  0
## 2  2 Female  68     72    166        0  0.97       0  0
## 3  3   Male  68     87    184        0  1.01       0  0
## 4  4 Female  62     72    173        0  0.84       1  0
## 5  5   Male  61     72    173        0  0.81       1  0
## 6  6 Female  76     57    156        0  0.74       0  0

6.2 Vẽ histogram đánh giá phân bố mật độ xương

# Histogram of fnbmd with blue bars and white outlines.
p <- ggplot(df, aes(x = fnbmd))
p + geom_histogram(fill = "blue", colour = "white")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 40 rows containing non-finite values (`stat_bin()`).

6.3 So sánh mật độ xương cổ xương đùi giữa nam và nữ

# Two-sample t-test of fnbmd between the sex groups. var.equal = FALSE is the
# default, spelled out here: it gives the Welch (unequal-variance) test.
t.test(fnbmd ~ sex, data = df, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  fnbmd by sex
## t = -20.407, df = 1561, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -0.1448770 -0.1194686
## sample estimates:
## mean in group Female   mean in group Male 
##            0.7775231            0.9096959

Dùng gói lessR

library(lessR)

## Warning: package 'lessR' was built under R version 4.3.3

## 
## lessR 4.3.9                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()

## 
## Attaching package: 'lessR'

## The following objects are masked from 'package:dplyr':
## 
##     recode, rename

# Same comparison of fnbmd across sex using lessR's ttest(). Its report adds
# per-group descriptives, assumption checks, equal- and unequal-variance
# inference, and Cohen's d.
ttest(fnbmd ~ sex, data = df)

## 
## Compare fnbmd across sex with levels Male and Female 
## Grouping Variable:  sex
## Response Variable:  fnbmd
## 
## 
## ------ Describe ------
## 
## fnbmd for sex Male:  n.miss = 23,  n = 822,  mean = 0.910,  sd = 0.153
## fnbmd for sex Female:  n.miss = 17,  n = 1300,  mean = 0.778,  sd = 0.132
## 
## Mean Difference of fnbmd:  0.132
## 
## Weighted Average Standard Deviation:   0.141 
## 
## 
## ------ Assumptions ------
## 
## Note: These hypothesis tests can perform poorly, and the 
##       t-test is typically robust to violations of assumptions. 
##       Use as heuristic guides instead of interpreting literally. 
## 
## Null hypothesis, for each group, is a normal distribution of fnbmd.
## Group Male: Sample mean assumed normal because n > 30, so no test needed.
## Group Female: Sample mean assumed normal because n > 30, so no test needed.
## 
## Null hypothesis is equal variances of fnbmd, homogeneous.
## Variance Ratio test:  F = 0.023/0.018 = 1.336,  df = 821;1299,  p-value = 0.000
## Levene's test, Brown-Forsythe:  t = 3.449,  df = 2120,  p-value = 0.001
## 
## 
## ------ Infer ------
## 
## --- Assume equal population variances of fnbmd for each sex 
## 
## t-cutoff for 95% range of variation: tcut =  1.961 
## Standard Error of Mean Difference: SE =  0.006 
## 
## Hypothesis Test of 0 Mean Diff:  t-value = 21.080,  df = 2120,  p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.012
## 95% Confidence Interval for Mean Difference:  0.120 to 0.144
## 
## 
## --- Do not assume equal population variances of fnbmd for each sex 
## 
## t-cutoff: tcut =  1.961 
## Standard Error of Mean Difference: SE =  0.006 
## 
## Hypothesis Test of 0 Mean Diff:  t = 20.407,  df = 1560.981, p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.013
## 95% Confidence Interval for Mean Difference:  0.119 to 0.145
## 
## 
## ------ Effect Size ------
## 
## --- Assume equal population variances of fnbmd for each sex 
## 
## Standardized Mean Difference of fnbmd, Cohen's d:  0.939
## 
## 
## ------ Practical Importance ------
## 
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
## 
## 
## ------ Graphics Smoothing Parameter ------
## 
## Density bandwidth for sex Male: 0.044
## Density bandwidth for sex Female: 0.034

Việc 7. Đánh giá ảnh hưởng của cafe lên RER

7.1 Nhập nhanh dữ liệu RER

# RER measurements entered by hand: nine values for the placebo group and
# nine for the coffee group.
placebo <- c(105, 119, 100, 97, 96, 101, 94, 95, 98)
coffee <- c(96, 99, 94, 89, 96, 93, 88, 105, 88)

7.2 Đánh giá ảnh hưởng của cafe lên RER bằng kiểm định t

# Two-sample t-test of placebo versus coffee RER. var.equal = FALSE is the
# default, spelled out here: it gives the Welch (unequal-variance) test.
t.test(x = placebo, y = coffee, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  placebo and coffee
## t = 1.9948, df = 14.624, p-value = 0.06505
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4490961 13.1157627
## sample estimates:
## mean of x mean of y 
## 100.55556  94.22222

7.3 Đánh giá ảnh hưởng của cafe lên RER bằng bootstrap test

library(simpleboot)

## Simple Bootstrap Routines (1.1-7)

library(boot)
# Bootstrap the difference in means between placebo and coffee.
# Resampling is random, so fix the RNG seed first; otherwise every run (and
# every re-knit of this document) gives slightly different intervals.
set.seed(2025)
b <- two.boot(placebo, coffee, mean, R = 500)  # 500 bootstrap replicates

# Bootstrap confidence intervals computed from the replicates in `b`.
boot.ci(b)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 0.518, 12.159 )   ( 0.444, 12.052 )  
## 
## Level     Percentile            BCa          
## 95%   ( 0.615, 12.222 )   ( 0.561, 12.221 )  
## Calculations and Intervals on Original Scale

# Histogram of the bootstrap result `b`, requesting 50 breaks.
# NOTE(review): presumably this shows the distribution of the bootstrapped
# mean differences - confirm against simpleboot's hist method.
hist(b, breaks = 50)

Phân tích dữ liệu bằng ngôn ngữ R - BV SIS Cần Thơ

Thach Tran

2024-12-28

Chương trình tập huấn phân tích dữ liệu bằng ngôn ngữ R - BV SIS Cần Thơ (2-6/1/2025)

Ngày 2: Hiển thị dữ liệu

Việc 1. Đọc dữ liệu vào R

Việc 2. Biểu đồ histogram

2.1 Phân bố tỉ trọng mỡ

2.2 Phân bố tỉ trọng mỡ theo giới tính

Việc 3. Biểu đồ thanh

3.1 Tạo biến số OB từ biến bmi

3.2 Phân bố của tình trạng béo phì

3.3 Phân bố của tình trạng béo phì theo giới tính

3.4 Thêm tỉ lệ %

Việc 4. Soạn biểu đồ hộp so sánh phân bố của tỉ trọng mỡ theo giới tính

Việc 5. Soạn biểu đồ tương quan

5.1 Mối liên quan giữa chỉ số khối cơ thể và tỉ trọng mỡ

5.2 Mối liên quan giữa chỉ số khối cơ thể và tỉ trọng mỡ theo giới tính

So sánh 2 nhóm - biến liên tục

Việc 6. So sánh mật độ xương cổ xương đùi giữa nam và nữ

6.1 Đọc dữ liệu vào R

6.2 Vẽ histogram đánh giá phân bố mật độ xương

6.3 So sánh mật độ xương cổ xương đùi giữa nam và nữ

Việc 7. Đánh giá ảnh hưởng của cafe lên RER

7.1 Nhập nhanh dữ liệu RER

7.2 Đánh giá ảnh hưởng của cafe lên RER bằng kiểm định t

7.3 Đánh giá ảnh hưởng của cafe lên RER bằng bootstrap test

Việc 8. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs