Việc 1. Phân tích mô tả

1.1 Đọc dữ liệu vào R

ob = read.csv("C:\\Users\\Admin\\Desktop\\TL tập huấn NCKH\\Obesity data.csv")

1.2. Mô tả đặc điểm

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

table1(~ age + gender + weight + height + pcfat + hypertension + diabetes, data = ob)

	Overall (N=1217)
age
Mean (SD)	47.2 (17.3)
Median [Min, Max]	48.0 [13.0, 88.0]
gender
F	862 (70.8%)
M	355 (29.2%)
weight
Mean (SD)	55.1 (9.40)
Median [Min, Max]	54.0 [34.0, 95.0]
height
Mean (SD)	157 (7.98)
Median [Min, Max]	155 [136, 185]
pcfat
Mean (SD)	31.6 (7.18)
Median [Min, Max]	32.4 [9.20, 48.4]
hypertension
Mean (SD)	0.507 (0.500)
Median [Min, Max]	1.00 [0, 1.00]
diabetes
Mean (SD)	0.111 (0.314)
Median [Min, Max]	0 [0, 1.00]

1.3. Nhận xét kết quả

ob$hyper = as.factor(ob$hypertension)
ob$diab = as.factor(ob$diabetes)
table1(~ age + gender + weight + height + pcfat + hypertension + hyper + diabetes + diab, data = ob)

	Overall (N=1217)
age
Mean (SD)	47.2 (17.3)
Median [Min, Max]	48.0 [13.0, 88.0]
gender
F	862 (70.8%)
M	355 (29.2%)
weight
Mean (SD)	55.1 (9.40)
Median [Min, Max]	54.0 [34.0, 95.0]
height
Mean (SD)	157 (7.98)
Median [Min, Max]	155 [136, 185]
pcfat
Mean (SD)	31.6 (7.18)
Median [Min, Max]	32.4 [9.20, 48.4]
hypertension
Mean (SD)	0.507 (0.500)
Median [Min, Max]	1.00 [0, 1.00]
hyper
0	600 (49.3%)
1	617 (50.7%)
diabetes
Mean (SD)	0.111 (0.314)
Median [Min, Max]	0 [0, 1.00]
diab
0	1082 (88.9%)
1	135 (11.1%)

1.4 Trình bày kết quả median (Q1, Q3)

library(table1)
table1(~ age + weight + height + pcfat, data = ob, render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]"))

	Overall (N=1217)
age
Mean (SD)	47.2 (17.3)
Median [Q1, Q3]	48.0 [35.0, 58.0]
weight
Mean (SD)	55.1 (9.40)
Median [Q1, Q3]	54.0 [49.0, 61.0]
height
Mean (SD)	157 (7.98)
Median [Q1, Q3]	155 [151, 162]
pcfat
Mean (SD)	31.6 (7.18)
Median [Q1, Q3]	32.4 [27.0, 36.8]

1.5 Mô tả theo giới tính

table1(~ age + weight + height + pcfat + hyper + diab | gender, data = ob)

	F (N=862)	M (N=355)	Overall (N=1217)
age
Mean (SD)	48.6 (16.4)	43.7 (18.8)	47.2 (17.3)
Median [Min, Max]	49.0 [14.0, 85.0]	44.0 [13.0, 88.0]	48.0 [13.0, 88.0]
weight
Mean (SD)	52.3 (7.72)	62.0 (9.59)	55.1 (9.40)
Median [Min, Max]	51.0 [34.0, 95.0]	62.0 [38.0, 95.0]	54.0 [34.0, 95.0]
height
Mean (SD)	153 (5.55)	165 (6.73)	157 (7.98)
Median [Min, Max]	153 [136, 170]	165 [146, 185]	155 [136, 185]
pcfat
Mean (SD)	34.7 (5.19)	24.2 (5.76)	31.6 (7.18)
Median [Min, Max]	34.7 [14.6, 48.4]	24.6 [9.20, 39.0]	32.4 [9.20, 48.4]
hyper
0	430 (49.9%)	170 (47.9%)	600 (49.3%)
1	432 (50.1%)	185 (52.1%)	617 (50.7%)
diab
0	760 (88.2%)	322 (90.7%)	1082 (88.9%)
1	102 (11.8%)	33 (9.3%)	135 (11.1%)

1.6. Đánh giá sự khác biệt giữa 2 nhóm

library(compareGroups)
createTable(compareGroups(gender ~ age + weight + height + pcfat + hyper + diab, data = ob))

## 
## --------Summary descriptives table by 'gender'---------
## 
## ________________________________________ 
##             F           M      p.overall 
##           N=862       N=355              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age    48.6 (16.4) 43.7 (18.8)  <0.001   
## weight 52.3 (7.72) 62.0 (9.59)  <0.001   
## height 153 (5.55)  165 (6.73)   <0.001   
## pcfat  34.7 (5.19) 24.2 (5.76)  <0.001   
## hyper:                           0.569   
##     0  430 (49.9%) 170 (47.9%)           
##     1  432 (50.1%) 185 (52.1%)           
## diab:                            0.238   
##     0  760 (88.2%) 322 (90.7%)           
##     1  102 (11.8%) 33 (9.30%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Việc 2. Phân tích khác biệt giữa 2 nhóm:

2.1 – Nhập dữ liệu tải trọng

A = c(14, 4, 10, 6, 3, 11, 12)
B = c(16, 17, 13, 12, 7, 16, 11, 8, 7)
wt = c(A, B)
group = c(rep("A", 7), rep("B", 9))
df = data.frame(wt, group)
dim(df)

## [1] 16  2

2.2 – Kiểm tra phân bố chuẩn

library(lessR)

## 
## lessR 4.4.5                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()

## 
## Attaching package: 'lessR'

## The following object is masked from 'package:table1':
## 
##     label

Histogram(wt, data = df)

## >>> Note: wt is not in a data frame (table)
## >>> Note: wt is not in a data frame (table)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(wt, density=TRUE)  # smoothed curve + histogram 
## Plot(wt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- wt --- 
##  
##      n   miss     mean       sd      min      mdn      max 
##      16      0    10.44     4.29     3.00    11.00    17.00 
##  
## 
## No (Box plot) outliers 
## 
## 
## Bin Width: 2 
## Number of Bins: 8 
##  
##      Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ------------------------------------------------- 
##   2 >  4       3      2    0.12        2     0.12 
##   4 >  6       5      1    0.06        3     0.19 
##   6 >  8       7      3    0.19        6     0.38 
##   8 > 10       9      1    0.06        7     0.44 
##  10 > 12      11      4    0.25       11     0.69 
##  12 > 14      13      2    0.12       13     0.81 
##  14 > 16      15      2    0.12       15     0.94 
##  16 > 18      17      1    0.06       16     1.00 
##

shapiro.test(df$wt)

## 
##  Shapiro-Wilk normality test
## 
## data:  df$wt
## W = 0.96213, p-value = 0.7006

2.3 – Mô tả đặc điểm tải trọng

library(table1)
table1(~ wt | group, data = df, render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]"))

	A (N=7)	B (N=9)	Overall (N=16)
wt
Mean (SD)	8.57 (4.24)	11.9 (3.95)	10.4 (4.29)
Median [Q1, Q3]	10.0 [5.00, 11.5]	12.0 [8.00, 16.0]	11.0 [7.00, 13.3]

2.4 – Kiểm định t-test hai nhóm

t.test(A, B)

## 
##  Welch Two Sample t-test
## 
## data:  A and B
## t = -1.6, df = 12.554, p-value = 0.1345
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.813114  1.178194
## sample estimates:
## mean of x mean of y 
##  8.571429 11.888889

t.test(wt ~ group, data = df)

## 
##  Welch Two Sample t-test
## 
## data:  wt by group
## t = -1.6, df = 12.554, p-value = 0.1345
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -7.813114  1.178194
## sample estimates:
## mean in group A mean in group B 
##        8.571429       11.888889

2.5 – Bootstrap cho trung bình 2 nhóm

library(simpleboot)

## Simple Bootstrap Routines (1.1-8)

library(boot)
b = two.boot(A, B, mean, R = 1000)
boot.ci(b)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (-7.105,  0.491 )   (-7.238,  0.540 )  
## 
## Level     Percentile            BCa          
## 95%   (-7.175,  0.603 )   (-7.175,  0.610 )  
## Calculations and Intervals on Original Scale

hist(b, breaks = 50)

Sử dụng ChatGPT PROMPT: Tôi có dữ liệu cho 2 nhóm như sau. Bạn có thể viết mã R để phân tích sự khác biệt giữa hai nhóm dùng bootstrap? A = c(14, 4, 10, 6, 3, 11, 12) B = c(16, 17, 13, 12, 7, 16, 11, 8, 7)

library(boot)
A <- c(14, 4, 10, 6, 3, 11, 12)
B <- c(16, 17, 13, 12, 7, 16, 11, 8, 7)
# Hàm trả về chênh lệch trung bình giữa hai nhóm
diff_mean <- function(data1, data2) {
  mean(data1) - mean(data2)
}
obs_diff <- diff_mean(A, B)
obs_diff

## [1] -3.31746

library(boot)
# Gộp dữ liệu thành một danh sách để truyền vào boot
data_list <- list(A = A, B = B)
# Hàm bootstrap
boot_diff <- function(data, indices) {
  A_boot <- sample(data$A, replace = TRUE)
  B_boot <- sample(data$B, replace = TRUE)
  return(mean(A_boot) - mean(B_boot))
}
# Chạy bootstrap 10,000 lần
set.seed(123)
boot_result <- boot(data = data_list, statistic = boot_diff, R = 10000)
# Tóm tắt kết quả bootstrap
boot_result

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = data_list, statistic = boot_diff, R = 10000)
## 
## 
## Bootstrap Statistics :
##      original   bias    std. error
## t1* -4.333333 1.020943     1.90975

# Khoảng tin cậy 95%
boot.ci(boot_result, type = c("perc", "bca"))

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result, type = c("perc", "bca"))
## 
## Intervals : 
## Level     Percentile            BCa          
## 95%   (-7.032,  0.397 )   (-9.306, -1.556 )  
## Calculations and Intervals on Original Scale

hist(boot_result$t,
     main = "Phân phối bootstrap của chênh lệch trung bình",
     xlab = "Mean(A) - Mean(B)",
     col = "lightblue", border = "gray")
abline(v = obs_diff, col = "red", lwd = 2)

boot_diff_median <- function(data, indices) {
  A_boot <- sample(data$A, replace = TRUE)
  B_boot <- sample(data$B, replace = TRUE)
  return(median(A_boot) - median(B_boot))
}
boot_result_median <- boot(data = data_list, statistic = boot_diff_median, R = 10000)
boot.ci(boot_result_median, type = c("perc", "bca"))

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result_median, type = c("perc", "bca"))
## 
## Intervals : 
## Level     Percentile            BCa          
## 95%   (-10,   3 )   (-13,  -6 )  
## Calculations and Intervals on Original Scale
## Warning : BCa Intervals used Extreme Quantiles
## Some BCa intervals may be unstable

# Hàm tiện ích để tạo chỉ số bootstrap riêng cho mỗi nhóm
boot_2groups <- function(A, B, R = 2000) {
  nA <- length(A)
  nB <- length(B)
  boot_values <- numeric(R)
  for (i in 1:R) {
    idxA <- sample(1:nA, nA, replace = TRUE)
    idxB <- sample(1:nB, nB, replace = TRUE)
    boot_values[i] <- mean(A[idxA]) - mean(B[idxB])
  }
  return(boot_values)
}
# Chạy bootstrap
set.seed(123)
boot_values <- boot_2groups(A, B, R = 2000)
# Kết quả ước lượng
diff_estimate <- mean(A) - mean(B)
diff_estimate

## [1] -3.31746

# Tạo đối tượng "boot" để dùng hàm boot.ci
boot_obj <- boot(data = list(A = A, B = B),
                 statistic = function(data, i) {
                   # i ở đây là vector cho A và B (chúng ta tạo tách riêng)
                   idxA <- sample(1:length(data$A), length(data$A), replace = TRUE)
                   idxB <- sample(1:length(data$B), length(data$B), replace = TRUE)
                   mean(data$A[idxA]) - mean(data$B[idxB])
                 },
                 R = 2000)
boot_obj

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = list(A = A, B = B), statistic = function(data, i) {
##     idxA <- sample(1:length(data$A), length(data$A), replace = TRUE)
##     idxB <- sample(1:length(data$B), length(data$B), replace = TRUE)
##     mean(data$A[idxA]) - mean(data$B[idxB])
## }, R = 2000)
## 
## 
## Bootstrap Statistics :
##      original    bias    std. error
## t1* 0.1746032 -3.404722    1.901877

boot.ci(boot_obj, conf = 0.95, type = c("norm", "basic", "perc", "bca"))

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_obj, conf = 0.95, type = c("norm", "basic", 
##     "perc", "bca"))
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (-0.1483,  7.3069 )   (-0.0635,  7.3333 )  
## 
## Level     Percentile            BCa          
## 95%   (-6.9841,  0.4127 )   (-0.2647,  2.6032 )  
## Calculations and Intervals on Original Scale
## Warning : BCa Intervals used Extreme Quantiles
## Some BCa intervals may be unstable

Bài thực hành 2

Trần Phan Anh Tú

2025-10-22