Bài tập ngày 2

Việc 1. Phân tích mô tả

1.1. Đọc dữ liệu vào R

# Load the obesity dataset from its CSV file and preview the first rows.
ob <- read.csv(file = "E:\\Khoa XD\\Nam 2025-2026\\Tap huan NCKH\\Obesity data.csv")
head(x = ob)

##   id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat hypertension
## 1  1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3            0
## 2  2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8            1
## 3  3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0            1
## 4  4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8            1
## 5  5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8            0
## 6  6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2            1
##   diabetes
## 1        1
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0

1.2. Mô tả đặc điểm

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

# Descriptive summary of the whole sample. hypertension and diabetes are passed
# as stored in ob, and the output below reports them as Mean (SD).
table1(~age + gender + height + weight + pcfat + hypertension + diabetes, data = ob)

	Overall (N=1217)
age
Mean (SD)	47.2 (17.3)
Median [Min, Max]	48.0 [13.0, 88.0]
gender
F	862 (70.8%)
M	355 (29.2%)
height
Mean (SD)	157 (7.98)
Median [Min, Max]	155 [136, 185]
weight
Mean (SD)	55.1 (9.40)
Median [Min, Max]	54.0 [34.0, 95.0]
pcfat
Mean (SD)	31.6 (7.18)
Median [Min, Max]	32.4 [9.20, 48.4]
hypertension
Mean (SD)	0.507 (0.500)
Median [Min, Max]	1.00 [0, 1.00]
diabetes
Mean (SD)	0.111 (0.314)
Median [Min, Max]	0 [0, 1.00]

1.3. Nhận xét kết quả tiền căn bệnh cao huyết áp và tiểu đường

# Add factor copies of the hypertension and diabetes columns, then summarise
# the raw and the factor versions side by side.
ob[["hyper"]] <- as.factor(ob[["hypertension"]])
ob[["dm"]] <- as.factor(ob[["diabetes"]])
table1(
  ~ age + gender + height + weight + pcfat + hypertension + hyper + diabetes + dm,
  data = ob
)

	Overall (N=1217)
age
Mean (SD)	47.2 (17.3)
Median [Min, Max]	48.0 [13.0, 88.0]
gender
F	862 (70.8%)
M	355 (29.2%)
height
Mean (SD)	157 (7.98)
Median [Min, Max]	155 [136, 185]
weight
Mean (SD)	55.1 (9.40)
Median [Min, Max]	54.0 [34.0, 95.0]
pcfat
Mean (SD)	31.6 (7.18)
Median [Min, Max]	32.4 [9.20, 48.4]
hypertension
Mean (SD)	0.507 (0.500)
Median [Min, Max]	1.00 [0, 1.00]
hyper
0	600 (49.3%)
1	617 (50.7%)
diabetes
Mean (SD)	0.111 (0.314)
Median [Min, Max]	0 [0, 1.00]
dm
0	1082 (88.9%)
1	135 (11.1%)

1.4. Trình bày Median (Q1, Q3)

# Summarise the continuous variables as Mean (SD) plus Median [Q1, Q3].
table1(
  ~ age + weight + height + pcfat,
  data = ob,
  render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]")
)

	Overall (N=1217)
age
Mean (SD)	47.2 (17.3)
Median [Q1, Q3]	48.0 [35.0, 58.0]
weight
Mean (SD)	55.1 (9.40)
Median [Q1, Q3]	54.0 [49.0, 61.0]
height
Mean (SD)	157 (7.98)
Median [Q1, Q3]	155 [151, 162]
pcfat
Mean (SD)	31.6 (7.18)
Median [Q1, Q3]	32.4 [27.0, 36.8]

1.5. So sánh nam và nữ

# Same descriptive summary, with one column per gender plus an overall column.
table1(
  ~ age + height + weight + pcfat + hypertension + hyper + diabetes + dm | gender,
  data = ob
)

	F (N=862)	M (N=355)	Overall (N=1217)
age
Mean (SD)	48.6 (16.4)	43.7 (18.8)	47.2 (17.3)
Median [Min, Max]	49.0 [14.0, 85.0]	44.0 [13.0, 88.0]	48.0 [13.0, 88.0]
height
Mean (SD)	153 (5.55)	165 (6.73)	157 (7.98)
Median [Min, Max]	153 [136, 170]	165 [146, 185]	155 [136, 185]
weight
Mean (SD)	52.3 (7.72)	62.0 (9.59)	55.1 (9.40)
Median [Min, Max]	51.0 [34.0, 95.0]	62.0 [38.0, 95.0]	54.0 [34.0, 95.0]
pcfat
Mean (SD)	34.7 (5.19)	24.2 (5.76)	31.6 (7.18)
Median [Min, Max]	34.7 [14.6, 48.4]	24.6 [9.20, 39.0]	32.4 [9.20, 48.4]
hypertension
Mean (SD)	0.501 (0.500)	0.521 (0.500)	0.507 (0.500)
Median [Min, Max]	1.00 [0, 1.00]	1.00 [0, 1.00]	1.00 [0, 1.00]
hyper
0	430 (49.9%)	170 (47.9%)	600 (49.3%)
1	432 (50.1%)	185 (52.1%)	617 (50.7%)
diabetes
Mean (SD)	0.118 (0.323)	0.0930 (0.291)	0.111 (0.314)
Median [Min, Max]	0 [0, 1.00]	0 [0, 1.00]	0 [0, 1.00]
dm
0	760 (88.2%)	322 (90.7%)	1082 (88.9%)
1	102 (11.8%)	33 (9.3%)	135 (11.1%)

1.6. Đánh giá khác biệt giữa nam và nữ

library(compareGroups)
# Compare the listed variables between genders; the printed table includes a
# p.overall column.
createTable(
  compareGroups(
    gender ~ age + height + weight + pcfat + hyper + dm,
    data = ob
  )
)

## 
## --------Summary descriptives table by 'gender'---------
## 
## ________________________________________ 
##             F           M      p.overall 
##           N=862       N=355              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age    48.6 (16.4) 43.7 (18.8)  <0.001   
## height 153 (5.55)  165 (6.73)   <0.001   
## weight 52.3 (7.72) 62.0 (9.59)  <0.001   
## pcfat  34.7 (5.19) 24.2 (5.76)  <0.001   
## hyper:                           0.569   
##     0  430 (49.9%) 170 (47.9%)           
##     1  432 (50.1%) 185 (52.1%)           
## dm:                              0.238   
##     0  760 (88.2%) 322 (90.7%)           
##     1  102 (11.8%) 33 (9.30%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Việc 2. Phân tích khác biệt giữa 2 nhóm

2.1. Nhập dữ liệu vào R

# Enter the measurements for groups A and B, then stack them in long format
# with one group label per observation.
A <- c(14, 4, 10, 6, 3, 11, 12)
B <- c(16, 17, 13, 12, 7, 16, 11, 8, 7)

wt <- append(A, B)
group <- rep(c("A", "B"), times = c(7, 9))
df <- data.frame(wt = wt, group = group)
dim(df)

## [1] 16  2

2.2. Tải trọng có tuân theo phân bố chuẩn

Dùng biểu đồ

library(lessR)

## 
## lessR 4.4.5                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()

## 
## Attaching package: 'lessR'

## The following object is masked from 'package:table1':
## 
##     label

# Histogram of wt (lessR); the call also prints summary statistics and a bin
# frequency table, shown below.
Histogram(wt, data = df)

## >>> Note: wt is not in a data frame (table)
## >>> Note: wt is not in a data frame (table)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(wt, density=TRUE)  # smoothed curve + histogram 
## Plot(wt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- wt --- 
##  
##      n   miss     mean       sd      min      mdn      max 
##      16      0    10.44     4.29     3.00    11.00    17.00 
##  
## 
## No (Box plot) outliers 
## 
## 
## Bin Width: 2 
## Number of Bins: 8 
##  
##      Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ------------------------------------------------- 
##   2 >  4       3      2    0.12        2     0.12 
##   4 >  6       5      1    0.06        3     0.19 
##   6 >  8       7      3    0.19        6     0.38 
##   8 > 10       9      1    0.06        7     0.44 
##  10 > 12      11      4    0.25       11     0.69 
##  12 > 14      13      2    0.12       13     0.81 
##  14 > 16      15      2    0.12       15     0.94 
##  16 > 18      17      1    0.06       16     1.00 
##

Dùng test

# Shapiro-Wilk normality test on the pooled wt values (both groups together).
shapiro.test(df$wt)

## 
##  Shapiro-Wilk normality test
## 
## data:  df$wt
## W = 0.96213, p-value = 0.7006

2.3. Mô tả đặc điểm về tải trọng của 2 nhóm

library(table1)
# Describe wt within each group as Mean (SD) and Median [Q1, Q3].
table1(
  ~ wt | group,
  data = df,
  render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]")
)

	A (N=7)	B (N=9)	Overall (N=16)
wt
Mean (SD)	8.57 (4.24)	11.9 (3.95)	10.4 (4.29)
Median [Q1, Q3]	10.0 [5.00, 11.5]	12.0 [8.00, 16.0]	11.0 [7.00, 13.3]

2.4. Thực hiện phép kiểm t để đánh giá khác biệt về tải trọng của 2 nhóm

# Two-sample t-test on the raw vectors; the output below reports the Welch
# (unequal-variance) version with a two-sided alternative.
t.test(A, B)

## 
##  Welch Two Sample t-test
## 
## data:  A and B
## t = -1.6, df = 12.554, p-value = 0.1345
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.813114  1.178194
## sample estimates:
## mean of x mean of y 
##  8.571429 11.888889

# Same test through the formula interface on the long-format data frame; the
# output below matches the vector call (t = -1.6, p-value = 0.1345).
t.test(wt ~ group, data = df)

## 
##  Welch Two Sample t-test
## 
## data:  wt by group
## t = -1.6, df = 12.554, p-value = 0.1345
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -7.813114  1.178194
## sample estimates:
## mean in group A mean in group B 
##        8.571429       11.888889

2.5. Thực hiện bootstrap

library(simpleboot)

## Simple Bootstrap Routines (1.1-8)

library(boot)
# Bootstrap the difference in means between A and B with 1000 resamples.
# Fix the RNG seed first so the resampled intervals can be reproduced on re-run.
set.seed(123)
b <- two.boot(A, B, mean, R = 1000)
# Request the four interval types explicitly. The default type = "all" also
# attempts a studentized interval, which needs bootstrap variances that are not
# supplied here.
boot.ci(b, type = c("norm", "basic", "perc", "bca"))

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (-7.070,  0.543 )   (-7.205,  0.682 )  
## 
## Level     Percentile            BCa          
## 95%   (-7.317,  0.570 )   (-7.543,  0.362 )  
## Calculations and Intervals on Original Scale

# Histogram of the bootstrap object b, drawn with breaks = 50.
hist(b, breaks = 50)

2.6. Sử dụng ChatGPT

# Observed data for the two groups
A <- c(14, 4, 10, 6, 3, 11, 12)
B <- c(16, 17, 13, 12, 7, 16, 11, 8, 7)

# Number of bootstrap replicates
n_boot <- 10000

# Difference between the mean of x and the mean of y
diff_means <- function(x, y) {
  mean(x) - mean(y)
}

# Bootstrap: for every replicate, resample each group with replacement
# (A first, then B) and record the difference in means.
set.seed(123)  # fixed seed so the replicates are reproducible
boot_diff <- vapply(
  seq_len(n_boot),
  function(i) {
    resampled_a <- sample(A, replace = TRUE)
    resampled_b <- sample(B, replace = TRUE)
    diff_means(resampled_a, resampled_b)
  },
  numeric(1)
)

# Summary statistics
mean_diff <- diff_means(A, B)
ci <- quantile(boot_diff, probs = c(0.025, 0.975))  # 95% percentile interval
# Share of replicates with a difference >= 0 (which tail to use depends on the
# direction of the hypothesis)
p_value <- mean(boot_diff >= 0)

# Print the observed difference in means, mean(A) - mean(B).
cat("Chênh lệch trung bình gốc:", mean_diff, "\n")

## Chênh lệch trung bình gốc: -3.31746

# Print the 2.5% and 97.5% quantiles of the bootstrap differences.
cat("Khoảng tin cậy 95% (bootstrap):", ci, "\n")

## Khoảng tin cậy 95% (bootstrap): -7.047619 0.4285714

# Print the proportion of bootstrap differences that are >= 0.
cat("Xác suất (hiệu trung bình >= 0):", p_value, "\n")

## Xác suất (hiệu trung bình >= 0): 0.0441

# Plot the bootstrap distribution of the difference in means
hist(boot_diff, breaks = 30, col = "lightblue", border = "white",
     main = "Phân phối bootstrap của chênh lệch trung bình (A - B)",
     xlab = "Hiệu trung bình")
# Dashed red vertical lines at the two interval bounds stored in ci
abline(v = ci, col = "red", lwd = 2, lty = 2)
# Solid dark-blue vertical line at the observed difference mean_diff
abline(v = mean_diff, col = "darkblue", lwd = 2)

Bai tap ngay 2

Hien Ngo

2025-10-22

Bài tập ngày 2

Việc 1. Phân tích mô tả

1.1. Đọc dữ liệu vào R

1.2. Mô tả đặc điểm

1.3. Nhận xét kết quả tiền căn bệnh cao huyết áp và tiểu đường

1.4. Trình bày Median (Q1, Q3)

1.5. So sánh nam và nữ

1.6. Đánh giá khác biệt giữa nam và nữ

Việc 2. Phân tích khác biệt giữa 2 nhóm

2.1. Nhập dữ liệu vào R

2.2. Tải trọng có tuân theo phân bố chuẩn

Dùng biểu đồ

Dùng test

2.3. Mô tả đặc điểm về tải trọng của 2 nhóm

2.4. Thực hiện phép kiểm t để đánh giá khác biệt về tải trọng của 2 nhóm

2.5. Thực hiện bootstrap

2.6. Sử dụng ChatGPT