Bài tập ngày 2

#Việc 1. Phân tích mô tả ##1.1 Đọc dữ liệu “Obesity data.csv” vào R và gọi dữ liệu là “ob”

file.choose()

## [1] "D:\\NCKH-RSTUDIO\\Bài tập ngày 2\\Bai tap ngay 2.Rmd"

ob = read.csv("D:\\TẬP HUẤN R&R STUDIO\\Obesity data.csv")

##1.2 Mô tả đặc điểm tuổi (age), giới tính (gender), cân nặng (weight), chiều cao (height), tỉ trọng mỡ (pcfat), tiền căn bệnh cao huyết áp (hypertension)và tiền căn bệnh tiểu đường (diabetes)

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

table1(~age + gender + weight + height + pcfat + hypertension + diabetes, data=ob)

	Overall (N=1217)
age
Mean (SD)	47.2 (17.3)
Median [Min, Max]	48.0 [13.0, 88.0]
gender
F	862 (70.8%)
M	355 (29.2%)
weight
Mean (SD)	55.1 (9.40)
Median [Min, Max]	54.0 [34.0, 95.0]
height
Mean (SD)	157 (7.98)
Median [Min, Max]	155 [136, 185]
pcfat
Mean (SD)	31.6 (7.18)
Median [Min, Max]	32.4 [9.20, 48.4]
hypertension
Mean (SD)	0.507 (0.500)
Median [Min, Max]	1.00 [0, 1.00]
diabetes
Mean (SD)	0.111 (0.314)
Median [Min, Max]	0 [0, 1.00]

##1.3 Bạn nhận xét như thế nào về kết quả của tiền căn bệnh cao huyết áp và tiểu đường. Làm cách nào để trình bày kết quả tốt hơn? Do giá trị mã hóa của tiền căn bệnh cao huyết áp và tiểu đường là 0 và 1 nên hệ thống hiểu nó là biến liên tục. Kết quả thống kê hiển thị Mean. Nhưng điều cần biết là số lượng và tỷ lệ mắc bệnh. => cần dùng hàm as.factor để chuuyển thành biến phân loại

table1(~age + gender + weight + height + pcfat + as.factor(hypertension) + as.factor(diabetes), data=ob)

	Overall (N=1217)
age
Mean (SD)	47.2 (17.3)
Median [Min, Max]	48.0 [13.0, 88.0]
gender
F	862 (70.8%)
M	355 (29.2%)
weight
Mean (SD)	55.1 (9.40)
Median [Min, Max]	54.0 [34.0, 95.0]
height
Mean (SD)	157 (7.98)
Median [Min, Max]	155 [136, 185]
pcfat
Mean (SD)	31.6 (7.18)
Median [Min, Max]	32.4 [9.20, 48.4]
as.factor(hypertension)
0	600 (49.3%)
1	617 (50.7%)
as.factor(diabetes)
0	1082 (88.9%)
1	135 (11.1%)

##1.4.Trình bày kết quả trung vị (Q1, Q3) và (min, max) cho biến liên tục.

my.render.cont <- function(x) {
q <- stats::quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
m <- mean(x, na.rm = TRUE)
sdv <- sd(x, na.rm = TRUE)
out <- c(" " = "",  # Dòng trống để giữ format
"Mean (SD)" = sprintf("%.1f (%.1f)", m, sdv),
"Median (Q1, Q3)" = sprintf("%.1f (%.1f, %.1f)", q[3], q[2], q[4]),
"Median (Min, Max)" = sprintf("%.1f [%.1f, %.1f]", q[3], q[1], q[5])
)
return(out)
}

table1(~ age + gender + height + weight + pcfat + as.factor(hypertension) + as.factor(diabetes),data = ob,render.continuous = my.render.cont)

	Overall (N=1217)
age
Mean (SD)	47.2 (17.3)
Median (Q1, Q3)	48.0 (35.0, 58.0)
Median (Min, Max)	48.0 [13.0, 88.0]
gender
F	862 (70.8%)
M	355 (29.2%)
height
Mean (SD)	156.7 (8.0)
Median (Q1, Q3)	155.0 (151.0, 162.0)
Median (Min, Max)	155.0 [136.0, 185.0]
weight
Mean (SD)	55.1 (9.4)
Median (Q1, Q3)	54.0 (49.0, 61.0)
Median (Min, Max)	54.0 [34.0, 95.0]
pcfat
Mean (SD)	31.6 (7.2)
Median (Q1, Q3)	32.4 (27.0, 36.8)
Median (Min, Max)	32.4 [9.2, 48.4]
as.factor(hypertension)
0	600 (49.3%)
1	617 (50.7%)
as.factor(diabetes)
0	1082 (88.9%)
1	135 (11.1%)

##1.5 Mô tả đặc điểm tuổi (age), cân nặng (weight), chiều cao (height), tỉ trọng mỡ (pcfat) và tiền căn bệnh cao huyết áp (hypertension) theo giới tính (gender)

table1(~ age + height + weight + pcfat + as.factor(hypertension) + as.factor(diabetes)|gender,data = ob)

	F (N=862)	M (N=355)	Overall (N=1217)
age
Mean (SD)	48.6 (16.4)	43.7 (18.8)	47.2 (17.3)
Median [Min, Max]	49.0 [14.0, 85.0]	44.0 [13.0, 88.0]	48.0 [13.0, 88.0]
height
Mean (SD)	153 (5.55)	165 (6.73)	157 (7.98)
Median [Min, Max]	153 [136, 170]	165 [146, 185]	155 [136, 185]
weight
Mean (SD)	52.3 (7.72)	62.0 (9.59)	55.1 (9.40)
Median [Min, Max]	51.0 [34.0, 95.0]	62.0 [38.0, 95.0]	54.0 [34.0, 95.0]
pcfat
Mean (SD)	34.7 (5.19)	24.2 (5.76)	31.6 (7.18)
Median [Min, Max]	34.7 [14.6, 48.4]	24.6 [9.20, 39.0]	32.4 [9.20, 48.4]
as.factor(hypertension)
0	430 (49.9%)	170 (47.9%)	600 (49.3%)
1	432 (50.1%)	185 (52.1%)	617 (50.7%)
as.factor(diabetes)
0	760 (88.2%)	322 (90.7%)	1082 (88.9%)
1	102 (11.8%)	33 (9.3%)	135 (11.1%)

##1.6 Đánh giá xem đặc điểm nào là khác biệt đáng kể (significant difference) giữa nam và nữ Vì trong compareGroups không được dùng hàm như as.factor() trực tiếp.compareGroups() yêu cầu các biến phải có kiểu dữ liệu phù hợp (numeric hoặc factor) trước khi đưa vào công thức. Do đó, cần tạo biến định danh cho hypertension và diabetes

ob$hyper <- factor(ob$hypertension, labels = c("Không", "Có"))
ob$diab <- factor(ob$diabetes, labels = c("Không", "Có"))

library(compareGroups)
createTable(compareGroups(gender~age + height + weight + pcfat + hyper + diab,data = ob))

## 
## --------Summary descriptives table by 'gender'---------
## 
## ___________________________________________ 
##                F           M      p.overall 
##              N=862       N=355              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age       48.6 (16.4) 43.7 (18.8)  <0.001   
## height    153 (5.55)  165 (6.73)   <0.001   
## weight    52.3 (7.72) 62.0 (9.59)  <0.001   
## pcfat     34.7 (5.19) 24.2 (5.76)  <0.001   
## hyper:                              0.569   
##     Không 430 (49.9%) 170 (47.9%)           
##     Có    432 (50.1%) 185 (52.1%)           
## diab:                               0.238   
##     Không 760 (88.2%) 322 (90.7%)           
##     Có    102 (11.8%) 33 (9.30%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

#Việc 2. Phân tích khác biệt giữa 2 nhóm ##2.1 Giả sử bạn có dữ liệu về tải trọng của 2 nhóm A và B như sau: Nhóm A (n= 7): 14, 4, 10, 6, 3, 11, 12 Nhóm B (n= 9): 16, 17, 13, 12, 7, 16, 11, 8, 7 Nhập dữ liệu trên vào R.

A=c(14, 4, 10, 6, 3, 11, 12)
B=c(16, 17, 13, 12, 7, 16, 11, 8, 7)
wt=c(A,B)
group=c(rep("A", 7), rep("B", 9))
df <- data.frame(wt, group)

df

##    wt group
## 1  14     A
## 2   4     A
## 3  10     A
## 4   6     A
## 5   3     A
## 6  11     A
## 7  12     A
## 8  16     B
## 9  17     B
## 10 13     B
## 11 12     B
## 12  7     B
## 13 16     B
## 14 11     B
## 15  8     B
## 16  7     B

dim(df)

## [1] 16  2

##2.2 Tải trọng có tuân theo phân bổ chuẩn (normal distribution) không? Tải gói LessR: install.packages(“lessR”) Trong R, khi gọi install.packages(), tên gói phải được đặt trong dấu ngoặc kép. Ví dụ: install.packages(“lessR”) gọi gói về:

library(lessR)

## 
## lessR 4.4.5                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()

## 
## Attaching package: 'lessR'

## The following object is masked from 'package:table1':
## 
##     label

#Vẽ biểu đồ: wt là tải trọng cần vẽ, lấy từ data là bảng df đã tạo

Histogram(wt,data=df)

## >>> Note: wt is not in a data frame (table)
## >>> Note: wt is not in a data frame (table)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(wt, density=TRUE)  # smoothed curve + histogram 
## Plot(wt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- wt --- 
##  
##      n   miss     mean       sd      min      mdn      max 
##      16      0    10.44     4.29     3.00    11.00    17.00 
##  
## 
## No (Box plot) outliers 
## 
## 
## Bin Width: 2 
## Number of Bins: 8 
##  
##      Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ------------------------------------------------- 
##   2 >  4       3      2    0.12        2     0.12 
##   4 >  6       5      1    0.06        3     0.19 
##   6 >  8       7      3    0.19        6     0.38 
##   8 > 10       9      1    0.06        7     0.44 
##  10 > 12      11      4    0.25       11     0.69 
##  12 > 14      13      2    0.12       13     0.81 
##  14 > 16      15      2    0.12       15     0.94 
##  16 > 18      17      1    0.06       16     1.00 
##

#Dùng Shapiro.test để kiểm định phân phối chuẩn của wt.

shapiro.test(df$wt)

## 
##  Shapiro-Wilk normality test
## 
## data:  df$wt
## W = 0.96213, p-value = 0.7006

#Kết luận: p=0.7006 > 0.5 => phân phối chuẩn ##2.3 Mô tả đặc điểm về tải trọng giữa 2 nhóm

table1(~wt,data=df)

	Overall (N=16)
wt
Mean (SD)	10.4 (4.29)
Median [Min, Max]	11.0 [3.00, 17.0]

##2.4 Thực hiện phép kiểm t để đánh giá khác biệt về tải trọng của 2 nhóm. Bạn nhận xét gì về kết quả này. t-test có thể dùng 2 lệnh khác nhau, nhưng kết quả như nhau: thứ nhất: t.test(A,B). Lệnh này chỉ cần có dữ liệu của A và B là làm được. thứ hai: t.test(wt~group,data=df). Lệnh này dùng dữ liệu đã gộp thành bảng.

t.test(A,B)

## 
##  Welch Two Sample t-test
## 
## data:  A and B
## t = -1.6, df = 12.554, p-value = 0.1345
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.813114  1.178194
## sample estimates:
## mean of x mean of y 
##  8.571429 11.888889

t.test(wt~group,data=df)

## 
##  Welch Two Sample t-test
## 
## data:  wt by group
## t = -1.6, df = 12.554, p-value = 0.1345
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -7.813114  1.178194
## sample estimates:
## mean in group A mean in group B 
##        8.571429       11.888889

#kết luận: p>0.05 => không có sự khác biệt. Hiểu 1 cách khác, khoảng ước lượng với độ tin cậy 95% cóchứa giá trị0 (từ -7,8 đến 1,18 sẽ đi qua giá trị 0)=> chênh lệch (difference=A-B)=0=> có khả năng A=B=> không khác biệt

##2.5 Thực hiện bootstrap để đánh giá khác biệt trung bình tải trọng giữa 2 nhóm. Bạn nhận xét gì về kết quả này. Nếu dữ liệu không phân phối chuẩn, sẽ thực hiện bootstrap thay cho t-test

library(simpleboot)

## Simple Bootstrap Routines (1.1-8)

library(boot)
b=two.boot(A,B,mean,R=1000)
boot.ci(b)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (-7.173,  0.393 )   (-7.078,  0.444 )  
## 
## Level     Percentile            BCa          
## 95%   (-7.079,  0.443 )   (-7.359,  0.037 )  
## Calculations and Intervals on Original Scale

#VẼ BIỂU ĐỒ PHÂN BỔ

hist(b,breaks=50)

Bài tập ngày 2

Nguyễn Xuân Vinh

2025-10-23