Ngày 3: Phân tích mô tả…độ lệch chuẩn, sai số chuẩn…

##1. Lấy data birthwt

n3=read.csv("D:\\OneDrive\\Documents\\HPK1 2018\\NVPS2025\\Viet bao khoa hoc paper\\R\\DU LIEU THUC HANH\\birthwt.csv")

##2. đọc packages Table1, Mô tả đặc điểm tuổi mẹ, cân nặng mẹ và cân nặng con

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~ age + lwt + bwt, data = n3)
Overall
(N=189)
age
Mean (SD) 23.2 (5.30)
Median [Min, Max] 23.0 [14.0, 45.0]
lwt
Mean (SD) 130 (30.6)
Median [Min, Max] 121 [80.0, 250]
bwt
Mean (SD) 2940 (729)
Median [Min, Max] 2980 [709, 4990]

##3. Mô tả đặc điểm tuổi mẹ, cân nặng mẹ, tình trạng hút thuốc trong thai kỳ, chủng tộc và cân nặng con theo tình trạng trẻ sanh thiếu cân

n3$race.c = as.character(factor(n3$race, levels = c(1, 2, 3), labels = c("White", "Black", "Other")))
n3$smoke.c = as.character(factor(n3$smoke, levels = c(0, 1), labels = c("No", "Yes")))
n3$low.c = as.character(factor(n3$low, levels = c(0, 1), labels = c("No LBW", "Low birthweight")))
table1(~ age + lwt + smoke.c + race.c + bwt | low.c, data = n3)
Low birthweight
(N=59)
No LBW
(N=130)
Overall
(N=189)
age
Mean (SD) 22.3 (4.51) 23.7 (5.58) 23.2 (5.30)
Median [Min, Max] 22.0 [14.0, 34.0] 23.0 [14.0, 45.0] 23.0 [14.0, 45.0]
lwt
Mean (SD) 122 (26.6) 133 (31.7) 130 (30.6)
Median [Min, Max] 120 [80.0, 200] 124 [85.0, 250] 121 [80.0, 250]
smoke.c
No 29 (49.2%) 86 (66.2%) 115 (60.8%)
Yes 30 (50.8%) 44 (33.8%) 74 (39.2%)
race.c
Black 11 (18.6%) 15 (11.5%) 26 (13.8%)
Other 25 (42.4%) 42 (32.3%) 67 (35.4%)
White 23 (39.0%) 73 (56.2%) 96 (50.8%)
bwt
Mean (SD) 2100 (391) 3330 (478) 2940 (729)
Median [Min, Max] 2210 [709, 2500] 3270 [2520, 4990] 2980 [709, 4990]

##4.Sử dụng gói lệnh ‘table1’ để mô tả đặc điểm tuổi của mẹ (age), cân nặng của mẹ (lwt), tình trạng hút thuốc trong thai kỳ (smoke), chủng tộc (race), và cân nặng của con (bwt) theo tình trạng trẻ thiếu cân (low)

 # 2. Chuyển các biến thành factor với nhãn trực quan
n3$low   <- factor(n3$low,   levels = c(0,1), labels = c("Normal", "Low"))
n3$smoke <- factor(n3$smoke, levels = c(0,1), labels = c("No", "Yes"))
n3$race  <- factor(n3$race,  levels = c(1,2,3),
labels = c("Caucasian", "African American", "Other"))

# 3. Đặt nhãn cho biến (table1 sẽ dùng nhãn này thay tên cột)
label(n3$age)   <- "Mother's Age (years)"
label(n3$lwt)   <- "Mother's Weight (lbs)"
label(n3$smoke) <- "Smoking During Pregnancy"
label(n3$race)  <- "Mother's Race"
label(n3$bwt)   <- "Infant Birth Weight (g)"

# 4. Tạo bảng mô tả
table1(~ age + lwt + smoke + race + bwt | low,
data = n3,
caption = "Descriptive characteristics by birthweight status",
render.missing = FALSE)
Descriptive characteristics by birthweight status
Normal
(N=130)
Low
(N=59)
Overall
(N=189)
Mother's Age (years)
Mean (SD) 23.7 (5.58) 22.3 (4.51) 23.2 (5.30)
Median [Min, Max] 23.0 [14.0, 45.0] 22.0 [14.0, 34.0] 23.0 [14.0, 45.0]
Mother's Weight (lbs)
Mean (SD) 133 (31.7) 122 (26.6) 130 (30.6)
Median [Min, Max] 124 [85.0, 250] 120 [80.0, 200] 121 [80.0, 250]
Smoking During Pregnancy
No 86 (66.2%) 29 (49.2%) 115 (60.8%)
Yes 44 (33.8%) 30 (50.8%) 74 (39.2%)
Mother's Race
Caucasian 73 (56.2%) 23 (39.0%) 96 (50.8%)
African American 15 (11.5%) 11 (18.6%) 26 (13.8%)
Other 42 (32.3%) 25 (42.4%) 67 (35.4%)
Infant Birth Weight (g)
Mean (SD) 3330 (478) 2100 (391) 2940 (729)
Median [Min, Max] 3270 [2520, 4990] 2210 [709, 2500] 2980 [709, 4990]

##5. Vẽ histogram đánh giá phân bố mật độ xương

xu = read.csv("D:\\OneDrive\\Documents\\HPK1 2018\\NVPS2025\\Viet bao khoa hoc paper\\R\\DU LIEU THUC HANH\\Bone data.csv")
dim(xu)
## [1] 2162    9
library(lessR)
## 
## lessR 4.4.2                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## Attaching package: 'lessR'
## The following object is masked from 'package:table1':
## 
##     label
## The following object is masked from 'package:base':
## 
##     sort_by
Histogram(fnbmd, fill = "blue", xlab = "Mật độ xương (g/cm2)", ylab = "Frequency",  data = xu)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(fnbmd, density=TRUE)  # smoothed curve + histogram 
## Plot(fnbmd)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- fnbmd --- 
##  
##        n    miss      mean        sd       min       mdn       max 
##      2122      40     0.829     0.155     0.280     0.820     1.510 
##  
## 
##   
## --- Outliers ---     from the box plot: 33 
##  
## Small      Large 
## -----      ----- 
##  0.3      1.5 
##  0.3      1.5 
##  0.4      1.4 
##  0.4      1.4 
##  0.4      1.4 
##  0.4      1.4 
##  0.4      1.4 
##  0.4      1.4 
##  0.4      1.3 
##  0.4      1.3 
##  0.4      1.3 
##           1.3 
##           1.3 
##           1.3 
##           1.3 
##           1.2 
##           1.2 
##           1.2 
## 
## + 15 more outliers 
## 
## 
## Bin Width: 0.1 
## Number of Bins: 14 
##  
##        Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## --------------------------------------------------- 
##  0.2 > 0.3    0.25      1    0.00        1     0.00 
##  0.3 > 0.4    0.35      9    0.00       10     0.00 
##  0.4 > 0.5    0.45     15    0.01       25     0.01 
##  0.5 > 0.6    0.55    103    0.05      128     0.06 
##  0.6 > 0.7    0.65    306    0.14      434     0.20 
##  0.7 > 0.8    0.75    522    0.24      956     0.44 
##  0.8 > 0.9    0.85    534    0.25     1490     0.69 
##  0.9 > 1.0    0.95    371    0.17     1861     0.86 
##  1.0 > 1.1    1.05    183    0.08     2044     0.95 
##  1.1 > 1.2    1.15     48    0.02     2092     0.97 
##  1.2 > 1.3    1.25     21    0.01     2113     0.98 
##  1.3 > 1.4    1.35      6    0.00     2119     0.98 
##  1.4 > 1.5    1.45      2    0.00     2121     0.98 
##  1.5 > 1.6    1.55      1    0.00     2122     0.98 
## 

##6. So sánh mật độ xương cổ xương đùi giữa namvà nữ ữfnbmd: mật độ xương

 library(table1)
table1(~ fnbmd | sex, data = xu)
Female
(N=1317)
Male
(N=845)
Overall
(N=2162)
fnbmd
Mean (SD) 0.778 (0.132) 0.910 (0.153) 0.829 (0.155)
Median [Min, Max] 0.770 [0.280, 1.31] 0.900 [0.340, 1.51] 0.820 [0.280, 1.51]
Missing 17 (1.3%) 23 (2.7%) 40 (1.9%)
t.test(fnbmd ~ sex, data = xu)
## 
##  Welch Two Sample t-test
## 
## data:  fnbmd by sex
## t = -20.407, df = 1561, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -0.1448770 -0.1194686
## sample estimates:
## mean in group Female   mean in group Male 
##            0.7775231            0.9096959

##7.T-Test

library(lessR)
ttest(fnbmd ~ sex, data = xu)
## 
## Compare fnbmd across sex with levels Male and Female 
## Grouping Variable:  sex
## Response Variable:  fnbmd
## 
## 
## ------ Describe ------
## 
## fnbmd for sex Male:  n.miss = 23,  n = 822,  mean = 0.910,  sd = 0.153
## fnbmd for sex Female:  n.miss = 17,  n = 1300,  mean = 0.778,  sd = 0.132
## 
## Mean Difference of fnbmd:  0.132
## 
## Weighted Average Standard Deviation:   0.141 
## 
## 
## ------ Assumptions ------
## 
## Note: These hypothesis tests can perform poorly, and the 
##       t-test is typically robust to violations of assumptions. 
##       Use as heuristic guides instead of interpreting literally. 
## 
## Null hypothesis, for each group, is a normal distribution of fnbmd.
## Group Male: Sample mean assumed normal because n > 30, so no test needed.
## Group Female: Sample mean assumed normal because n > 30, so no test needed.
## 
## Null hypothesis is equal variances of fnbmd, homogeneous.
## Variance Ratio test:  F = 0.023/0.018 = 1.336,  df = 821;1299,  p-value = 0.000
## Levene's test, Brown-Forsythe:  t = 3.449,  df = 2120,  p-value = 0.001
## 
## 
## ------ Infer ------
## 
## --- Assume equal population variances of fnbmd for each sex 
## 
## t-cutoff for 95% range of variation: tcut =  1.961 
## Standard Error of Mean Difference: SE =  0.006 
## 
## Hypothesis Test of 0 Mean Diff:  t-value = 21.080,  df = 2120,  p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.012
## 95% Confidence Interval for Mean Difference:  0.120 to 0.144
## 
## 
## --- Do not assume equal population variances of fnbmd for each sex 
## 
## t-cutoff: tcut =  1.961 
## Standard Error of Mean Difference: SE =  0.006 
## 
## Hypothesis Test of 0 Mean Diff:  t = 20.407,  df = 1560.981, p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.013
## 95% Confidence Interval for Mean Difference:  0.119 to 0.145
## 
## 
## ------ Effect Size ------
## 
## --- Assume equal population variances of fnbmd for each sex 
## 
## Standardized Mean Difference of fnbmd, Cohen's d:  0.939
## 
## 
## ------ Practical Importance ------
## 
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
## 
## 
## ------ Graphics Smoothing Parameter ------
## 
## Density bandwidth for sex Male: 0.044
## Density bandwidth for sex Female: 0.034

##8.Đánh giá ảnh hưởng của cafe lên RER, phân tích sự khác biệt

placebo = c(105, 119, 100, 97, 96, 101, 94, 95, 98)
coffee = c(96, 99, 94, 89, 96, 93, 88, 105, 88)

###8.1. Cài pack simpleboot ###8.2. Chạy bootstrap phân tích

library(simpleboot)
## Simple Bootstrap Routines (1.1-8)
library(boot)
b = two.boot(placebo, coffee, mean, R = 500)
boot.ci(b)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 0.639, 12.122 )   ( 0.563, 11.889 )  
## 
## Level     Percentile            BCa          
## 95%   ( 0.778, 12.104 )   ( 0.867, 12.519 )  
## Calculations and Intervals on Original Scale

###8.3. vẽ histogram

hist(b, breaks = 50)

### chat GPT # 1. Khởi tạo dữ liệu placebo <- c(105, 119, 100, 97, 96, 101, 94, 95, 98) coffee <- c( 96, 99, 94, 89, 96, 93, 88, 105, 88)

2. Định nghĩa hàm tính hiệu số trung bình

data: vector dài 2*n, indices: chỉ số bootstrap

diff_mean <- function(data, indices) { n <- length(data) / 2 x <- data[indices] mean(x[1:n]) - mean(x[(n+1):(2*n)]) }

3. Kết hợp hai nhóm vào 1 vector

d_all <- c(placebo, coffee)

4. Chạy bootstrap với 10.000 lặp

library(boot) set.seed(123) # để kết quả có thể tái lập boot_res <- boot(data = d_all, statistic = diff_mean, R = 10000)

5. Xem kết quả

boot_res

ORDINARY NONPARAMETRIC BOOTSTRAP

Call: boot(data = d_all, statistic = diff_mean, R = 10000)

Bootstrap Statistics : original bias std. error t1* 6.333333 -6.341222 3.341927 # 6. Khoảng tin cậy 95% boot.ci(boot_res, type = c(“perc”, “bca”)) BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS Based on 10000 bootstrap replicates

CALL : boot.ci(boot.out = boot_res, type = c(“perc”, “bca”))

Intervals : Level Percentile BCa
95% (-6.556, 6.556 ) ( 6.111, 11.556 )
Calculations and Intervals on Original Scale Warning : BCa Intervals used Extreme Quantiles Some BCa intervals may be unstable

7. Vẽ histogram phân phối bootstrap

plot(boot_res) ###

##9.So sánh 2 nhóm - biến phân loại ###9.1.Việc 5. So sánh tỉ lệ gãy xương giữa nam và nữ

table1(~ as.factor(fx) | sex, data = xu)
Female
(N=1317)
Male
(N=845)
Overall
(N=2162)
as.factor(fx)
0 916 (69.6%) 701 (83.0%) 1617 (74.8%)
1 401 (30.4%) 144 (17.0%) 545 (25.2%)

###9.2.ki bình phương chi sq test

chisq.test(xu$fx, xu$sex)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  xu$fx and xu$sex
## X-squared = 48.363, df = 1, p-value = 0.000000000003542

##Kết thúc