R Markdown

Việc 1 phân tích mô tả

Đọc dữ liệu

# Load the obesity data set from the local CSV export, then preview the
# first six rows to confirm the columns were parsed as expected.
ob <- read.csv(
  file = "C:\\Users\\DELL\\Downloads\\tap huan khoa hoc\\Obesity data.csv"
)
head(ob, n = 6L)
##   id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat hypertension
## 1  1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3            0
## 2  2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8            1
## 3  3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0            1
## 4  4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8            1
## 5  5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8            0
## 6  6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2            1
##   diabetes
## 1        1
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0

Mô tả các đặc điểm tuổi, giới tính, tiền căn cao huyết áp, tiểu đường

# Min, quartiles, mean and max for the four continuous measurements.
summary(ob[, c("age", "weight", "height", "pcfat")])
##       age            weight          height          pcfat     
##  Min.   :13.00   Min.   :34.00   Min.   :136.0   Min.   : 9.2  
##  1st Qu.:35.00   1st Qu.:49.00   1st Qu.:151.0   1st Qu.:27.0  
##  Median :48.00   Median :54.00   Median :155.0   Median :32.4  
##  Mean   :47.15   Mean   :55.14   Mean   :156.7   Mean   :31.6  
##  3rd Qu.:58.00   3rd Qu.:61.00   3rd Qu.:162.0   3rd Qu.:36.8  
##  Max.   :88.00   Max.   :95.00   Max.   :185.0   Max.   :48.4

Nhận xét kết quả của tiền căn bệnh cao huyết áp và tiểu đường.

# Frequency count of each observed value of the two medical-history columns.
table(ob[["hypertension"]])
table(ob[["diabetes"]])

Trình bày trung vị (Q1, Q3) cho biến liên tục

# Median with first and third quartiles (Q1, Q2, Q3) for each continuous
# variable, one call per column.
quantile(ob[["age"]], probs = c(0.25, 0.5, 0.75))
## 25% 50% 75% 
##  35  48  58
quantile(ob[["weight"]], probs = c(0.25, 0.5, 0.75))
## 25% 50% 75% 
##  49  54  61
quantile(ob[["height"]], probs = c(0.25, 0.5, 0.75))
## 25% 50% 75% 
## 151 155 162
quantile(ob[["pcfat"]], probs = c(0.25, 0.5, 0.75))
##  25%  50%  75% 
## 27.0 32.4 36.8

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Mô tả đặc điểm tuổi, cân nặng theo giới tính

# Group means by gender for the continuous measurements.
aggregate(
  cbind(age, weight, height, pcfat) ~ gender,
  data = ob,
  FUN = mean
)
##   gender      age   weight   height    pcfat
## 1      F 48.57309 52.31090 153.2912 34.67241
## 2      M 43.70141 62.02254 165.0592 24.15607
# Group means by gender for the hypertension and diabetes columns.
aggregate(
  cbind(hypertension, diabetes) ~ gender,
  data = ob,
  FUN = mean
)
##   gender hypertension   diabetes
## 1      F    0.5011601 0.11832947
## 2      M    0.5211268 0.09295775

Sự khác biệt giữa nam và nữ

# Welch (unequal-variance) two-sample t-test of age between genders.
t.test(age ~ gender, data = ob, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  age by gender
## t = 4.2608, df = 587.49, p-value = 2.372e-05
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  2.626083 7.117272
## sample estimates:
## mean in group F mean in group M 
##        48.57309        43.70141
# Welch (unequal-variance) two-sample t-test of weight between genders.
t.test(weight ~ gender, data = ob, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  weight by gender
## t = -16.952, df = 551.85, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -10.836942  -8.586319
## sample estimates:
## mean in group F mean in group M 
##        52.31090        62.02254
# Welch (unequal-variance) two-sample t-test of height between genders.
t.test(height ~ gender, data = ob, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  height by gender
## t = -29.125, df = 562.31, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -12.56161 -10.97433
## sample estimates:
## mean in group F mean in group M 
##        153.2912        165.0592
# Welch (unequal-variance) two-sample t-test of pcfat between genders.
t.test(pcfat ~ gender, data = ob, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  pcfat by gender
## t = 29.768, df = 602.01, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##   9.822548 11.210140
## sample estimates:
## mean in group F mean in group M 
##        34.67241        24.15607
# Chi-squared test of association between gender and hypertension, with the
# Yates continuity correction stated explicitly.
chisq.test(x = table(ob$gender, ob$hypertension), correct = TRUE)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(ob$gender, ob$hypertension)
## X-squared = 0.32515, df = 1, p-value = 0.5685
# Chi-squared test of association between gender and diabetes, with the
# Yates continuity correction stated explicitly.
chisq.test(x = table(ob$gender, ob$diabetes), correct = TRUE)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(ob$gender, ob$diabetes)
## X-squared = 1.394, df = 1, p-value = 0.2377

Việc 2 Phân tích khác biệt giữa 2 nhóm

2.1 Nhập dữ liệu nhóm A và B vào R

# Observed measurements for the two groups being compared
# (7 values in group A, 9 values in group B).
Nhom_A <- c(
  14, 4, 10, 6, 3, 11, 12
)
Nhom_B <- c(
  16, 17, 13, 12, 7, 16, 11, 8, 7
)

2.2 Kiểm tra tải trọng có tuân theo phân bố chuẩn không

# Shapiro-Wilk normality check, run separately on each group.
shapiro.test(x = Nhom_A)
## 
##  Shapiro-Wilk normality test
## 
## data:  Nhom_A
## W = 0.92541, p-value = 0.5126
shapiro.test(x = Nhom_B)
## 
##  Shapiro-Wilk normality test
## 
## data:  Nhom_B
## W = 0.89641, p-value = 0.2319

2.3 Mô tả đặc điểm tải trọng giữa hai nhóm

# Descriptive statistics (min, quartiles, mean, max) for each group.
summary(object = Nhom_A)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000  10.000   8.571  11.500  14.000
summary(object = Nhom_B)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.00    8.00   12.00   11.89   16.00   17.00

2.4 Thực hiện kiểm định t-test

# Welch (unequal-variance) two-sample t-test comparing the group means.
t.test(x = Nhom_A, y = Nhom_B, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Nhom_A and Nhom_B
## t = -1.6, df = 12.554, p-value = 0.1345
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.813114  1.178194
## sample estimates:
## mean of x mean of y 
##  8.571429 11.888889

2.5 Thực hiện bootstrap để đánh giá khác biệt trung bình

# Bootstrap the difference in group means (A minus B). Each of the 1000
# iterations resamples both groups with replacement at their original
# sizes; the seed is fixed so the draws are reproducible.
set.seed(123)
bootstrap_diff_mean <- vapply(
  seq_len(1000),
  function(iteration) {
    resampled_a <- sample(Nhom_A, replace = TRUE)
    resampled_b <- sample(Nhom_B, replace = TRUE)
    mean(resampled_a) - mean(resampled_b)
  },
  numeric(1)
)

# 2.5th, 50th and 97.5th percentiles of the bootstrapped mean differences.
quantile(bootstrap_diff_mean, probs = c(0.025, 0.5, 0.975))
##       2.5%        50%      97.5% 
## -7.0952381 -3.3174603  0.4924603
# Distribution of the bootstrapped mean differences.
hist(
  bootstrap_diff_mean,
  main = "Bootstrap Difference in Means",
  xlab = "Mean Difference"
)

2.6 Bootstrap so sánh trung vị

# Bootstrap the difference in group medians (A minus B). Each of the 1000
# iterations resamples both groups with replacement at their original
# sizes; the seed is fixed so the draws are reproducible.
set.seed(123)
bootstrap_diff_median <- vapply(
  seq_len(1000),
  function(iteration) {
    resampled_a <- sample(Nhom_A, replace = TRUE)
    resampled_b <- sample(Nhom_B, replace = TRUE)
    median(resampled_a) - median(resampled_b)
  },
  numeric(1)
)

# 2.5th, 50th and 97.5th percentiles of the bootstrapped median differences.
quantile(bootstrap_diff_median, probs = c(0.025, 0.5, 0.975))
##  2.5%   50% 97.5% 
##   -10    -2     4
# Distribution of the bootstrapped median differences.
hist(
  bootstrap_diff_median,
  main = "Bootstrap Difference in Medians",
  xlab = "Median Difference"
)