#bai tap ngay 1

viec 1

viec 2: cai dat

ob=read.csv("/Users/meikoi/Downloads/Obesity data.csv")
dim(ob)
## [1] 1217   13
head(ob)
##   id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat hypertension
## 1  1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3            0
## 2  2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8            1
## 3  3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0            1
## 4  4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8            1
## 5  5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8            0
## 6  6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2            1
##   diabetes
## 1        1
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0
tail(ob)
##        id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat
## 1212 1222      F    153     50 21.4  59  1309  0.87 18328 29147  37.6
## 1213 1223      F    150     44 19.6  44  1474  0.95 12906 28534  30.1
## 1214 1224      F    148     51 23.3  58  1522  0.97 14938 33931  29.6
## 1215 1225      F    149     50 22.5  57  1409  0.93 16777 30598  34.4
## 1216 1226      F    144     49 23.6  67  1266  0.90 20094 27272  41.3
## 1217 1227      F    141     45 22.6  58  1228  0.91 14567 28111  33.2
##      hypertension diabetes
## 1212            1        0
## 1213            0        1
## 1214            0        0
## 1215            1        0
## 1216            1        0
## 1217            0        0

##viec 3

ob=read.csv("/Users/meikoi/Downloads/Obesity data.csv")
head(ob)
##   id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat hypertension
## 1  1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3            0
## 2  2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8            1
## 3  3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0            1
## 4  4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8            1
## 5  5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8            0
## 6  6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2            1
##   diabetes
## 1        1
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0
dim(ob)
## [1] 1217   13
men.overweight = subset(ob, gender == "M" & bmi>= 25)
head(men.overweight)
##    id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat hypertension
## 16 16      M    150     70 31.1  49  2084  1.00 16540 49512  24.3            1
## 18 18      M    158     65 26.0  49  2557  1.29 11716 45567  19.6            1
## 19 19      M    162     72 27.4  66  1535  0.86 25416 44577  35.5            1
## 26 26      M    172     78 26.4  52  1991  0.95 19772 55200  25.7            0
## 40 40      M    171     75 25.6  51  2192  1.13 18188 51801  25.2            1
## 69 69      M    168     80 28.3  53  2043  1.06 17907 54594  24.0            0
##    diabetes
## 16        0
## 18        0
## 19        0
## 26        0
## 40        0
## 69        0
dim(men.overweight)
## [1] 85 13
demo=subset(ob, select=c(id, age, gender, weight, height, pcfat))
dim(demo)
## [1] 1217    6

bai thuc hanh ngay 2

##viec 3

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~ age + gender + height + weight + pcfat + hypertension + diabetes, data = ob)
Overall
(N=1217)
age
Mean (SD) 47.2 (17.3)
Median [Min, Max] 48.0 [13.0, 88.0]
gender
F 862 (70.8%)
M 355 (29.2%)
height
Mean (SD) 157 (7.98)
Median [Min, Max] 155 [136, 185]
weight
Mean (SD) 55.1 (9.40)
Median [Min, Max] 54.0 [34.0, 95.0]
pcfat
Mean (SD) 31.6 (7.18)
Median [Min, Max] 32.4 [9.20, 48.4]
hypertension
Mean (SD) 0.507 (0.500)
Median [Min, Max] 1.00 [0, 1.00]
diabetes
Mean (SD) 0.111 (0.314)
Median [Min, Max] 0 [0, 1.00]

viec 4: tao bien so dinh tinh

ob$hyper = as.factor(ob$hypertension)
ob$dm = as.factor(ob$diabete)
table1(~ age + gender + height + weight + pcfat + hypertension + hyper + diabetes + dm, data = ob)
Overall
(N=1217)
age
Mean (SD) 47.2 (17.3)
Median [Min, Max] 48.0 [13.0, 88.0]
gender
F 862 (70.8%)
M 355 (29.2%)
height
Mean (SD) 157 (7.98)
Median [Min, Max] 155 [136, 185]
weight
Mean (SD) 55.1 (9.40)
Median [Min, Max] 54.0 [34.0, 95.0]
pcfat
Mean (SD) 31.6 (7.18)
Median [Min, Max] 32.4 [9.20, 48.4]
hypertension
Mean (SD) 0.507 (0.500)
Median [Min, Max] 1.00 [0, 1.00]
hyper
0 600 (49.3%)
1 617 (50.7%)
diabetes
Mean (SD) 0.111 (0.314)
Median [Min, Max] 0 [0, 1.00]
dm
0 1082 (88.9%)
1 135 (11.1%)

viec 5: so sanh nam va nu

table1(~ age  + height + weight + pcfat + hypertension + hyper + diabetes + dm | gender, data = ob)
F
(N=862)
M
(N=355)
Overall
(N=1217)
age
Mean (SD) 48.6 (16.4) 43.7 (18.8) 47.2 (17.3)
Median [Min, Max] 49.0 [14.0, 85.0] 44.0 [13.0, 88.0] 48.0 [13.0, 88.0]
height
Mean (SD) 153 (5.55) 165 (6.73) 157 (7.98)
Median [Min, Max] 153 [136, 170] 165 [146, 185] 155 [136, 185]
weight
Mean (SD) 52.3 (7.72) 62.0 (9.59) 55.1 (9.40)
Median [Min, Max] 51.0 [34.0, 95.0] 62.0 [38.0, 95.0] 54.0 [34.0, 95.0]
pcfat
Mean (SD) 34.7 (5.19) 24.2 (5.76) 31.6 (7.18)
Median [Min, Max] 34.7 [14.6, 48.4] 24.6 [9.20, 39.0] 32.4 [9.20, 48.4]
hypertension
Mean (SD) 0.501 (0.500) 0.521 (0.500) 0.507 (0.500)
Median [Min, Max] 1.00 [0, 1.00] 1.00 [0, 1.00] 1.00 [0, 1.00]
hyper
0 430 (49.9%) 170 (47.9%) 600 (49.3%)
1 432 (50.1%) 185 (52.1%) 617 (50.7%)
diabetes
Mean (SD) 0.118 (0.323) 0.0930 (0.291) 0.111 (0.314)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
dm
0 760 (88.2%) 322 (90.7%) 1082 (88.9%)
1 102 (11.8%) 33 (9.3%) 135 (11.1%)

viec 6: danh gia su khac biet giua hai nhom

library(compareGroups)
createTable(compareGroups(gender ~ age + weight + height + pcfat + hyper + dm, data =ob))
## 
## --------Summary descriptives table by 'gender'---------
## 
## ________________________________________ 
##             F           M      p.overall 
##           N=862       N=355              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age    48.6 (16.4) 43.7 (18.8)  <0.001   
## weight 52.3 (7.72) 62.0 (9.59)  <0.001   
## height 153 (5.55)  165 (6.73)   <0.001   
## pcfat  34.7 (5.19) 24.2 (5.76)  <0.001   
## hyper:                           0.569   
##     0  430 (49.9%) 170 (47.9%)           
##     1  432 (50.1%) 185 (52.1%)           
## dm:                              0.238   
##     0  760 (88.2%) 322 (90.7%)           
##     1  102 (11.8%) 33 (9.30%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

viec 7: phan tich khac biet giua hai nhom : du lieu tai trong

A = c(14, 4, 10, 6, 3, 11, 12)
B = c(16, 17, 13, 12, 7, 16, 11, 8, 7)
wt = c(A, B)
group = c(rep("A", 7), rep("B", 9))
df = data.frame(wt, group)
dim(df)
## [1] 16  2

viec 8: danh gia phan bo: dung bieu do

library(lessR)
## 
## lessR 4.4.5                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## Attaching package: 'lessR'
## The following object is masked from 'package:table1':
## 
##     label
Histogram(wt, data = df)
## >>> Note: wt is not in a data frame (table)
## >>> Note: wt is not in a data frame (table)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(wt, density=TRUE)  # smoothed curve + histogram 
## Plot(wt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- wt --- 
##  
##      n   miss     mean       sd      min      mdn      max 
##      16      0    10.44     4.29     3.00    11.00    17.00 
##  
## 
## No (Box plot) outliers 
## 
## 
## Bin Width: 2 
## Number of Bins: 8 
##  
##      Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ------------------------------------------------- 
##   2 >  4       3      2    0.12        2     0.12 
##   4 >  6       5      1    0.06        3     0.19 
##   6 >  8       7      3    0.19        6     0.38 
##   8 > 10       9      1    0.06        7     0.44 
##  10 > 12      11      4    0.25       11     0.69 
##  12 > 14      13      2    0.12       13     0.81 
##  14 > 16      15      2    0.12       15     0.94 
##  16 > 18      17      1    0.06       16     1.00 
## 
shapiro.test(df$wt)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$wt
## W = 0.96213, p-value = 0.7006

viec 9: mo ta dac diem tai trong

library(table1)
table1(~ wt | group, data = df, render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]"))
A
(N=7)
B
(N=9)
Overall
(N=16)
wt
Mean (SD) 8.57 (4.24) 11.9 (3.95) 10.4 (4.29)
Median [Q1, Q3] 10.0 [5.00, 11.5] 12.0 [8.00, 16.0] 11.0 [7.00, 13.3]

viec 10: thuc hien phep kiem t

t.test(A, B)
## 
##  Welch Two Sample t-test
## 
## data:  A and B
## t = -1.6, df = 12.554, p-value = 0.1345
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.813114  1.178194
## sample estimates:
## mean of x mean of y 
##  8.571429 11.888889
t.test(wt ~ group, data = df)
## 
##  Welch Two Sample t-test
## 
## data:  wt by group
## t = -1.6, df = 12.554, p-value = 0.1345
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -7.813114  1.178194
## sample estimates:
## mean in group A mean in group B 
##        8.571429       11.888889

viec 11: thuc hien bootstrap

library(simpleboot)
## Simple Bootstrap Routines (1.1-8)
library(boot)
b = two.boot(A, B, mean, R = 1000)
boot.ci(b)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (-7.036,  0.402 )   (-6.919,  0.347 )  
## 
## Level     Percentile            BCa          
## 95%   (-6.982,  0.284 )   (-6.870,  0.319 )  
## Calculations and Intervals on Original Scale
hist(b, breaks = 50)

r <- getOption("repos")
r["CRAN"] <- "http://cran.us.r-project.org"
options(repos = r)
install.packages("boot", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/7k/s0hp0qw11616m5s1zvz3zx640000gn/T//RtmpVr97FL/downloaded_packages
install.packages("boot")
## 
## The downloaded binary packages are in
##  /var/folders/7k/s0hp0qw11616m5s1zvz3zx640000gn/T//RtmpVr97FL/downloaded_packages
library(boot)
A <- c(14, 4, 10, 6, 3, 11, 12)
B <- c(16, 17, 13, 12, 7, 16, 11, 8, 7)

Ham thong ke: hieu so trung binh giua hai nhom

diff_mean <- function(data, indices)
{A_resampled <- data$A[indices[[1]]]
B_resampled <- data$B[indices[[2]]]
return(mean(A_resampled) - mean(B_resampled))}

Ham tien ich de tao chi so bootstrap rieng cho moi nhom

boot_2groups <- function(A, B, R = 2000)
{nA <- length(A)
nB <- length(B)
boot_values <- numeric(R)
for (i in 1:R) {idxA <- sample(1:nA, nA, replace = TRUE)
idxB <- sample(1:nB, nB, replace = TRUE)
boot_values[i] <- mean(A[idxA] - mean(B[idxB]))}
return(boot_values)}

Chay boostrap

set.seed(123)
boot_values <- boot_2groups(A, B, R = 2000)
diff_estimate <- mean(A) - mean(B)
diff_estimate
## [1] -3.31746

Tao doi tuong “boot” de dung ham boot.ci

library(boot)
boot_obj <- boot(data = list(A = A, B = B),
                 statistic = function(data, i) {
                   # i ở đây là vector cho A và B (chúng ta tạo tách riêng)
                   idxA <- sample(1:length(data$A), length(data$A), replace = TRUE)
                   idxB <- sample(1:length(data$B), length(data$B), replace = TRUE)
                   mean(data$A[idxA]) - mean(data$B[idxB])
                 },
                 R = 2000)
boot_obj
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = list(A = A, B = B), statistic = function(data, i) {
##     idxA <- sample(1:length(data$A), length(data$A), replace = TRUE)
##     idxB <- sample(1:length(data$B), length(data$B), replace = TRUE)
##     mean(data$A[idxA]) - mean(data$B[idxB])
## }, R = 2000)
## 
## 
## Bootstrap Statistics :
##      original    bias    std. error
## t1* 0.1746032 -3.404722    1.901877
boot.ci(boot_obj, conf = 0.95, type = c("norm", "basic", "perc", "bca"))
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_obj, conf = 0.95, type = c("norm", "basic", 
##     "perc", "bca"))
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (-0.1483,  7.3069 )   (-0.0635,  7.3333 )  
## 
## Level     Percentile            BCa          
## 95%   (-6.9841,  0.4127 )   (-0.2647,  2.6032 )  
## Calculations and Intervals on Original Scale
## Warning : BCa Intervals used Extreme Quantiles
## Some BCa intervals may be unstable