df = read.csv("/Users/trangan/Desktop/AI in Data Analysis Course/Data Practice/Bone data.csv")
dim(df)
## [1] 2162 9
head(df)
## id sex age weight height prior.fx fnbmd smoking fx
## 1 1 Male 73 98 175 0 1.08 1 0
## 2 2 Female 68 72 166 0 0.97 0 0
## 3 3 Male 68 87 184 0 1.01 0 0
## 4 4 Female 62 72 173 0 0.84 1 0
## 5 5 Male 61 72 173 0 0.81 1 0
## 6 6 Female 76 57 156 0 0.74 0 0
summary(df)
## id sex age weight
## Min. : 1.0 Length:2162 Min. :57.00 Min. : 34.00
## 1st Qu.: 545.2 Class :character 1st Qu.:65.00 1st Qu.: 60.00
## Median :1093.5 Mode :character Median :69.00 Median : 69.00
## Mean :1094.7 Mean :70.65 Mean : 70.14
## 3rd Qu.:1639.8 3rd Qu.:75.00 3rd Qu.: 79.00
## Max. :2216.0 Max. :94.00 Max. :133.00
## NA's :1
## height prior.fx fnbmd smoking
## Min. :136.0 Min. :0.0000 Min. :0.2800 Min. :0.0000
## 1st Qu.:158.0 1st Qu.:0.0000 1st Qu.:0.7300 1st Qu.:0.0000
## Median :164.0 Median :0.0000 Median :0.8200 Median :0.0000
## Mean :164.9 Mean :0.1554 Mean :0.8287 Mean :0.4251
## 3rd Qu.:171.0 3rd Qu.:0.0000 3rd Qu.:0.9300 3rd Qu.:1.0000
## Max. :196.0 Max. :1.0000 Max. :1.5100 Max. :1.0000
## NA's :40
## fx
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2521
## 3rd Qu.:1.0000
## Max. :1.0000
##
#install.packages("lessR") # chỉ cần chạy 1 lần
library(lessR)
## Warning: package 'lessR' was built under R version 4.4.1
##
## lessR 4.5 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is the default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including modern time series forecasting
## and many, new Plotly interactive visualizations output. Most
## visualization functions are now reorganized to three functions:
## Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
## X(): type="histogram", "density", "vbs" and more
## XY(): type="scatter" for a scatterplot, or "contour", "smooth"
## Most previous function calls still work, such as:
## BarChart(), Histogram, and Plot().
## Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
##
## Interactive data analysis for constructing visualizations.
## Enter: interact()
# Vẽ histogram
Histogram(fnbmd,
data = df,
fill = "blue")
## lessR visualizations are now unified over just three core functions:
## - Chart() for pivot tables, such as bar charts. More info: ?Chart
## - X() for a single variable x, such as histograms. More info: ?X
## - XY() for scatterplots of two variables, x and y. More info: ?XY
##
## Histogram() is deprecated, though still working for now.
## Please use X(..., type = "histogram") going forward.
## [Interactive plot from the Plotly R package (Sievert, 2020)]
## >>> Suggestions
## bin_width: set the width of each bin
## bin_start: set the start of the first bin
## bin_end: set the end of the last bin
## X(fnbmd, type="density") # smoothed curve + histogram
## X(fnbmd, type="vbs") # Violin/Box/Scatterplot (VBS) plot
##
## --- fnbmd ---
##
## n miss mean sd min mdn max
## 2122 40 0.829 0.155 0.280 0.820 1.510
##
##
##
## --- Outliers --- from the box plot: 33
##
## Small Large
## ----- -----
## 0.3 1.5
## 0.3 1.5
## 0.4 1.4
## 0.4 1.4
## 0.4 1.4
## 0.4 1.4
## 0.4 1.4
## 0.4 1.4
## 0.4 1.3
## 0.4 1.3
## 0.4 1.3
## 1.3
## 1.3
## 1.3
## 1.3
## 1.2
## 1.2
## 1.2
##
## + 15 more outliers
##
##
## Bin Width: 0.1
## Number of Bins: 14
##
## Bin Midpnt Count Prop Cumul.c Cumul.p
## ---------------------------------------------------
## 0.2 > 0.3 0.25 1 0.00 1 0.00
## 0.3 > 0.4 0.35 9 0.00 10 0.00
## 0.4 > 0.5 0.45 15 0.01 25 0.01
## 0.5 > 0.6 0.55 103 0.05 128 0.06
## 0.6 > 0.7 0.65 306 0.14 434 0.20
## 0.7 > 0.8 0.75 522 0.25 956 0.45
## 0.8 > 0.9 0.85 534 0.25 1490 0.70
## 0.9 > 1.0 0.95 371 0.17 1861 0.88
## 1.0 > 1.1 1.05 183 0.09 2044 0.96
## 1.1 > 1.2 1.15 48 0.02 2092 0.99
## 1.2 > 1.3 1.25 21 0.01 2113 1.00
## 1.3 > 1.4 1.35 6 0.00 2119 1.00
## 1.4 > 1.5 1.45 2 0.00 2121 1.00
## 1.5 > 1.6 1.55 1 0.00 2122 1.00
##
library(table1)
## Warning: package 'table1' was built under R version 4.4.1
##
## Attaching package: 'table1'
## The following object is masked from 'package:lessR':
##
## label
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ fnbmd | sex, data = df)
| Female (N=1317) |
Male (N=845) |
Overall (N=2162) |
|
|---|---|---|---|
| fnbmd | |||
| Mean (SD) | 0.778 (0.132) | 0.910 (0.153) | 0.829 (0.155) |
| Median [Min, Max] | 0.770 [0.280, 1.31] | 0.900 [0.340, 1.51] | 0.820 [0.280, 1.51] |
| Missing | 17 (1.3%) | 23 (2.7%) | 40 (1.9%) |
library(lessR)
ttest(fnbmd ~ sex, data = df)
##
## Compare fnbmd across sex with levels Male and Female
## Grouping Variable: sex
## Response Variable: fnbmd
##
##
## ------ Describe ------
##
## fnbmd for sex Male: n.miss = 23, n = 822, mean = 0.910, sd = 0.153
## fnbmd for sex Female: n.miss = 17, n = 1300, mean = 0.778, sd = 0.132
##
## Mean Difference of fnbmd: 0.132
##
## Weighted Average Standard Deviation: 0.141
##
##
## ------ Assumptions ------
##
## Note: These hypothesis tests can perform poorly, and the
## t-test is typically robust to violations of assumptions.
## Use as heuristic guides instead of interpreting literally.
##
## Null hypothesis, for each group, is a normal distribution of fnbmd.
## Group Male: Sample mean assumed normal because n > 30, so no test needed.
## Group Female: Sample mean assumed normal because n > 30, so no test needed.
##
## Null hypothesis is equal variances of fnbmd, homogeneous.
## Variance Ratio test: F = 0.023/0.018 = 1.336, df = 821;1299, p-value = 0.000
## Levene's test, Brown-Forsythe: t = 3.449, df = 2120, p-value = 0.001
##
##
## ------ Infer ------
##
## --- Assume equal population variances of fnbmd for each sex
##
## t-cutoff for 95% range of variation: tcut = 1.961
## Standard Error of Mean Difference: SE = 0.006
##
## Hypothesis Test of 0 Mean Diff: t-value = 21.080, df = 2120, p-value = 0.000
##
## Margin of Error for 95% Confidence Level: 0.012
## 95% Confidence Interval for Mean Difference: 0.120 to 0.144
##
##
## --- Do not assume equal population variances of fnbmd for each sex
##
## t-cutoff: tcut = 1.961
## Standard Error of Mean Difference: SE = 0.006
##
## Hypothesis Test of 0 Mean Diff: t = 20.407, df = 1560.981, p-value = 0.000
##
## Margin of Error for 95% Confidence Level: 0.013
## 95% Confidence Interval for Mean Difference: 0.119 to 0.145
##
##
## ------ Effect Size ------
##
## --- Assume equal population variances of fnbmd for each sex
##
## Standardized Mean Difference of fnbmd, Cohen's d: 0.939
##
##
## ------ Practical Importance ------
##
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
##
##
## ------ Graphics Smoothing Parameter ------
##
## Density bandwidth for sex Male: 0.044
## Density bandwidth for sex Female: 0.034
### 2.3 Sử dụng chatGPT 2.3.1 Biểu đồ histogram
PROMPT: Dùng gói lệnh ‘lessR’ vẽ biểu đồ histogram mô tả phân bố của mật độ xương (fnbmd), tô màu xanh và đặt tựa trục x là ‘Mật độ xương ở cổ xương đùi (g/cm2)’, trục y là ‘Số người’, tựa là ‘Phân bố mật độ xương’
# Nhóm chứng (placebo)
placebo <- c(105, 119, 100, 97, 96, 101, 94, 95, 98)
# Nhóm café
cafe <- c(96, 99, 94, 89, 96, 93, 88, 105, 88)
ttest(placebo, cafe, data = df)
##
## Compare Y across X with levels Group1 and Group2
## Grouping Variable: X
## Response Variable: Y
##
##
## ------ Describe ------
##
## Y for X Group1: n.miss = 0, n = 9, mean = 100.556, sd = 7.699
## Y for X Group2: n.miss = 0, n = 9, mean = 94.222, sd = 5.608
##
## Mean Difference of Y: 6.333
##
## Weighted Average Standard Deviation: 6.735
##
##
## ------ Assumptions ------
##
## Note: These hypothesis tests can perform poorly, and the
## t-test is typically robust to violations of assumptions.
## Use as heuristic guides instead of interpreting literally.
##
## Null hypothesis, for each group, is a normal distribution of Y.
## Group Group1 Shapiro-Wilk normality test: W = 0.780, p-value = 0.012
## Group Group2 Shapiro-Wilk normality test: W = 0.924, p-value = 0.427
##
## Null hypothesis is equal variances of Y, homogeneous.
## Variance Ratio test: F = 59.278/31.444 = 1.885, df = 8;8, p-value = 0.389
## Levene's test, Brown-Forsythe: t = 0.230, df = 16, p-value = 0.821
##
##
## ------ Infer ------
##
## --- Assume equal population variances of Y for each X
##
## t-cutoff for 95% range of variation: tcut = 2.120
## Standard Error of Mean Difference: SE = 3.175
##
## Hypothesis Test of 0 Mean Diff: t-value = 1.995, df = 16, p-value = 0.063
##
## Margin of Error for 95% Confidence Level: 6.731
## 95% Confidence Interval for Mean Difference: -0.397 to 13.064
##
##
## --- Do not assume equal population variances of Y for each X
##
## t-cutoff: tcut = 2.136
## Standard Error of Mean Difference: SE = 3.175
##
## Hypothesis Test of 0 Mean Diff: t = 1.995, df = 14.624, p-value = 0.065
##
## Margin of Error for 95% Confidence Level: 6.782
## 95% Confidence Interval for Mean Difference: -0.449 to 13.116
##
##
## ------ Effect Size ------
##
## --- Assume equal population variances of Y for each X
##
## Standardized Mean Difference of Y, Cohen's d: 0.940
##
##
## ------ Practical Importance ------
##
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
##
##
## ------ Graphics Smoothing Parameter ------
##
## Density bandwidth for X Group1: 5.642
## Density bandwidth for X Group2: 4.112
placebo <- c(105, 119, 100, 97, 96, 101, 94, 95, 98)
coffee <- c(96, 99, 94, 89, 96, 93, 88, 105, 88)
ttest (placebo, cafe, data = df)
##
## Compare Y across X with levels Group1 and Group2
## Grouping Variable: X
## Response Variable: Y
##
##
## ------ Describe ------
##
## Y for X Group1: n.miss = 0, n = 9, mean = 100.556, sd = 7.699
## Y for X Group2: n.miss = 0, n = 9, mean = 94.222, sd = 5.608
##
## Mean Difference of Y: 6.333
##
## Weighted Average Standard Deviation: 6.735
##
##
## ------ Assumptions ------
##
## Note: These hypothesis tests can perform poorly, and the
## t-test is typically robust to violations of assumptions.
## Use as heuristic guides instead of interpreting literally.
##
## Null hypothesis, for each group, is a normal distribution of Y.
## Group Group1 Shapiro-Wilk normality test: W = 0.780, p-value = 0.012
## Group Group2 Shapiro-Wilk normality test: W = 0.924, p-value = 0.427
##
## Null hypothesis is equal variances of Y, homogeneous.
## Variance Ratio test: F = 59.278/31.444 = 1.885, df = 8;8, p-value = 0.389
## Levene's test, Brown-Forsythe: t = 0.230, df = 16, p-value = 0.821
##
##
## ------ Infer ------
##
## --- Assume equal population variances of Y for each X
##
## t-cutoff for 95% range of variation: tcut = 2.120
## Standard Error of Mean Difference: SE = 3.175
##
## Hypothesis Test of 0 Mean Diff: t-value = 1.995, df = 16, p-value = 0.063
##
## Margin of Error for 95% Confidence Level: 6.731
## 95% Confidence Interval for Mean Difference: -0.397 to 13.064
##
##
## --- Do not assume equal population variances of Y for each X
##
## t-cutoff: tcut = 2.136
## Standard Error of Mean Difference: SE = 3.175
##
## Hypothesis Test of 0 Mean Diff: t = 1.995, df = 14.624, p-value = 0.065
##
## Margin of Error for 95% Confidence Level: 6.782
## 95% Confidence Interval for Mean Difference: -0.449 to 13.116
##
##
## ------ Effect Size ------
##
## --- Assume equal population variances of Y for each X
##
## Standardized Mean Difference of Y, Cohen's d: 0.940
##
##
## ------ Practical Importance ------
##
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
##
##
## ------ Graphics Smoothing Parameter ------
##
## Density bandwidth for X Group1: 5.642
## Density bandwidth for X Group2: 4.112
library(simpleboot)
## Simple Bootstrap Routines (1.1-8)
library(boot)
## Warning: package 'boot' was built under R version 4.4.1
set.seed(123) # để kết quả tái lập được
boot_result <- two.boot (placebo, cafe, FUN = mean, R = 10000)
boot.ci(boot_result, type = c("perc", "bca", "norm", "basic"))
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_result, type = c("perc", "bca", "norm",
## "basic"))
##
## Intervals :
## Level Normal Basic
## 95% ( 0.383, 12.188 ) ( 0.111, 11.778 )
##
## Level Percentile BCa
## 95% ( 0.889, 12.556 ) ( 0.778, 12.444 )
## Calculations and Intervals on Original Scale
BCa không chứa 0 với CI 95% nên kết quả là có sự khác biệt trong chuyển hoá RER giữa nhóm dùng cafe và nhóm không dùng cafe là có ý nghĩa thống kê
PROMPT 1: Tôi có dữ liệu của 18 người trong 2 nhóm như sau: placebo = 105, 119, 100, 97, 96, 101, 94, 95, 98 và coffee = 96, 99, 94, 89, 96, 93, 88, 105, 88. Bạn giúp viết lệnh R từ gói lệnh ‘simpleboot’ và ‘boot’ để phân tích sự khác biệt giữa 2 nhóm bằng phương pháp bootstrap và trình bày nhiều giá trị 95% CI (percentile, BCA, normal, basic) không?
Trả lời câu hỏi: Có mối tương quan giữa tỷ lệ gãy xương và sex không Vì là biến định tính nên phải dùng Chisquare
library(table1)
table1(~ fx + as.factor(fx)| sex, data = df)
| Female (N=1317) |
Male (N=845) |
Overall (N=2162) |
|
|---|---|---|---|
| fx | |||
| Mean (SD) | 0.304 (0.460) | 0.170 (0.376) | 0.252 (0.434) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| as.factor(fx) | |||
| 0 | 916 (69.6%) | 701 (83.0%) | 1617 (74.8%) |
| 1 | 401 (30.4%) | 144 (17.0%) | 545 (25.2%) |
chisq.test(df$fx, df$sex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df$fx and df$sex
## X-squared = 48.363, df = 1, p-value = 3.542e-12
p-value = 0.000000000003542, p rất nhỏ nên có giá trị thống kê hay nói cách khác sự khác biệt về tỷ lệ gãy xương của nam và nữ là có giá trị thống kê