R stats tutorial
Contact:
Facebook: R stats
Email: research@raisinghopevn.com
Phone: 0965276046
Required libraries
pacman::p_load(tidyverse, readxl)
Chi square & Fisher’s exact test
Data
data <- read_excel("chi_square_independence_sheet3.xlsx",
sheet = "xàm")
data = data %>% janitor::clean_names()
data$nghe[data$nghe == '8.0'] = 'Bác sỹ'
data$nghe[data$nghe == 'Bác sĩ'] = 'Bác sỹ'
data
## # A tibble: 80 × 2
## gioi_tinh nghe
## <chr> <chr>
## 1 Nữ Bác sỹ
## 2 Nữ Bác sỹ
## 3 Nữ Bác sỹ
## 4 Nữ Bác sỹ
## 5 Nữ Bác sỹ
## 6 Nữ Bác sỹ
## 7 Nữ Bác sỹ
## 8 Nữ Bác sỹ
## 9 Nữ Giáo viên
## 10 Nữ Giáo viên
## # … with 70 more rows
Chi square
chisq = chisq.test(table(data))
chisq
##
## Pearson's Chi-squared test
##
## data: table(data)
## X-squared = 12.578, df = 2, p-value = 0.001856
Expected frequencies
chisq$expected
## nghe
## gioi_tinh Bác sỹ Giáo viên Kỹ sư
## Nam 10.2 14.875 8.925
## Nữ 13.8 20.125 12.075
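The chi-square approximation is reliable when expected cell counts are reasonably large (a common rule of thumb is at least 5 per cell). A quick check, sketched below as an addition to the original code, tells you whether Fisher’s exact test in the next section is the safer choice.
# Flag any expected cell count below 5; if TRUE, prefer Fisher's exact test
any(chisq$expected < 5)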
Fisher’s exact test
fisher.test(table(data))
##
## Fisher's Exact Test for Count Data
##
## data: table(data)
## p-value = 0.001824
## alternative hypothesis: two.sided
ANOVA
Data
library(readr)
data <- read_csv("OneWayANOVAExcel.csv")## Rows: 10 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): nhom_1, nhom_2, nhom_3, nhom_4
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 10 × 4
## nhom_1 nhom_2 nhom_3 nhom_4
## <dbl> <dbl> <dbl> <dbl>
## 1 11.7 10.6 10.3 6.90
## 2 12.0 13.5 12.2 8.99
## 3 8.04 7.42 10.6 6.97
## 4 10.6 12.0 9.66 9.16
## 5 14.1 7.78 8.79 8.68
## 6 10.8 10.7 10.9 11.4
## 7 7.86 10.7 10.4 10.8
## 8 11.9 4.48 10.2 5.67
## 9 11.9 6.80 11.6 10.8
## 10 13.2 5.37 12.3 9.01
Reshape data to long-format
data = data %>% pivot_longer(names_to = 'nhom',
values_to = 'gia_tri',
cols = starts_with('nhom'))
data
## # A tibble: 40 × 2
## nhom gia_tri
## <chr> <dbl>
## 1 nhom_1 11.7
## 2 nhom_2 10.6
## 3 nhom_3 10.3
## 4 nhom_4 6.90
## 5 nhom_1 12.0
## 6 nhom_2 13.5
## 7 nhom_3 12.2
## 8 nhom_4 8.99
## 9 nhom_1 8.04
## 10 nhom_2 7.42
## # … with 30 more rows
One-way ANOVA
ket_qua = aov(gia_tri~nhom, data = data)
summary(ket_qua)
## Df Sum Sq Mean Sq F value Pr(>F)
## nhom 3 43.62 14.540 3.303 0.0311 *
## Residuals 36 158.47 4.402
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
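Because the one-way ANOVA is significant (p ≈ 0.031), a post-hoc comparison shows which pairs of groups differ. The original tutorial does not include one here; a minimal sketch using base R's TukeyHSD() on the fitted aov object:
# Tukey HSD post-hoc: pairwise group comparisons with family-wise error control
TukeyHSD(ket_qua)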
One-way ANOVA on ranks (Kruskal-Wallis H test)
Data
library(readr)
data <- read_csv("anova.csv", show_col_types = FALSE)
data
## # A tibble: 10 × 4
## nhom_1 nhom_2 nhom_3 nhom_4
## <dbl> <dbl> <dbl> <dbl>
## 1 11.7 10.6 10.3 6.90
## 2 12.0 13.5 12.2 8.99
## 3 8.04 7.42 10.6 6.97
## 4 10.6 12.0 9.66 9.16
## 5 14.1 7.78 8.79 8.68
## 6 10.8 10.7 10.9 11.4
## 7 7.86 10.7 10.4 10.8
## 8 11.9 4.48 10.2 5.67
## 9 11.9 6.80 11.6 10.8
## 10 13.2 5.37 12.3 9.01
Reshape data to long-format
data = data %>% pivot_longer(names_to = 'nhom',
values_to = 'gia_tri',
cols = starts_with('nhom'))
data
## # A tibble: 40 × 2
## nhom gia_tri
## <chr> <dbl>
## 1 nhom_1 11.7
## 2 nhom_2 10.6
## 3 nhom_3 10.3
## 4 nhom_4 6.90
## 5 nhom_1 12.0
## 6 nhom_2 13.5
## 7 nhom_3 12.2
## 8 nhom_4 8.99
## 9 nhom_1 8.04
## 10 nhom_2 7.42
## # … with 30 more rows
Test
kruskal.test(gia_tri ~ nhom, data = data)
##
## Kruskal-Wallis rank sum test
##
## data: gia_tri by nhom
## Kruskal-Wallis chi-squared = 7.5468, df = 3, p-value = 0.05637
Post hoc
pairwise.wilcox.test(data$gia_tri, data$nhom,
p.adjust.method = "BH")
##
## Pairwise comparisons using Wilcoxon rank sum exact test
##
## data: data$gia_tri and data$nhom
##
## nhom_1 nhom_2 nhom_3
## nhom_2 0.15 - -
## nhom_3 0.42 0.42 -
## nhom_4 0.11 0.91 0.11
##
## P value adjustment method: BH
Paired t-test
Data
library(readr)
data <- read_csv("paired_t_test.csv")## Rows: 63 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): time_1, time_2
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 63 × 2
## time_1 time_2
## <dbl> <dbl>
## 1 55.2 63.7
## 2 43.0 35.9
## 3 49.6 43.8
## 4 43.0 34.8
## 5 46.9 43.1
## 6 49.7 44.3
## 7 43.0 34.8
## 8 51.0 43.4
## 9 49.0 56.2
## 10 46.5 43.8
## # … with 53 more rows
Test
t.test(data$time_1, data$time_2,
paired = T, var.equal = T)
##
## Paired t-test
##
## data: data$time_1 and data$time_2
## t = -2.9615, df = 62, p-value = 0.004335
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -3.6850713 -0.7150388
## sample estimates:
## mean difference
## -2.200055
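The paired t-test assumes the pairwise differences are approximately normal. A quick check (an addition, not in the original) is to run Shapiro-Wilk on the differences; if it fails, the Wilcoxon signed-rank test in the next section is a common fallback.
# Normality check on the paired differences
shapiro.test(data$time_1 - data$time_2)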
Wilcoxon signed-rank test
Data
data <- read_csv("paired_t_test.csv")## Rows: 63 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): time_1, time_2
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data## # A tibble: 63 × 2
## time_1 time_2
## <dbl> <dbl>
## 1 55.2 63.7
## 2 43.0 35.9
## 3 49.6 43.8
## 4 43.0 34.8
## 5 46.9 43.1
## 6 49.7 44.3
## 7 43.0 34.8
## 8 51.0 43.4
## 9 49.0 56.2
## 10 46.5 43.8
## # … with 53 more rows
Test
wilcox.test(data$time_1, data$time_2)
##
## Wilcoxon rank sum test with continuity correction
##
## data: data$time_1 and data$time_2
## W = 1633, p-value = 0.08679
## alternative hypothesis: true location shift is not equal to 0
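Note that the call above omits paired = TRUE, so R runs the unpaired rank sum test, as the output header shows. For a Wilcoxon signed-rank test on these paired measurements, the call would presumably be:
# Wilcoxon signed-rank test on the paired observations
wilcox.test(data$time_1, data$time_2, paired = TRUE)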
McNemar
Data
library(readr)
data <- read_csv("mcnemar.csv")## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): sample_population, sample_virus
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 100 × 2
## sample_population sample_virus
## <dbl> <dbl>
## 1 0 1
## 2 1 1
## 3 0 0
## 4 1 1
## 5 1 1
## 6 0 1
## 7 0 0
## 8 1 1
## 9 0 0
## 10 0 0
## # … with 90 more rows
Test
mcnemar.test(table(data))
##
## McNemar's Chi-squared test with continuity correction
##
## data: table(data)
## McNemar's chi-squared = 0.02381, df = 1, p-value = 0.8774
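McNemar's test is driven by the discordant pairs, i.e. the off-diagonal cells where the two samples disagree. Printing the paired 2×2 table (a small addition to the original) makes this visible:
# The off-diagonal (discordant) cells are what McNemar's test compares
table(data)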
One-way repeated measures ANOVA
Data
data("selfesteem", package = "datarium")
head(selfesteem, 3)
## # A tibble: 3 × 4
## id t1 t2 t3
## <int> <dbl> <dbl> <dbl>
## 1 1 4.01 5.18 7.11
## 2 2 2.56 6.91 6.31
## 3 3 3.24 4.44 9.78
Reshape to long-format
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
data <- selfesteem %>%
gather(key = "time", value = "score", t1, t2, t3) %>%
convert_as_factor(id, time)
head(data, 3)
## # A tibble: 3 × 3
## id time score
## <fct> <fct> <dbl>
## 1 1 t1 4.01
## 2 2 t1 2.56
## 3 3 t1 3.24
Test
ket_qua <- anova_test(data, dv = score, wid = id, within = time)
ket_qua
## ANOVA Table (type III tests)
##
## $ANOVA
## Effect DFn DFd F p p<.05 ges
## 1 time 2 18 55.469 2.01e-08 * 0.829
##
## $`Mauchly's Test for Sphericity`
## Effect W p p<.05
## 1 time 0.551 0.092
##
## $`Sphericity Corrections`
## Effect GGe DF[GG] p[GG] p[GG]<.05 HFe DF[HF] p[HF]
## 1 time 0.69 1.38, 12.42 2.16e-06 * 0.774 1.55, 13.94 6.03e-07
## p[HF]<.05
## 1 *
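With a significant effect of time, pairwise paired comparisons can follow. A sketch using rstatix's pairwise_t_test with Bonferroni adjustment (an assumed follow-up, not shown in the original):
# Pairwise paired t-tests between time points, Bonferroni-adjusted
data %>%
  pairwise_t_test(score ~ time, paired = TRUE, p.adjust.method = "bonferroni")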
Friedman test
Data
library(readr)
data <- read_csv("friedman.csv", show_col_types = FALSE)
data
## # A tibble: 20 × 3
## plant_var locations disease
## <chr> <chr> <dbl>
## 1 P1 L1 4
## 2 P2 L1 3
## 3 P3 L1 5
## 4 P4 L1 5
## 5 P5 L1 4
## 6 P1 L2 2
## 7 P2 L2 1
## 8 P3 L2 2
## 9 P4 L2 1
## 10 P5 L2 1
## 11 P1 L3 5
## 12 P2 L3 4
## 13 P3 L3 3
## 14 P4 L3 4
## 15 P5 L3 4
## 16 P1 L4 4
## 17 P2 L4 3
## 18 P3 L4 4
## 19 P4 L4 4
## 20 P5 L4 5
Test
rstatix::friedman_test(data, disease~locations|plant_var)
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 disease 5 9.85 3 0.0199 Friedman test
Pairwise comparison
pwc <- data %>%
wilcox_test(disease ~ locations, paired = TRUE, p.adjust.method = "bonferroni")
pwc
## # A tibble: 6 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 disease L1 L2 5 5 15 0.057 0.341 ns
## 2 disease L1 L3 5 5 6 0.85 1 ns
## 3 disease L1 L4 5 5 4 0.773 1 ns
## 4 disease L2 L3 5 5 0 0.048 0.286 ns
## 5 disease L2 L4 5 5 0 0.054 0.327 ns
## 6 disease L3 L4 5 5 5 1 1 ns
Factorial ANOVA
Data
library(readr)
data <- read_csv("factorial_anova.csv", show_col_types = FALSE)
data = data %>% janitor::clean_names()
data$temp = factor(data$temp)
data$glass = factor(data$glass)
data
## # A tibble: 27 × 3
## glass temp light
## <fct> <fct> <dbl>
## 1 A 100 580
## 2 A 100 568
## 3 A 100 570
## 4 B 100 550
## 5 B 100 530
## 6 B 100 579
## 7 C 100 546
## 8 C 100 575
## 9 C 100 599
## 10 A 125 1090
## # … with 17 more rows
Test
ket_qua <- aov(light ~ temp + glass, data = data)
summary(ket_qua)
## Df Sum Sq Mean Sq F value Pr(>F)
## temp 2 1970335 985167 72.943 1.96e-10 ***
## glass 2 150865 75432 5.585 0.0109 *
## Residuals 22 297131 13506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
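The model above includes only the main effects. A full factorial ANOVA typically also tests the temp × glass interaction; a minimal sketch under that assumption:
# Factorial model with the interaction term
ket_qua_int <- aov(light ~ temp * glass, data = data)
summary(ket_qua_int)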
One-way MANOVA
Data
library(readr)
data <- read_csv("manova.csv", show_col_types = FALSE)
data
## # A tibble: 40 × 3
## plant_var height canopy_vol
## <chr> <dbl> <dbl>
## 1 A 20 0.7
## 2 A 22 0.8
## 3 A 24 0.95
## 4 A 18 0.6
## 5 A 20 0.74
## 6 A 20 0.76
## 7 A 16 0.84
## 8 A 17 0.66
## 9 A 18 0.99
## 10 A 14 0.8
## # … with 30 more rows
Test
ket_qua = manova(cbind(height, canopy_vol)~plant_var,
data = data)
summary(ket_qua)
## Df Pillai approx F num Df den Df Pr(>F)
## plant_var 3 1.0365 12.909 6 72 7.575e-10 ***
## Residuals 36
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
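A common follow-up to a significant MANOVA is a univariate ANOVA for each response variable, which summary.aov() provides directly from the manova fit (a sketch, in addition to the discriminant analysis shown next):
# Univariate ANOVAs for height and canopy_vol separately
summary.aov(ket_qua)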
Post hoc test
iris_lda <- MASS::lda(plant_var ~ cbind(height, canopy_vol), CV = F, data = data)
iris_lda
## Call:
## lda(plant_var ~ cbind(height, canopy_vol), data = data, CV = F)
##
## Prior probabilities of groups:
## A B C D
## 0.25 0.25 0.25 0.25
##
## Group means:
## cbind(height, canopy_vol)height cbind(height, canopy_vol)canopy_vol
## A 18.90 0.784
## B 16.54 0.608
## C 3.05 0.272
## D 9.35 0.474
##
## Coefficients of linear discriminants:
## LD1 LD2
## cbind(height, canopy_vol)height -0.4388374 -0.2751091
## cbind(height, canopy_vol)canopy_vol -1.3949158 9.3256280
##
## Proportion of trace:
## LD1 LD2
## 0.9855 0.0145
Survival analysis
Required libraries
pacman::p_load(survival, ggplot2, survminer)
Data
df = survival::veteran
head(df)
## trt celltype time status karno diagtime age prior
## 1 1 squamous 72 1 60 7 69 0
## 2 1 squamous 411 1 70 5 64 10
## 3 1 squamous 228 1 60 3 38 0
## 4 1 squamous 126 1 60 9 63 10
## 5 1 squamous 118 1 70 11 65 10
## 6 1 squamous 10 1 20 5 49 0
Build standard survival object with Surv()
surv_object = with(df, Surv(time, status))
surv_object
## [1] 72 411 228 126 118 10 82 110 314 100+ 42 8 144 25+ 11
## [16] 30 384 4 54 13 123+ 97+ 153 59 117 16 151 22 56 21
## [31] 18 139 20 31 52 287 18 51 122 27 54 7 63 392 10
## [46] 8 92 35 117 132 12 162 3 95 177 162 216 553 278 12
## [61] 260 200 156 182+ 143 105 103 250 100 999 112 87+ 231+ 242 991
## [76] 111 1 587 389 33 25 357 467 201 1 30 44 283 15 25
## [91] 103+ 21 13 87 2 20 7 24 99 8 99 61 25 95 80
## [106] 51 29 24 18 83+ 31 51 90 52 73 8 36 48 7 140
## [121] 186 84 19 45 80 52 164 19 53 15 43 340 133 111 231
## [136] 378 49
Begin analysis
Function: survfit(formula, data). Our formula is Surv(time, status) ~ 1, where 1 means all observations are treated as a single group (no grouping variable).
surv_fit = survfit(Surv(time, status)~1, data = df)
summary(surv_fit, time = c(1, 300, 600, 900))
## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 137 2 0.985 0.0102 0.96552 1.0000
## 300 13 113 0.117 0.0295 0.07147 0.1917
## 600 2 11 0.018 0.0126 0.00459 0.0707
## 900 2 0 0.018 0.0126 0.00459 0.0707
Plot
ggsurvplot(surv_fit)
Calculate median
Median
surv_fit
## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
##
## n events median 0.95LCL 0.95UCL
## [1,] 137 128 80 52 105
Plot median
The surv.median.line argument accepts 'h', 'v', or 'hv'.
ggsurvplot(surv_fit, surv.median.line = "hv",
ggtheme = theme_grey())
Compare survival curves by treatment group (trt)
Replace 1 with trt in the previous formula. Also show the log-rank test p-value.
surv_fit = survfit(Surv(time, status)~trt, data = df)
ggsurvplot(surv_fit, pval = T, pval.method = T,
risk.table = T,
ggtheme = theme_grey())
Cox model (Cox proportional hazards model)
cox = coxph(Surv(time, status) ~ trt + celltype +
karno + diagtime +
age + prior ,
data = df)
summary(cox)
## Call:
## coxph(formula = Surv(time, status) ~ trt + celltype + karno +
## diagtime + age + prior, data = df)
##
## n= 137, number of events= 128
##
## coef exp(coef) se(coef) z Pr(>|z|)
## trt 2.946e-01 1.343e+00 2.075e-01 1.419 0.15577
## celltypesmallcell 8.616e-01 2.367e+00 2.753e-01 3.130 0.00175 **
## celltypeadeno 1.196e+00 3.307e+00 3.009e-01 3.975 7.05e-05 ***
## celltypelarge 4.013e-01 1.494e+00 2.827e-01 1.420 0.15574
## karno -3.282e-02 9.677e-01 5.508e-03 -5.958 2.55e-09 ***
## diagtime 8.132e-05 1.000e+00 9.136e-03 0.009 0.99290
## age -8.706e-03 9.913e-01 9.300e-03 -0.936 0.34920
## prior 7.159e-03 1.007e+00 2.323e-02 0.308 0.75794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## trt 1.3426 0.7448 0.8939 2.0166
## celltypesmallcell 2.3669 0.4225 1.3799 4.0597
## celltypeadeno 3.3071 0.3024 1.8336 5.9647
## celltypelarge 1.4938 0.6695 0.8583 2.5996
## karno 0.9677 1.0334 0.9573 0.9782
## diagtime 1.0001 0.9999 0.9823 1.0182
## age 0.9913 1.0087 0.9734 1.0096
## prior 1.0072 0.9929 0.9624 1.0541
##
## Concordance= 0.736 (se = 0.021 )
## Likelihood ratio test= 62.1 on 8 df, p=2e-10
## Wald test = 62.37 on 8 df, p=2e-10
## Score (logrank) test = 66.74 on 8 df, p=2e-11
Note: although the Cox model is quite powerful, keep in mind that its assumptions must be checked before it is applied.
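One key assumption is proportional hazards. The survival package's cox.zph() tests it for each covariate, and survminer can plot the scaled Schoenfeld residuals; a minimal check (not part of the original tutorial):
# Test the proportional hazards assumption for each covariate and globally
ph_check <- cox.zph(cox)
ph_check
# ggcoxzph(ph_check)  # survminer plot of scaled Schoenfeld residuals over time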
Exploring normality
Required libraries
pacman::p_load(ggpubr)
Density plot
The density plot provides a visual judgment about whether the distribution is bell shaped.
my_data = read_csv("anova.csv")
## Rows: 10 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): nhom_1, nhom_2, nhom_3, nhom_4
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
p1 = ggdensity(my_data$nhom_1,
               main = "Density plot of nhom_1",
               xlab = "nhom_1")
p2 = ggdensity(my_data$nhom_2, ylab = "")
p3 = ggdensity(my_data$nhom_3, ylab = "")
ggarrange(p1, p2, p3)
Q-Q plot
Q-Q plot (or quantile-quantile plot) draws the correlation between a given sample and the normal distribution. A 45-degree reference line is also plotted.
p1 = ggqqplot(my_data$nhom_1)
p2 = ggqqplot(my_data$nhom_2)
p3 = ggqqplot(my_data$nhom_3)
ggarrange(p1, p2, p3)
Normality tests
Shapiro-Wilk
shapiro.test(my_data$nhom_1)
##
## Shapiro-Wilk normality test
##
## data: my_data$nhom_1
## W = 0.91111, p-value = 0.2887
shapiro.test(my_data$nhom_2)
##
## Shapiro-Wilk normality test
##
## data: my_data$nhom_2
## W = 0.94807, p-value = 0.6458
shapiro.test(my_data$nhom_3)
##
## Shapiro-Wilk normality test
##
## data: my_data$nhom_3
## W = 0.95563, p-value = 0.7351
Anderson-Darling
Lilliefors
Kolmogorov-Smirnov
The Shapiro-Wilk test is the most powerful of these (see reference). All four tests above are affected by small sample sizes (30 or fewer).
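The Anderson-Darling, Lilliefors, and Kolmogorov-Smirnov tests are listed above without code. Sketches are shown below, assuming the nortest package is installed for the first two; the Kolmogorov-Smirnov test uses base R's ks.test.
# Anderson-Darling and Lilliefors tests (nortest package, assumed installed)
nortest::ad.test(my_data$nhom_1)
nortest::lillie.test(my_data$nhom_1)
# Kolmogorov-Smirnov test against a normal with the sample mean and sd
ks.test(my_data$nhom_1, "pnorm",
        mean = mean(my_data$nhom_1), sd = sd(my_data$nhom_1))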
Skewness and Kurtosis
Skewness is usually described as a measure of a dataset’s symmetry.
Kurtosis is the degree of peakedness of a distribution. Values between -2 and +2 are generally accepted as consistent with a normal distribution.
psych::describe(my_data$nhom_1)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10 11.2 1.99 11.8 11.26 1.68 7.86 14.08 6.22 -0.46 -1.05 0.63