R stats tutorial


Contact:


Required libraries

pacman::p_load(tidyverse, readxl)

Chi square & Fisher’s exact test

Data

data <- read_excel("chi_square_independence_sheet3.xlsx", 
                   sheet = "xàm")
data = data %>% janitor::clean_names()
# Recode inconsistent entries in the occupation column (nghe) to a single label
data$nghe[data$nghe == '8.0'] = 'Bác sỹ'
data$nghe[data$nghe == 'Bác sĩ'] = 'Bác sỹ'
data
## # A tibble: 80 × 2
##    gioi_tinh nghe     
##    <chr>     <chr>    
##  1 Nữ        Bác sỹ   
##  2 Nữ        Bác sỹ   
##  3 Nữ        Bác sỹ   
##  4 Nữ        Bác sỹ   
##  5 Nữ        Bác sỹ   
##  6 Nữ        Bác sỹ   
##  7 Nữ        Bác sỹ   
##  8 Nữ        Bác sỹ   
##  9 Nữ        Giáo viên
## 10 Nữ        Giáo viên
## # … with 70 more rows

Chi square

chisq = chisq.test(table(data))
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  table(data)
## X-squared = 12.578, df = 2, p-value = 0.001856

Expected frequencies

chisq$expected
##          nghe
## gioi_tinh Bác sỹ Giáo viên  Kỹ sư
##       Nam   10.2    14.875  8.925
##       Nữ    13.8    20.125 12.075
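
A common rule of thumb is that the chi-square approximation is reliable only when every expected count is at least 5; otherwise Fisher's exact test (below) is preferred. A quick check on the chisq object above (a minimal sketch; output not shown):

# TRUE if any cell has an expected count below 5
any(chisq$expected < 5)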

Fisher’s exact test

fisher.test(table(data))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(data)
## p-value = 0.001824
## alternative hypothesis: two.sided

ANOVA

Data

library(readr)
data <- read_csv("OneWayANOVAExcel.csv")
## Rows: 10 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): nhom_1, nhom_2, nhom_3, nhom_4
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 10 × 4
##    nhom_1 nhom_2 nhom_3 nhom_4
##     <dbl>  <dbl>  <dbl>  <dbl>
##  1  11.7   10.6   10.3    6.90
##  2  12.0   13.5   12.2    8.99
##  3   8.04   7.42  10.6    6.97
##  4  10.6   12.0    9.66   9.16
##  5  14.1    7.78   8.79   8.68
##  6  10.8   10.7   10.9   11.4 
##  7   7.86  10.7   10.4   10.8 
##  8  11.9    4.48  10.2    5.67
##  9  11.9    6.80  11.6   10.8 
## 10  13.2    5.37  12.3    9.01

Reshape data to long-format

data = data %>% pivot_longer(names_to = 'nhom',
                      values_to = 'gia_tri',
                      cols = starts_with('nhom'))
data
## # A tibble: 40 × 2
##    nhom   gia_tri
##    <chr>    <dbl>
##  1 nhom_1   11.7 
##  2 nhom_2   10.6 
##  3 nhom_3   10.3 
##  4 nhom_4    6.90
##  5 nhom_1   12.0 
##  6 nhom_2   13.5 
##  7 nhom_3   12.2 
##  8 nhom_4    8.99
##  9 nhom_1    8.04
## 10 nhom_2    7.42
## # … with 30 more rows

One-way ANOVA

ket_qua = aov(gia_tri~nhom, data = data) 
summary(ket_qua)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## nhom         3  43.62  14.540   3.303 0.0311 *
## Residuals   36 158.47   4.402                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
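
Since the overall F test is significant, a typical follow-up is Tukey's HSD on the fitted aov object (a sketch; output not shown):

# Pairwise comparisons of group means with family-wise error control
TukeyHSD(ket_qua)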

One-way ANOVA on ranks (Kruskal-Wallis H test)

Data

library(readr)
data <- read_csv("anova.csv", show_col_types = FALSE)
data
## # A tibble: 10 × 4
##    nhom_1 nhom_2 nhom_3 nhom_4
##     <dbl>  <dbl>  <dbl>  <dbl>
##  1  11.7   10.6   10.3    6.90
##  2  12.0   13.5   12.2    8.99
##  3   8.04   7.42  10.6    6.97
##  4  10.6   12.0    9.66   9.16
##  5  14.1    7.78   8.79   8.68
##  6  10.8   10.7   10.9   11.4 
##  7   7.86  10.7   10.4   10.8 
##  8  11.9    4.48  10.2    5.67
##  9  11.9    6.80  11.6   10.8 
## 10  13.2    5.37  12.3    9.01

Reshape data to long-format

data = data %>% pivot_longer(names_to = 'nhom',
                      values_to = 'gia_tri',
                      cols = starts_with('nhom'))
data
## # A tibble: 40 × 2
##    nhom   gia_tri
##    <chr>    <dbl>
##  1 nhom_1   11.7 
##  2 nhom_2   10.6 
##  3 nhom_3   10.3 
##  4 nhom_4    6.90
##  5 nhom_1   12.0 
##  6 nhom_2   13.5 
##  7 nhom_3   12.2 
##  8 nhom_4    8.99
##  9 nhom_1    8.04
## 10 nhom_2    7.42
## # … with 30 more rows

Test

kruskal.test(gia_tri ~ nhom, data = data)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  gia_tri by nhom
## Kruskal-Wallis chi-squared = 7.5468, df = 3, p-value = 0.05637

Post hoc

pairwise.wilcox.test(data$gia_tri, data$nhom,
                 p.adjust.method = "BH")
## 
##  Pairwise comparisons using Wilcoxon rank sum exact test 
## 
## data:  data$gia_tri and data$nhom 
## 
##        nhom_1 nhom_2 nhom_3
## nhom_2 0.15   -      -     
## nhom_3 0.42   0.42   -     
## nhom_4 0.11   0.91   0.11  
## 
## P value adjustment method: BH

Paired T-test

Data

library(readr)
data <- read_csv("paired_t_test.csv")
## Rows: 63 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): time_1, time_2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 63 × 2
##    time_1 time_2
##     <dbl>  <dbl>
##  1   55.2   63.7
##  2   43.0   35.9
##  3   49.6   43.8
##  4   43.0   34.8
##  5   46.9   43.1
##  6   49.7   44.3
##  7   43.0   34.8
##  8   51.0   43.4
##  9   49.0   56.2
## 10   46.5   43.8
## # … with 53 more rows

Test

t.test(data$time_1, data$time_2, paired = TRUE)
## 
##  Paired t-test
## 
## data:  data$time_1 and data$time_2
## t = -2.9615, df = 62, p-value = 0.004335
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -3.6850713 -0.7150388
## sample estimates:
## mean difference 
##       -2.200055
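
The paired t-test assumes that the within-pair differences are approximately normal. A quick check on the differences (a sketch; output not shown):

# Shapiro-Wilk test on the paired differences
shapiro.test(data$time_1 - data$time_2)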

Wilcoxon signed-rank test

Data

data <- read_csv("paired_t_test.csv")
## Rows: 63 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): time_1, time_2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 63 × 2
##    time_1 time_2
##     <dbl>  <dbl>
##  1   55.2   63.7
##  2   43.0   35.9
##  3   49.6   43.8
##  4   43.0   34.8
##  5   46.9   43.1
##  6   49.7   44.3
##  7   43.0   34.8
##  8   51.0   43.4
##  9   49.0   56.2
## 10   46.5   43.8
## # … with 53 more rows

Test

wilcox.test(data$time_1, data$time_2)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data$time_1 and data$time_2
## W = 1633, p-value = 0.08679
## alternative hypothesis: true location shift is not equal to 0
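
Note that, called as above, wilcox.test() defaults to the unpaired rank-sum (Mann-Whitney) test, which is what the output reports. To run the Wilcoxon signed-rank test on these paired measurements, add paired = TRUE (a sketch; output not shown):

# Wilcoxon signed-rank test on paired observations
wilcox.test(data$time_1, data$time_2, paired = TRUE)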

McNemar's test

Data

library(readr)
data <- read_csv("mcnemar.csv")
## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): sample_population, sample_virus
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 100 × 2
##    sample_population sample_virus
##                <dbl>        <dbl>
##  1                 0            1
##  2                 1            1
##  3                 0            0
##  4                 1            1
##  5                 1            1
##  6                 0            1
##  7                 0            0
##  8                 1            1
##  9                 0            0
## 10                 0            0
## # … with 90 more rows

Test

mcnemar.test(table(data))
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  table(data)
## McNemar's chi-squared = 0.02381, df = 1, p-value = 0.8774
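
McNemar's test is driven by the discordant (off-diagonal) cells of the paired 2x2 table, so it helps to inspect that table alongside the result (a sketch; output not shown):

# Paired 2x2 contingency table; the off-diagonal cells are the discordant pairs
table(data)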

One-way repeated measures ANOVA

Data

data("selfesteem", package = "datarium")
head(selfesteem, 3)
## # A tibble: 3 × 4
##      id    t1    t2    t3
##   <int> <dbl> <dbl> <dbl>
## 1     1  4.01  5.18  7.11
## 2     2  2.56  6.91  6.31
## 3     3  3.24  4.44  9.78

Reshape to long-format

library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
data <- selfesteem %>%
  gather(key = "time", value = "score", t1, t2, t3) %>%
  convert_as_factor(id, time)
head(data, 3)
## # A tibble: 3 × 3
##   id    time  score
##   <fct> <fct> <dbl>
## 1 1     t1     4.01
## 2 2     t1     2.56
## 3 3     t1     3.24

Test

ket_qua <- anova_test(data, dv = score, wid = id, within = time)
ket_qua
## ANOVA Table (type III tests)
## 
## $ANOVA
##   Effect DFn DFd      F        p p<.05   ges
## 1   time   2  18 55.469 2.01e-08     * 0.829
## 
## $`Mauchly's Test for Sphericity`
##   Effect     W     p p<.05
## 1   time 0.551 0.092      
## 
## $`Sphericity Corrections`
##   Effect  GGe      DF[GG]    p[GG] p[GG]<.05   HFe      DF[HF]    p[HF]
## 1   time 0.69 1.38, 12.42 2.16e-06         * 0.774 1.55, 13.94 6.03e-07
##   p[HF]<.05
## 1         *
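
If the within-subject effect is significant, pairwise paired t-tests with a multiplicity correction are a common follow-up (a sketch using rstatix; output not shown):

# Pairwise paired t-tests between time points, Bonferroni-adjusted
data %>%
  pairwise_t_test(score ~ time, paired = TRUE, p.adjust.method = "bonferroni")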

Friedman test

Data

library(readr)
data <- read_csv("friedman.csv", show_col_types = FALSE)
data
## # A tibble: 20 × 3
##    plant_var locations disease
##    <chr>     <chr>       <dbl>
##  1 P1        L1              4
##  2 P2        L1              3
##  3 P3        L1              5
##  4 P4        L1              5
##  5 P5        L1              4
##  6 P1        L2              2
##  7 P2        L2              1
##  8 P3        L2              2
##  9 P4        L2              1
## 10 P5        L2              1
## 11 P1        L3              5
## 12 P2        L3              4
## 13 P3        L3              3
## 14 P4        L3              4
## 15 P5        L3              4
## 16 P1        L4              4
## 17 P2        L4              3
## 18 P3        L4              4
## 19 P4        L4              4
## 20 P5        L4              5

Test

rstatix::friedman_test(data, disease~locations|plant_var)
## # A tibble: 1 × 6
##   .y.         n statistic    df      p method       
## * <chr>   <int>     <dbl> <dbl>  <dbl> <chr>        
## 1 disease     5      9.85     3 0.0199 Friedman test

Pairwise comparison

pwc <- data %>%
  wilcox_test(disease ~ locations, paired = TRUE, p.adjust.method = "bonferroni")
pwc
## # A tibble: 6 × 9
##   .y.     group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>   <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 disease L1     L2         5     5        15 0.057 0.341 ns          
## 2 disease L1     L3         5     5         6 0.85  1     ns          
## 3 disease L1     L4         5     5         4 0.773 1     ns          
## 4 disease L2     L3         5     5         0 0.048 0.286 ns          
## 5 disease L2     L4         5     5         0 0.054 0.327 ns          
## 6 disease L3     L4         5     5         5 1     1     ns

Factorial ANOVA

Data

library(readr)
data <- read_csv("factorial_anova.csv", show_col_types = FALSE)
data = data %>% janitor::clean_names()
data$temp = factor(data$temp)
data$glass = factor(data$glass)
data
## # A tibble: 27 × 3
##    glass temp  light
##    <fct> <fct> <dbl>
##  1 A     100     580
##  2 A     100     568
##  3 A     100     570
##  4 B     100     550
##  5 B     100     530
##  6 B     100     579
##  7 C     100     546
##  8 C     100     575
##  9 C     100     599
## 10 A     125    1090
## # … with 17 more rows

Test

ket_qua <- aov(light ~ temp + glass, data = data)
summary(ket_qua)
##             Df  Sum Sq Mean Sq F value   Pr(>F)    
## temp         2 1970335  985167  72.943 1.96e-10 ***
## glass        2  150865   75432   5.585   0.0109 *  
## Residuals   22  297131   13506                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
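
The model above includes only the main effects. A full factorial model also tests the temp:glass interaction, which the temp * glass formula expands to (a sketch; output not shown):

# Two-way factorial ANOVA with interaction (temp * glass = temp + glass + temp:glass)
summary(aov(light ~ temp * glass, data = data))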

One-way MANOVA

Data

library(readr)
data <- read_csv("manova.csv", show_col_types = FALSE)
data
## # A tibble: 40 × 3
##    plant_var height canopy_vol
##    <chr>      <dbl>      <dbl>
##  1 A             20       0.7 
##  2 A             22       0.8 
##  3 A             24       0.95
##  4 A             18       0.6 
##  5 A             20       0.74
##  6 A             20       0.76
##  7 A             16       0.84
##  8 A             17       0.66
##  9 A             18       0.99
## 10 A             14       0.8 
## # … with 30 more rows

Test

ket_qua = manova(cbind(height, canopy_vol)~plant_var, 
                 data = data)
summary(ket_qua)
##           Df Pillai approx F num Df den Df    Pr(>F)    
## plant_var  3 1.0365   12.909      6     72 7.575e-10 ***
## Residuals 36                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Post hoc test

lda_fit <- MASS::lda(plant_var ~ cbind(height, canopy_vol), CV = F, data = data)
lda_fit
## Call:
## lda(plant_var ~ cbind(height, canopy_vol), data = data, CV = F)
## 
## Prior probabilities of groups:
##    A    B    C    D 
## 0.25 0.25 0.25 0.25 
## 
## Group means:
##   cbind(height, canopy_vol)height cbind(height, canopy_vol)canopy_vol
## A                           18.90                               0.784
## B                           16.54                               0.608
## C                            3.05                               0.272
## D                            9.35                               0.474
## 
## Coefficients of linear discriminants:
##                                            LD1        LD2
## cbind(height, canopy_vol)height     -0.4388374 -0.2751091
## cbind(height, canopy_vol)canopy_vol -1.3949158  9.3256280
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9855 0.0145
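
Besides LDA, a common follow-up to a significant MANOVA is to look at the univariate ANOVAs for each response variable, which summary.aov() extracts from the manova fit (a sketch; output not shown):

# Separate univariate ANOVAs for height and canopy_vol
summary.aov(ket_qua)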

Survival analysis

Required libraries

pacman::p_load(survival, ggplot2, survminer)

Data

df = survival::veteran
head(df)
##   trt celltype time status karno diagtime age prior
## 1   1 squamous   72      1    60        7  69     0
## 2   1 squamous  411      1    70        5  64    10
## 3   1 squamous  228      1    60        3  38     0
## 4   1 squamous  126      1    60        9  63    10
## 5   1 squamous  118      1    70       11  65    10
## 6   1 squamous   10      1    20        5  49     0

Build standard survival object with Surv()

surv_object = with(df, Surv(time, status))
surv_object
##   [1]  72  411  228  126  118   10   82  110  314  100+  42    8  144   25+  11 
##  [16]  30  384    4   54   13  123+  97+ 153   59  117   16  151   22   56   21 
##  [31]  18  139   20   31   52  287   18   51  122   27   54    7   63  392   10 
##  [46]   8   92   35  117  132   12  162    3   95  177  162  216  553  278   12 
##  [61] 260  200  156  182+ 143  105  103  250  100  999  112   87+ 231+ 242  991 
##  [76] 111    1  587  389   33   25  357  467  201    1   30   44  283   15   25 
##  [91] 103+  21   13   87    2   20    7   24   99    8   99   61   25   95   80 
## [106]  51   29   24   18   83+  31   51   90   52   73    8   36   48    7  140 
## [121] 186   84   19   45   80   52  164   19   53   15   43  340  133  111  231 
## [136] 378   49

Begin analysis

Function: survfit(formula, data). Our formula is Surv(time, status) ~ 1, where the 1 means all observations are pooled into a single group (no grouping variable).

surv_fit = survfit(Surv(time, status)~1, data = df)
summary(surv_fit, time = c(1, 300, 600, 900))
## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    137       2    0.985  0.0102      0.96552       1.0000
##   300     13     113    0.117  0.0295      0.07147       0.1917
##   600      2      11    0.018  0.0126      0.00459       0.0707
##   900      2       0    0.018  0.0126      0.00459       0.0707

Plot

ggsurvplot(surv_fit)

Calculate median

Median

surv_fit
## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
## 
##        n events median 0.95LCL 0.95UCL
## [1,] 137    128     80      52     105

Plot median

The surv.median.line argument accepts "h", "v", or "hv" (a horizontal line, a vertical line, or both at the median).

ggsurvplot(surv_fit, surv.median.line = "hv", 
           ggtheme = theme_grey())

Compare survival curves by treatment group (trt)

Change the 1 in the previous formula to trt. Also show the log-rank test p-value.

surv_fit = survfit(Surv(time, status)~trt, data = df)
ggsurvplot(surv_fit, pval = T, pval.method = T,
           risk.table = T,
           ggtheme = theme_grey())
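
The p-value that ggsurvplot() displays is the log-rank test; the same test can be run directly with survdiff() (a sketch; output not shown):

# Log-rank test comparing survival between the treatment groups
survival::survdiff(Surv(time, status) ~ trt, data = df)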

Cox model (Cox proportional hazards model)

cox = coxph(Surv(time, status) ~ trt + celltype + 
              karno + diagtime + 
              age + prior , 
            data = df)
summary(cox)
## Call:
## coxph(formula = Surv(time, status) ~ trt + celltype + karno + 
##     diagtime + age + prior, data = df)
## 
##   n= 137, number of events= 128 
## 
##                         coef  exp(coef)   se(coef)      z Pr(>|z|)    
## trt                2.946e-01  1.343e+00  2.075e-01  1.419  0.15577    
## celltypesmallcell  8.616e-01  2.367e+00  2.753e-01  3.130  0.00175 ** 
## celltypeadeno      1.196e+00  3.307e+00  3.009e-01  3.975 7.05e-05 ***
## celltypelarge      4.013e-01  1.494e+00  2.827e-01  1.420  0.15574    
## karno             -3.282e-02  9.677e-01  5.508e-03 -5.958 2.55e-09 ***
## diagtime           8.132e-05  1.000e+00  9.136e-03  0.009  0.99290    
## age               -8.706e-03  9.913e-01  9.300e-03 -0.936  0.34920    
## prior              7.159e-03  1.007e+00  2.323e-02  0.308  0.75794    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## trt                  1.3426     0.7448    0.8939    2.0166
## celltypesmallcell    2.3669     0.4225    1.3799    4.0597
## celltypeadeno        3.3071     0.3024    1.8336    5.9647
## celltypelarge        1.4938     0.6695    0.8583    2.5996
## karno                0.9677     1.0334    0.9573    0.9782
## diagtime             1.0001     0.9999    0.9823    1.0182
## age                  0.9913     1.0087    0.9734    1.0096
## prior                1.0072     0.9929    0.9624    1.0541
## 
## Concordance= 0.736  (se = 0.021 )
## Likelihood ratio test= 62.1  on 8 df,   p=2e-10
## Wald test            = 62.37  on 8 df,   p=2e-10
## Score (logrank) test = 66.74  on 8 df,   p=2e-11

Note: although the Cox model is quite powerful, be aware of the assumptions that must be satisfied before it is applied.
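
A standard check of the proportional hazards assumption is cox.zph() on the fitted model (a sketch; output not shown):

# Test the proportional hazards assumption for each covariate
# (a small p-value suggests the assumption is violated for that term)
ph_check <- survival::cox.zph(cox)
ph_check
# Optional: plot the scaled Schoenfeld residuals
# survminer::ggcoxzph(ph_check)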

Exploring normality

Required libraries

pacman::p_load(ggpubr)

Density plot

The density plot provides a visual judgment about whether the distribution is bell shaped.

my_data = read_csv("anova.csv")
## Rows: 10 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): nhom_1, nhom_2, nhom_3, nhom_4
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
p1 = ggdensity(my_data$nhom_1, 
          main = "Density plot of nhom_1",
          xlab = "nhom_1")
p2 = ggdensity(my_data$nhom_2, ylab = "")
p3 = ggdensity(my_data$nhom_3, ylab = "")
ggarrange(p1, p2, p3)

Q-Q plot

Q-Q plot (or quantile-quantile plot) draws the correlation between a given sample and the normal distribution. A 45-degree reference line is also plotted.

p1 = ggqqplot(my_data$nhom_1)
p2 = ggqqplot(my_data$nhom_2)
p3 = ggqqplot(my_data$nhom_3)
ggarrange(p1, p2 ,p3)

Normality tests

Shapiro-Wilk

shapiro.test(my_data$nhom_1)
## 
##  Shapiro-Wilk normality test
## 
## data:  my_data$nhom_1
## W = 0.91111, p-value = 0.2887
shapiro.test(my_data$nhom_2)
## 
##  Shapiro-Wilk normality test
## 
## data:  my_data$nhom_2
## W = 0.94807, p-value = 0.6458
shapiro.test(my_data$nhom_3)
## 
##  Shapiro-Wilk normality test
## 
## data:  my_data$nhom_3
## W = 0.95563, p-value = 0.7351

Anderson-Darling

Lilliefors

Kolmogorov-Smirnov

The Shapiro-Wilk test is the most powerful of these (see the reference here). All four tests above are affected by small sample sizes (around 30 observations or fewer).
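
The Anderson-Darling and Lilliefors tests are provided by the nortest package, while the Kolmogorov-Smirnov test is in base stats. A minimal sketch, assuming nortest is installed (output not shown):

pacman::p_load(nortest)

nortest::ad.test(my_data$nhom_1)       # Anderson-Darling
nortest::lillie.test(my_data$nhom_1)   # Lilliefors (KS with estimated parameters)
ks.test(my_data$nhom_1, "pnorm",
        mean(my_data$nhom_1), sd(my_data$nhom_1))  # Kolmogorov-Smirnov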

Skewness and kurtosis

Skewness is usually described as a measure of a dataset’s symmetry.

Kurtosis is the degree of peakedness of a distribution. The commonly accepted range for an approximately normal distribution is -2 to +2.

psych::describe(my_data$nhom_1)
##    vars  n mean   sd median trimmed  mad  min   max range  skew kurtosis   se
## X1    1 10 11.2 1.99   11.8   11.26 1.68 7.86 14.08  6.22 -0.46    -1.05 0.63
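
psych::describe() also accepts a whole data frame, so the skew and kurtosis columns can be checked for every group at once (a sketch; output not shown):

# Skewness and kurtosis for all four groups
psych::describe(my_data)[, c("skew", "kurtosis")]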

Data visualization