R stats tutorial


Contact:


Required libraries

pacman::p_load(tidyverse, readxl)

Chi square & Fisher’s exact test

Data

data <- read_excel("chi_square_independence_sheet3.xlsx", 
                   sheet = "xàm")
data = data %>% janitor::clean_names()
# Recode inconsistent entries in the occupation column (nghe) to a single label
data$nghe[data$nghe == '8.0'] = 'Bác sỹ'
data$nghe[data$nghe == 'Bác sĩ'] = 'Bác sỹ'
data
## # A tibble: 80 × 2
##    gioi_tinh nghe     
##    <chr>     <chr>    
##  1 Nữ        Bác sỹ   
##  2 Nữ        Bác sỹ   
##  3 Nữ        Bác sỹ   
##  4 Nữ        Bác sỹ   
##  5 Nữ        Bác sỹ   
##  6 Nữ        Bác sỹ   
##  7 Nữ        Bác sỹ   
##  8 Nữ        Bác sỹ   
##  9 Nữ        Giáo viên
## 10 Nữ        Giáo viên
## # … with 70 more rows

Chi square

chisq = chisq.test(table(data))
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  table(data)
## X-squared = 12.578, df = 2, p-value = 0.001856

Expected frequencies

chisq$expected
##          nghe
## gioi_tinh Bác sỹ Giáo viên  Kỹ sư
##       Nam   10.2    14.875  8.925
##       Nữ    13.8    20.125 12.075
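
A common rule of thumb is that the chi-square approximation is reliable only when every expected count is at least 5; otherwise Fisher's exact test (below) is preferred. A quick check on the chisq object above (a minimal sketch; output not shown):

# TRUE if any cell has an expected count below 5
any(chisq$expected < 5)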

Fisher’s exact test

fisher.test(table(data))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(data)
## p-value = 0.001824
## alternative hypothesis: two.sided

ANOVA

Data

library(readr)
data <- read_csv("OneWayANOVAExcel.csv")
## Rows: 10 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): nhom_1, nhom_2, nhom_3, nhom_4
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 10 × 4
##    nhom_1 nhom_2 nhom_3 nhom_4
##     <dbl>  <dbl>  <dbl>  <dbl>
##  1  11.7   10.6   10.3    6.90
##  2  12.0   13.5   12.2    8.99
##  3   8.04   7.42  10.6    6.97
##  4  10.6   12.0    9.66   9.16
##  5  14.1    7.78   8.79   8.68
##  6  10.8   10.7   10.9   11.4 
##  7   7.86  10.7   10.4   10.8 
##  8  11.9    4.48  10.2    5.67
##  9  11.9    6.80  11.6   10.8 
## 10  13.2    5.37  12.3    9.01

Reshape data to long-format

data = data %>% pivot_longer(names_to = 'nhom',
                      values_to = 'gia_tri',
                      cols = starts_with('nhom'))
data
## # A tibble: 40 × 2
##    nhom   gia_tri
##    <chr>    <dbl>
##  1 nhom_1   11.7 
##  2 nhom_2   10.6 
##  3 nhom_3   10.3 
##  4 nhom_4    6.90
##  5 nhom_1   12.0 
##  6 nhom_2   13.5 
##  7 nhom_3   12.2 
##  8 nhom_4    8.99
##  9 nhom_1    8.04
## 10 nhom_2    7.42
## # … with 30 more rows

One-way ANOVA

ket_qua = aov(gia_tri~nhom, data = data) 
summary(ket_qua)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## nhom         3  43.62  14.540   3.303 0.0311 *
## Residuals   36 158.47   4.402                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
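
Since the overall F test is significant, a typical follow-up is Tukey's HSD on the fitted aov object (a sketch; output not shown):

# Pairwise comparisons of group means with family-wise error control
TukeyHSD(ket_qua)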

One-way ANOVA on ranks (Kruskal-Wallis H test)

Data

library(readr)
data <- read_csv("anova.csv", show_col_types = FALSE)
data
## # A tibble: 10 × 4
##    nhom_1 nhom_2 nhom_3 nhom_4
##     <dbl>  <dbl>  <dbl>  <dbl>
##  1  11.7   10.6   10.3    6.90
##  2  12.0   13.5   12.2    8.99
##  3   8.04   7.42  10.6    6.97
##  4  10.6   12.0    9.66   9.16
##  5  14.1    7.78   8.79   8.68
##  6  10.8   10.7   10.9   11.4 
##  7   7.86  10.7   10.4   10.8 
##  8  11.9    4.48  10.2    5.67
##  9  11.9    6.80  11.6   10.8 
## 10  13.2    5.37  12.3    9.01

Reshape data to long-format

data = data %>% pivot_longer(names_to = 'nhom',
                      values_to = 'gia_tri',
                      cols = starts_with('nhom'))
data
## # A tibble: 40 × 2
##    nhom   gia_tri
##    <chr>    <dbl>
##  1 nhom_1   11.7 
##  2 nhom_2   10.6 
##  3 nhom_3   10.3 
##  4 nhom_4    6.90
##  5 nhom_1   12.0 
##  6 nhom_2   13.5 
##  7 nhom_3   12.2 
##  8 nhom_4    8.99
##  9 nhom_1    8.04
## 10 nhom_2    7.42
## # … with 30 more rows

Test

kruskal.test(gia_tri ~ nhom, data = data)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  gia_tri by nhom
## Kruskal-Wallis chi-squared = 7.5468, df = 3, p-value = 0.05637

Post hoc

pairwise.wilcox.test(data$gia_tri, data$nhom,
                 p.adjust.method = "BH")
## 
##  Pairwise comparisons using Wilcoxon rank sum exact test 
## 
## data:  data$gia_tri and data$nhom 
## 
##        nhom_1 nhom_2 nhom_3
## nhom_2 0.15   -      -     
## nhom_3 0.42   0.42   -     
## nhom_4 0.11   0.91   0.11  
## 
## P value adjustment method: BH

Paired T-test

Data

library(readr)
data <- read_csv("paired_t_test.csv")
## Rows: 63 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): time_1, time_2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 63 × 2
##    time_1 time_2
##     <dbl>  <dbl>
##  1   55.2   63.7
##  2   43.0   35.9
##  3   49.6   43.8
##  4   43.0   34.8
##  5   46.9   43.1
##  6   49.7   44.3
##  7   43.0   34.8
##  8   51.0   43.4
##  9   49.0   56.2
## 10   46.5   43.8
## # … with 53 more rows

Test

t.test(data$time_1, data$time_2, paired = TRUE)
## 
##  Paired t-test
## 
## data:  data$time_1 and data$time_2
## t = -2.9615, df = 62, p-value = 0.004335
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -3.6850713 -0.7150388
## sample estimates:
## mean difference 
##       -2.200055
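
The paired t-test assumes that the within-pair differences are approximately normal. A quick check on the differences (a sketch; output not shown):

# Shapiro-Wilk test on the paired differences
shapiro.test(data$time_1 - data$time_2)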

Wilcoxon signed-rank test

Data

data <- read_csv("paired_t_test.csv")
## Rows: 63 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): time_1, time_2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 63 × 2
##    time_1 time_2
##     <dbl>  <dbl>
##  1   55.2   63.7
##  2   43.0   35.9
##  3   49.6   43.8
##  4   43.0   34.8
##  5   46.9   43.1
##  6   49.7   44.3
##  7   43.0   34.8
##  8   51.0   43.4
##  9   49.0   56.2
## 10   46.5   43.8
## # … with 53 more rows

Test

wilcox.test(data$time_1, data$time_2)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data$time_1 and data$time_2
## W = 1633, p-value = 0.08679
## alternative hypothesis: true location shift is not equal to 0
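
Note that, called as above, wilcox.test() defaults to the unpaired rank-sum (Mann-Whitney) test, which is what the output reports. To run the Wilcoxon signed-rank test on these paired measurements, add paired = TRUE (a sketch; output not shown):

# Wilcoxon signed-rank test on paired observations
wilcox.test(data$time_1, data$time_2, paired = TRUE)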

McNemar's test

Data

library(readr)
data <- read_csv("mcnemar.csv")
## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): sample_population, sample_virus
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 100 × 2
##    sample_population sample_virus
##                <dbl>        <dbl>
##  1                 0            1
##  2                 1            1
##  3                 0            0
##  4                 1            1
##  5                 1            1
##  6                 0            1
##  7                 0            0
##  8                 1            1
##  9                 0            0
## 10                 0            0
## # … with 90 more rows

Test

mcnemar.test(table(data))
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  table(data)
## McNemar's chi-squared = 0.02381, df = 1, p-value = 0.8774
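
McNemar's test is driven by the discordant (off-diagonal) cells of the paired 2x2 table, so it helps to inspect that table alongside the result (a sketch; output not shown):

# Paired 2x2 contingency table; the off-diagonal cells are the discordant pairs
table(data)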

One-way repeated measures ANOVA

Data

data("selfesteem", package = "datarium")
head(selfesteem, 3)
## # A tibble: 3 × 4
##      id    t1    t2    t3
##   <int> <dbl> <dbl> <dbl>
## 1     1  4.01  5.18  7.11
## 2     2  2.56  6.91  6.31
## 3     3  3.24  4.44  9.78

Reshape to long-format

library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
data <- selfesteem %>%
  gather(key = "time", value = "score", t1, t2, t3) %>%
  convert_as_factor(id, time)
head(data, 3)
## # A tibble: 3 × 3
##   id    time  score
##   <fct> <fct> <dbl>
## 1 1     t1     4.01
## 2 2     t1     2.56
## 3 3     t1     3.24

Test

ket_qua <- anova_test(data, dv = score, wid = id, within = time)
ket_qua
## ANOVA Table (type III tests)
## 
## $ANOVA
##   Effect DFn DFd      F        p p<.05   ges
## 1   time   2  18 55.469 2.01e-08     * 0.829
## 
## $`Mauchly's Test for Sphericity`
##   Effect     W     p p<.05
## 1   time 0.551 0.092      
## 
## $`Sphericity Corrections`
##   Effect  GGe      DF[GG]    p[GG] p[GG]<.05   HFe      DF[HF]    p[HF]
## 1   time 0.69 1.38, 12.42 2.16e-06         * 0.774 1.55, 13.94 6.03e-07
##   p[HF]<.05
## 1         *
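
If the within-subject effect is significant, pairwise paired t-tests with a multiplicity correction are a common follow-up (a sketch using rstatix; output not shown):

# Pairwise paired t-tests between time points, Bonferroni-adjusted
data %>%
  pairwise_t_test(score ~ time, paired = TRUE, p.adjust.method = "bonferroni")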

Friedman test

Data

library(readr)
data <- read_csv("friedman.csv", show_col_types = FALSE)
data
## # A tibble: 20 × 3
##    plant_var locations disease
##    <chr>     <chr>       <dbl>
##  1 P1        L1              4
##  2 P2        L1              3
##  3 P3        L1              5
##  4 P4        L1              5
##  5 P5        L1              4
##  6 P1        L2              2
##  7 P2        L2              1
##  8 P3        L2              2
##  9 P4        L2              1
## 10 P5        L2              1
## 11 P1        L3              5
## 12 P2        L3              4
## 13 P3        L3              3
## 14 P4        L3              4
## 15 P5        L3              4
## 16 P1        L4              4
## 17 P2        L4              3
## 18 P3        L4              4
## 19 P4        L4              4
## 20 P5        L4              5

Test

rstatix::friedman_test(data, disease~locations|plant_var)
## # A tibble: 1 × 6
##   .y.         n statistic    df      p method       
## * <chr>   <int>     <dbl> <dbl>  <dbl> <chr>        
## 1 disease     5      9.85     3 0.0199 Friedman test

Pairwise comparison

pwc <- data %>%
  wilcox_test(disease ~ locations, paired = TRUE, p.adjust.method = "bonferroni")
pwc
## # A tibble: 6 × 9
##   .y.     group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>   <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 disease L1     L2         5     5        15 0.057 0.341 ns          
## 2 disease L1     L3         5     5         6 0.85  1     ns          
## 3 disease L1     L4         5     5         4 0.773 1     ns          
## 4 disease L2     L3         5     5         0 0.048 0.286 ns          
## 5 disease L2     L4         5     5         0 0.054 0.327 ns          
## 6 disease L3     L4         5     5         5 1     1     ns

Factorial ANOVA

Data

library(readr)
data <- read_csv("factorial_anova.csv", show_col_types = FALSE)
data = data %>% janitor::clean_names()
data$temp = factor(data$temp)
data$glass = factor(data$glass)
data
## # A tibble: 27 × 3
##    glass temp  light
##    <fct> <fct> <dbl>
##  1 A     100     580
##  2 A     100     568
##  3 A     100     570
##  4 B     100     550
##  5 B     100     530
##  6 B     100     579
##  7 C     100     546
##  8 C     100     575
##  9 C     100     599
## 10 A     125    1090
## # … with 17 more rows

Test

ket_qua <- aov(light ~ temp + glass, data = data)
summary(ket_qua)
##             Df  Sum Sq Mean Sq F value   Pr(>F)    
## temp         2 1970335  985167  72.943 1.96e-10 ***
## glass        2  150865   75432   5.585   0.0109 *  
## Residuals   22  297131   13506                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
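
The model above includes only the main effects. A full factorial model also tests the temp:glass interaction, which the temp * glass formula expands to (a sketch; output not shown):

# Two-way factorial ANOVA with interaction (temp * glass = temp + glass + temp:glass)
summary(aov(light ~ temp * glass, data = data))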

One-way MANOVA

Data

library(readr)
data <- read_csv("manova.csv", show_col_types = FALSE)
data
## # A tibble: 40 × 3
##    plant_var height canopy_vol
##    <chr>      <dbl>      <dbl>
##  1 A             20       0.7 
##  2 A             22       0.8 
##  3 A             24       0.95
##  4 A             18       0.6 
##  5 A             20       0.74
##  6 A             20       0.76
##  7 A             16       0.84
##  8 A             17       0.66
##  9 A             18       0.99
## 10 A             14       0.8 
## # … with 30 more rows

Test

ket_qua = manova(cbind(height, canopy_vol)~plant_var, 
                 data = data)
summary(ket_qua)
##           Df Pillai approx F num Df den Df    Pr(>F)    
## plant_var  3 1.0365   12.909      6     72 7.575e-10 ***
## Residuals 36                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Post hoc test

lda_fit <- MASS::lda(plant_var ~ cbind(height, canopy_vol), CV = F, data = data)
lda_fit
## Call:
## lda(plant_var ~ cbind(height, canopy_vol), data = data, CV = F)
## 
## Prior probabilities of groups:
##    A    B    C    D 
## 0.25 0.25 0.25 0.25 
## 
## Group means:
##   cbind(height, canopy_vol)height cbind(height, canopy_vol)canopy_vol
## A                           18.90                               0.784
## B                           16.54                               0.608
## C                            3.05                               0.272
## D                            9.35                               0.474
## 
## Coefficients of linear discriminants:
##                                            LD1        LD2
## cbind(height, canopy_vol)height     -0.4388374 -0.2751091
## cbind(height, canopy_vol)canopy_vol -1.3949158  9.3256280
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9855 0.0145
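
Besides LDA, a common follow-up to a significant MANOVA is to look at the univariate ANOVAs for each response variable, which summary.aov() extracts from the manova fit (a sketch; output not shown):

# Separate univariate ANOVAs for height and canopy_vol
summary.aov(ket_qua)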

Survival analysis

Required libraries

pacman::p_load(survival, ggplot2, survminer)

Data

df = survival::veteran
head(df)
##   trt celltype time status karno diagtime age prior
## 1   1 squamous   72      1    60        7  69     0
## 2   1 squamous  411      1    70        5  64    10
## 3   1 squamous  228      1    60        3  38     0
## 4   1 squamous  126      1    60        9  63    10
## 5   1 squamous  118      1    70       11  65    10
## 6   1 squamous   10      1    20        5  49     0

Build standard survival object with Surv()

surv_object = with(df, Surv(time, status))
surv_object
##   [1]  72  411  228  126  118   10   82  110  314  100+  42    8  144   25+  11 
##  [16]  30  384    4   54   13  123+  97+ 153   59  117   16  151   22   56   21 
##  [31]  18  139   20   31   52  287   18   51  122   27   54    7   63  392   10 
##  [46]   8   92   35  117  132   12  162    3   95  177  162  216  553  278   12 
##  [61] 260  200  156  182+ 143  105  103  250  100  999  112   87+ 231+ 242  991 
##  [76] 111    1  587  389   33   25  357  467  201    1   30   44  283   15   25 
##  [91] 103+  21   13   87    2   20    7   24   99    8   99   61   25   95   80 
## [106]  51   29   24   18   83+  31   51   90   52   73    8   36   48    7  140 
## [121] 186   84   19   45   80   52  164   19   53   15   43  340  133  111  231 
## [136] 378   49

Begin analysis

Function: survfit(formula, data). Our formula is Surv(time, status) ~ 1, where the 1 means all observations are pooled into a single group (no grouping variable).

surv_fit = survfit(Surv(time, status)~1, data = df)
summary(surv_fit, time = c(1, 300, 600, 900))
## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    137       2    0.985  0.0102      0.96552       1.0000
##   300     13     113    0.117  0.0295      0.07147       0.1917
##   600      2      11    0.018  0.0126      0.00459       0.0707
##   900      2       0    0.018  0.0126      0.00459       0.0707

Plot

ggsurvplot(surv_fit)

Calculate median

Median

surv_fit
## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
## 
##        n events median 0.95LCL 0.95UCL
## [1,] 137    128     80      52     105

Plot median

The surv.median.line argument accepts "h", "v", or "hv" (a horizontal line, a vertical line, or both at the median).

ggsurvplot(surv_fit, surv.median.line = "hv", 
           ggtheme = theme_grey())

Compare survival curves by treatment group (trt)

Change the 1 in the previous formula to trt. Also show the log-rank test p-value.

surv_fit = survfit(Surv(time, status)~trt, data = df)
ggsurvplot(surv_fit, pval = T, pval.method = T,
           risk.table = T,
           ggtheme = theme_grey())
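
The p-value that ggsurvplot() displays is the log-rank test; the same test can be run directly with survdiff() (a sketch; output not shown):

# Log-rank test comparing survival between the treatment groups
survival::survdiff(Surv(time, status) ~ trt, data = df)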

Cox model (Cox proportional hazards model)

cox = coxph(Surv(time, status) ~ trt + celltype + 
              karno + diagtime + 
              age + prior , 
            data = df)
summary(cox)
## Call:
## coxph(formula = Surv(time, status) ~ trt + celltype + karno + 
##     diagtime + age + prior, data = df)
## 
##   n= 137, number of events= 128 
## 
##                         coef  exp(coef)   se(coef)      z Pr(>|z|)    
## trt                2.946e-01  1.343e+00  2.075e-01  1.419  0.15577    
## celltypesmallcell  8.616e-01  2.367e+00  2.753e-01  3.130  0.00175 ** 
## celltypeadeno      1.196e+00  3.307e+00  3.009e-01  3.975 7.05e-05 ***
## celltypelarge      4.013e-01  1.494e+00  2.827e-01  1.420  0.15574    
## karno             -3.282e-02  9.677e-01  5.508e-03 -5.958 2.55e-09 ***
## diagtime           8.132e-05  1.000e+00  9.136e-03  0.009  0.99290    
## age               -8.706e-03  9.913e-01  9.300e-03 -0.936  0.34920    
## prior              7.159e-03  1.007e+00  2.323e-02  0.308  0.75794    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## trt                  1.3426     0.7448    0.8939    2.0166
## celltypesmallcell    2.3669     0.4225    1.3799    4.0597
## celltypeadeno        3.3071     0.3024    1.8336    5.9647
## celltypelarge        1.4938     0.6695    0.8583    2.5996
## karno                0.9677     1.0334    0.9573    0.9782
## diagtime             1.0001     0.9999    0.9823    1.0182
## age                  0.9913     1.0087    0.9734    1.0096
## prior                1.0072     0.9929    0.9624    1.0541
## 
## Concordance= 0.736  (se = 0.021 )
## Likelihood ratio test= 62.1  on 8 df,   p=2e-10
## Wald test            = 62.37  on 8 df,   p=2e-10
## Score (logrank) test = 66.74  on 8 df,   p=2e-11

Note: although the Cox model is quite powerful, be aware of the assumptions that must be satisfied before it is applied.
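
A standard check of the proportional hazards assumption is cox.zph() on the fitted model (a sketch; output not shown):

# Test the proportional hazards assumption for each covariate
# (a small p-value suggests the assumption is violated for that term)
ph_check <- survival::cox.zph(cox)
ph_check
# Optional: plot the scaled Schoenfeld residuals
# survminer::ggcoxzph(ph_check)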

Exploring normality

Required libraries

pacman::p_load(ggpubr)

Density plot

The density plot provides a visual judgment about whether the distribution is bell shaped.

my_data = read_csv("anova.csv")
## Rows: 10 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): nhom_1, nhom_2, nhom_3, nhom_4
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
p1 = ggdensity(my_data$nhom_1, 
          main = "Density plot of nhom_1",
          xlab = "nhom_1")
p2 = ggdensity(my_data$nhom_2, ylab = "")
p3 = ggdensity(my_data$nhom_3, ylab = "")
ggarrange(p1, p2, p3)

Q-Q plot

Q-Q plot (or quantile-quantile plot) draws the correlation between a given sample and the normal distribution. A 45-degree reference line is also plotted.

p1 = ggqqplot(my_data$nhom_1)
p2 = ggqqplot(my_data$nhom_2)
p3 = ggqqplot(my_data$nhom_3)
ggarrange(p1, p2 ,p3)

Normality tests

Shapiro-Wilk

shapiro.test(my_data$nhom_1)
## 
##  Shapiro-Wilk normality test
## 
## data:  my_data$nhom_1
## W = 0.91111, p-value = 0.2887
shapiro.test(my_data$nhom_2)
## 
##  Shapiro-Wilk normality test
## 
## data:  my_data$nhom_2
## W = 0.94807, p-value = 0.6458
shapiro.test(my_data$nhom_3)
## 
##  Shapiro-Wilk normality test
## 
## data:  my_data$nhom_3
## W = 0.95563, p-value = 0.7351

Anderson-Darling

Lilliefors

Kolmogorov-Smirnov

The Shapiro-Wilk test is the most powerful of these (see the reference here). All four tests above are affected by small sample sizes (around 30 observations or fewer).
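
The Anderson-Darling and Lilliefors tests are provided by the nortest package, while the Kolmogorov-Smirnov test is in base stats. A minimal sketch, assuming nortest is installed (output not shown):

pacman::p_load(nortest)

nortest::ad.test(my_data$nhom_1)       # Anderson-Darling
nortest::lillie.test(my_data$nhom_1)   # Lilliefors (KS with estimated parameters)
ks.test(my_data$nhom_1, "pnorm",
        mean(my_data$nhom_1), sd(my_data$nhom_1))  # Kolmogorov-Smirnov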

Skewness and kurtosis

Skewness is usually described as a measure of a dataset’s symmetry.

Kurtosis is the degree of peakedness of a distribution. The commonly accepted range for an approximately normal distribution is -2 to +2.

psych::describe(my_data$nhom_1)
##    vars  n mean   sd median trimmed  mad  min   max range  skew kurtosis   se
## X1    1 10 11.2 1.99   11.8   11.26 1.68 7.86 14.08  6.22 -0.46    -1.05 0.63
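
psych::describe() also accepts a whole data frame, so the skew and kurtosis columns can be checked for every group at once (a sketch; output not shown):

# Skewness and kurtosis for all four groups
psych::describe(my_data)[, c("skew", "kurtosis")]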

Data visualization