基本统计

参考资料：R语言实战

研究类型：基因表达与临床变量的关系

知识点：

组间差异比较
正态分布检验
方差齐性检验
效力评估

## 准备工作
if(T){
setwd("D:/R/R-4.0.5/bin/project_writing/LIM/Statistics")
rm(list=ls())
gc()

library(tidyverse)
library(car)
library(gplots)
#install.packages("HH")
library(HH)
#install.packages("rrcov")
library(rrcov)
#install.packages("multicomp")
#library(multicomp)
#install.packages("effects")
library(effects)
#install.packages("MASS")
library(MASS)
#install.packages("mvoutlier")
library(mvoutlier)
#install.packages("pwr")
library(pwr)

load("pdata.Rdata")
}

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

## 载入需要的程辑包：carData

## 
## 载入程辑包：'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

## 
## 载入程辑包：'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

## 载入需要的程辑包：lattice

## 载入需要的程辑包：grid

## 载入需要的程辑包：latticeExtra

## 
## 载入程辑包：'latticeExtra'

## The following object is masked from 'package:ggplot2':
## 
##     layer

## 载入需要的程辑包：multcomp

## 载入需要的程辑包：mvtnorm

## 载入需要的程辑包：survival

## 载入需要的程辑包：TH.data

## 载入需要的程辑包：MASS

## 
## 载入程辑包：'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## 
## 载入程辑包：'TH.data'

## The following object is masked from 'package:MASS':
## 
##     geyser

## 载入需要的程辑包：gridExtra

## 
## 载入程辑包：'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

## 
## 载入程辑包：'HH'

## The following object is masked from 'package:gplots':
## 
##     residplot

## The following objects are masked from 'package:car':
## 
##     logit, vif

## The following object is masked from 'package:purrr':
## 
##     transpose

## 载入需要的程辑包：robustbase

## 
## 载入程辑包：'robustbase'

## The following object is masked from 'package:survival':
## 
##     heart

## Scalable Robust Estimators with High Breakdown Point (version 1.6-0)

## Use the command
##     lattice::trellis.par.set(effectsTheme())
##   to customize lattice options for effects plots.
## See ?effectsTheme for details.

## 载入需要的程辑包：sgeostat

组间差异比较

两组间比较还是多组比较？
参数还是非参数？

	2组	多组	事后两两检验
参数	t-检验	ANOVA	F检验；Gam es-Howell检验
非参数	wilcoxon秩和（MW检验）	KW检验	Dunn检验

2组间比较

Student’s t-Test

Description: Performs one and two sample t-tests on vectors of data.

适用条件：①under normality assumptions ②large samples（多大叫大？）

说明：比较的是均值，一般不是特别偏态分布就可以用

Wilcoxon Rank Sum and Signed Rank Tests

Description: Performs one- and two-sample Wilcoxon tests on vectors of data; the latter is also known as Mann-Whitney test.

适用条件：①不符合t-test的条件（不满足正态分布）；②因变量为等级、分类变量时；③是t-test的替代品，效力不如t-test

示例

## AJCC_N
glist = "HOXA1"
#colnames(pdata) 
GroupBy <- "AJCC_N" 
dat <- pdata %>% dplyr::select(GroupBy,all_of(glist))

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(GroupBy)` instead of `GroupBy` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

#dat <- dat %>% filter(SiteClassify != "other") 
colnames(dat) <- c("Group","Expression")
str(dat)

## 'data.frame':    500 obs. of  2 variables:
##  $ Group     : Factor w/ 2 levels "N0","N1-3": 2 2 2 2 2 2 1 1 2 2 ...
##  $ Expression: num  6.74 8.81 9.89 9.04 8.25 ...

summary(dat)

##   Group       Expression    
##  N0  :239   Min.   : 2.000  
##  N1-3:239   1st Qu.: 7.355  
##  NA's: 22   Median : 7.989  
##             Mean   : 7.806  
##             3rd Qu.: 8.582  
##             Max.   :10.295

t.test(Expression~Group,data = dat)

## 
##  Welch Two Sample t-test
## 
## data:  Expression by Group
## t = 0.51888, df = 475.53, p-value = 0.6041
## alternative hypothesis: true difference in means between group N0 and group N1-3 is not equal to 0
## 95 percent confidence interval:
##  -0.1611355  0.2767716
## sample estimates:
##   mean in group N0 mean in group N1-3 
##           7.845452           7.787634

wilcox.test(Expression~Group,data = dat)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Expression by Group
## W = 30205, p-value = 0.2763
## alternative hypothesis: true location shift is not equal to 0

## 治疗结果
#colnames(pdata) 
GroupBy <- "OUTCOME"
dat <- pdata %>% dplyr::select(GroupBy,all_of(glist)) 
#dat <- dat %>% filter(SiteClassify != "other") 
colnames(dat) <- c("Group","Expression" )
str(dat)

## 'data.frame':    500 obs. of  2 variables:
##  $ Group     : Factor w/ 2 levels "Remission","Resistent": 1 1 2 1 2 NA 1 1 NA 1 ...
##  $ Expression: num  6.74 8.81 9.89 9.04 8.25 ...

summary(dat)

##        Group       Expression    
##  Remission:322   Min.   : 2.000  
##  Resistent: 38   1st Qu.: 7.355  
##  NA's     :140   Median : 7.989  
##                  Mean   : 7.806  
##                  3rd Qu.: 8.582  
##                  Max.   :10.295

t.test(Expression~Group,data = dat)

## 
##  Welch Two Sample t-test
## 
## data:  Expression by Group
## t = -0.52498, df = 44.843, p-value = 0.6022
## alternative hypothesis: true difference in means between group Remission and group Resistent is not equal to 0
## 95 percent confidence interval:
##  -0.5793412  0.3397915
## sample estimates:
## mean in group Remission mean in group Resistent 
##                7.784062                7.903836

wilcox.test(Expression~Group,data = dat)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Expression by Group
## W = 5511.5, p-value = 0.3179
## alternative hypothesis: true location shift is not equal to 0

## HPV
#colnames(pdata) 
GroupBy <- "HPV" 
dat <- pdata %>% dplyr::select(GroupBy,all_of(glist))
colnames(dat) <- c("Group","Expression") 
str(dat)

## 'data.frame':    500 obs. of  2 variables:
##  $ Group     : Factor w/ 2 levels "Negative","Positive": NA NA NA NA NA NA NA NA NA NA ...
##  $ Expression: num  6.74 8.81 9.89 9.04 8.25 ...

summary(dat)

##       Group       Expression    
##  Negative: 56   Min.   : 2.000  
##  Positive: 33   1st Qu.: 7.355  
##  NA's    :411   Median : 7.989  
##                 Mean   : 7.806  
##                 3rd Qu.: 8.582  
##                 Max.   :10.295

t.test(Expression~Group,data = dat)

## 
##  Welch Two Sample t-test
## 
## data:  Expression by Group
## t = 3.2434, df = 60.195, p-value = 0.001929
## alternative hypothesis: true difference in means between group Negative and group Positive is not equal to 0
## 95 percent confidence interval:
##  0.3025616 1.2760776
## sample estimates:
## mean in group Negative mean in group Positive 
##               7.931633               7.142314

wilcox.test(Expression~Group,data = dat)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Expression by Group
## W = 1285.5, p-value = 0.002166
## alternative hypothesis: true location shift is not equal to 0

多组间比较

是否有差异

Fit an Analysis of Variance Model（ANOVA）

Description: Fit an analysis of variance model by a call to lm for each stratum.

适用条件：正态分布、方差相齐

Kruskal-Wallis Rank Sum Test(KW test)

适用条件：多组间比较差异是否显著+不满足ANOVA条件

事后两两比较

参数：Games-Howell比较泛用，方差不齐也可以

非参数：Dunn检验

### AJCC_T:无差异

GroupBy <- "AJCC_T" 
dat <- pdata %>%dplyr::select(GroupBy,all_of(glist)) 
#dat <- dat %>%filter(SiteClassify != "other") 
colnames(dat) <- c("Group","Expression") 
dat <- na.omit(dat)
str(dat)

## 'data.frame':    485 obs. of  2 variables:
##  $ Group     : Factor w/ 4 levels "T1","T2","T3",..: 4 3 4 3 4 2 4 4 2 2 ...
##  $ Expression: num  6.74 8.81 9.89 9.04 8.25 ...
##  - attr(*, "na.action")= 'omit' Named int [1:15] 75 144 152 159 383 384 385 386 387 388 ...
##   ..- attr(*, "names")= chr [1:15] "75" "144" "152" "159" ...

summary(dat)

##  Group      Expression    
##  T1: 33   Min.   : 2.000  
##  T2:143   1st Qu.: 7.366  
##  T3:130   Median : 7.989  
##  T4:179   Mean   : 7.812  
##           3rd Qu.: 8.581  
##           Max.   :10.295

## ANOVA
aov(Expression~Group,data=dat)

## Call:
##    aov(formula = Expression ~ Group, data = dat)
## 
## Terms:
##                    Group Residuals
## Sum of Squares   14.6665  699.7829
## Deg. of Freedom        3       481
## 
## Residual standard error: 1.206172
## Estimated effects may be unbalanced

## 非参数检验
# 零假设的检验 
kruskal.test(Expression~Group,data = dat)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Expression by Group
## Kruskal-Wallis chi-squared = 14.303, df = 3, p-value = 0.00252

#事后两两比较
source("http://www.statmethods.net/RiA/wmc.txt") 
wmc(Expression~Group,data =dat,method="holm")#method?

## Warning in xtfrm.data.frame(x): cannot xtfrm data frames

## Descriptive Statistics
## 
##               T1         T2          T3          T4
## n      33.000000 143.000000 130.0000000 179.0000000
## median  7.491853   7.965784   7.9686409   8.0927571
## mad     1.049413   1.047666   0.8865357   0.7417449
## 
## Multiple Comparisons (Wilcoxon Rank Sum Tests)
## Probability Adjustment = holm
## 
##   Group.1 Group.2       W           p   
## 1      T1      T2  1756.0 0.089106078  .
## 2      T1      T3  1480.5 0.030505420  *
## 3      T1      T4  1759.5 0.001367242 **
## 4      T2      T3  8871.0 0.515671528   
## 5      T2      T4 11166.0 0.147839310   
## 6      T3      T4 10619.5 0.380979969   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

方差齐性检验

  library(car)
  LEVETEST <- function(V){
    pd <- pdata %>% dplyr::select(HOXA1,all_of(V))
    colnames(pd)[2] <- "group"
    pd <- pd[pd$group != "other",]
    
    T1 <- leveneTest(pd$HOXA1,pd$group)
    print(T1)
  }
  #Stage
  LEVETEST("Stage")## 符合方差齐性检验

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   3  0.0677 0.9771
##       482

  #Grade
  LEVETEST("Grade")## 不符合方差齐性检验

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   2  2.8849 0.05683 .
##       478                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

  #SiteClassify
  LEVETEST("SiteClassify")## 不符合方差齐性检验

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   2  0.0426 0.9583
##       496

  #AJCC_T
  LEVETEST("AJCC_T")## 不符合方差齐性检验

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   3  0.2979 0.8269
##       481

  ## 两组间的比较全都不符合正态分布，都要用Wilcoxon秩和检验