test_design

k;x

2021/12/3

一 Data preprocessing

1.1 Load data

## import dataset
library(readxl)
library(ggplot2)
path1 = "C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/开放手术133例.xlsx"
path2='C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/腔镜手术150例.xlsx'
data1 <- readxl::read_excel(path1, sheet = 1)
data3 <- readxl::read_excel(path2, sheet = 1)
## New names:
## * `` -> ...12
## * `` -> ...13

1.2 Data clean

开放手术

data1$中央区清扫淋巴结总数
##   [1] "5"       "1"       "1"       "0"       "4"       "5"       "4"      
##   [8] "2"       "未 清扫" "2"       "未 清扫" "2"       "7"       "3"      
##  [15] "0"       "6"       "未 清扫" "0"       "2"       "1"       "4"      
##  [22] "5"       "4"       "5"       "2"       "1"       "3"       "2"      
##  [29] "6"       "1"       "0"       "0"       "5"       "11"      "3"      
##  [36] "0"       "1"       "1"       "0"       "4"       "0"       "3"      
##  [43] "0"       "未 清扫" "1"       "1"       "14"      "2"       "2"      
##  [50] "2"       "4"       "3"       "0"       "1"       "3"       "1"      
##  [57] "0"       "2"       "0"       "2"       "3"       "1"       "5"      
##  [64] "1"       "3"       "10"      "7"       "4"       "0"       "3"      
##  [71] "5"       "7"       "2"       "2"       "1"       "5"       "6"      
##  [78] "1"       "2"       "2"       "0"       "2"       "6"       "1"      
##  [85] "4"       "1"       "3"       "1"       "3"       "9"       "0"      
##  [92] "3"       "3"       "0"       "0"       "1"       "0"       "2"      
##  [99] "3"       "2"       "1"       "3"       "2"       "2"       "1"      
## [106] "0"       "1"       "6"       "3"       "0"       "1"       "4"      
## [113] "0"       "1"       "2"       "4"       "0"       "3"       "4"      
## [120] "5"       "3"       "2"       "4"       "未 清扫" "6"       "未 清扫"
## [127] "0"       "11"      "7"       "未 清扫" "0"       "0"       "1"      
## [134] "0"
w=which(data1$中央区清扫淋巴结总数=="未 清扫")
data2=data1[-w,]
data2$中央区清扫淋巴结总数=as.numeric(data2$中央区清扫淋巴结总数)
data2$中央区清扫淋巴结总数
##   [1]  5  1  1  0  4  5  4  2  2  2  7  3  0  6  0  2  1  4  5  4  5  2  1  3  2
##  [26]  6  1  0  0  5 11  3  0  1  1  0  4  0  3  0  1  1 14  2  2  2  4  3  0  1
##  [51]  3  1  0  2  0  2  3  1  5  1  3 10  7  4  0  3  5  7  2  2  1  5  6  1  2
##  [76]  2  0  2  6  1  4  1  3  1  3  9  0  3  3  0  0  1  0  2  3  2  1  3  2  2
## [101]  1  0  1  6  3  0  1  4  0  1  2  4  0  3  4  5  3  2  4  6  0 11  7  0  0
## [126]  1  0

腔镜手术

data3$中央区清扫淋巴结总数
##   [1]  4  1  3  1  1  3  1  3  3  4  2  5  8  2  4  7  4  2  5  4  3  5  6  4  6
##  [26]  3  2  6 12  5  5  6  2  3  3  4  4  5  7  2  4  7  4  2  5  4  1  5  6  6
##  [51]  3  6  3  2  6 12  5  5  4  1  5  6  6  3  6  3  1  1  3  1  3  3  1  5  6
##  [76]  6  0  6  3  1  1  3  3  6  3  1  4  3  1  3  3  1  5  1  5  6  6  2  6  3
## [101]  4  1  3  1  1  3  4  3  3  3  5  6  2  4  6  3  1  1  3  6  3  3  4  5  6
## [126]  6  6  2  6  3  1  4  3  6  3  3  5  6  6  6  4  6  3  4  1  3  4  7  4  2
## [151]  5  4  1  6  3

二 Descriptive statistical analysis

2.1 Histogram lymph

par(mfrow=c(1,2))
target1=data2$中央区清扫淋巴结总数
target2=data3$中央区清扫淋巴结总数
hist(target1,main='开放手术',breaks=14,xlab='',ylab='')
abline(v=mean(target1),lwd=1,col='red')
text(x=4.5,y=45,round(mean(target1),3))
hist(target2,main='腔镜手术',breaks=14,xlab='',ylab='')
abline(v=mean(target2),lwd=1,col='red')
text(x=5.5,y=34,round(mean(target2),3))

2.2 Sex

data4=data.frame(data2$`性别(女1,男2)`,data2$年龄,data2$住院天数d)
data5=data.frame(data3$`性别(女1,男2)`,data3$年龄,data3$住院天数d)
data4<-data.frame(Sample<-c('Woman','Man'), 
                  sex=c('Woman','Man'),
                  value<-c(table(data2$`性别(女1,男2)`)[1]/length(data2$`性别(女1,男2)`),
                           table(data2$`性别(女1,男2)`)[2]/length(data2$`性别(女1,男2)`)))
p1=ggplot(data4,mapping = aes(Sample,value,fill=sex))+
  geom_bar(stat='identity',position='dodge') +
  geom_text(aes(label=paste0(round(value*length(data2$`性别(女1,男2)`)),' ','(',round(value,3),')'),y=value+0.01), position=position_dodge(0.9), vjust=0,family='serif')+
  labs(x = '',y = '',family='serif') +
  theme(axis.title =element_text(size = 12),
        axis.text =element_text(size = 12, color = 'black'))+
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
  theme(text=element_text(family='serif'))+
  ggtitle('开放手术')+
  guides(fill=F)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
data5<-data.frame(Sample1<-c('Woman','Man'), 
                  sex=c('Woman','Man'),
                  value1<-c(table(data3$`性别(女1,男2)`)[1]/length(data3$`性别(女1,男2)`),
                           table(data3$`性别(女1,男2)`)[2]/length(data3$`性别(女1,男2)`)))

p2=ggplot(data5,mapping = aes(Sample1,value1,fill=sex))+
  geom_bar(stat='identity',position='dodge') +
  geom_text(aes(label=paste0(round(value1*length(data3$`性别(女1,男2)`)),' ','(',round(value1,3),')'),y=value1+0.01), 
            position=position_dodge(0.9), vjust=0,family='serif')+
  labs(x = '',y = '',family='serif') +
  theme(axis.title =element_text(size = 12),
        axis.text =element_text(size = 12, color = 'black'))+
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
  theme(text=element_text(family='serif'))+
  ggtitle('腔镜手术')+
  guides(fill=F)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
library(ggpubr)
ggarrange(p1,p2,labels=c('A','B'),ncol=2,nrow=1)

2.2.1 Chi-square test

chisq.test(matrix(c(143,12,116,11),nrow=2))#p-value = 0.9505
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  matrix(c(143, 12, 116, 11), nrow = 2)
## X-squared = 0.0038477, df = 1, p-value = 0.9505

2.3 Age

p3=ggplot(data=data2)+
  geom_boxplot(aes(x=年龄,y=年龄))+labs(x='开放手术',y='Age')+
  theme(axis.title =element_text(size = 12),
        axis.text =element_text(size = 12, color = 'black'))+
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
  theme(text=element_text(family='serif'))

p4=ggplot(data=data3)+
  geom_boxplot(aes(x=年龄,y=年龄))+
  labs(x='腔镜手术',y='Age')+
  theme(axis.title =element_text(size = 12),
        axis.text =element_text(size = 12, color = 'black'))+
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
  theme(text=element_text(family='serif'))

ggarrange(p3,p4,labels=c('A','B'),ncol=2,nrow=1)
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

2.3.1 T-test

t.test(data1$年龄,data3$年龄)#p-value < 2.2e-16
## 
##  Welch Two Sample t-test
## 
## data:  data1$年龄 and data3$年龄
## t = -13.14, df = 286.65, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -17.13528 -12.67059
## sample estimates:
## mean of x mean of y 
##  34.71642  49.61935

2.4 Linear regression

Note:classification index setting

指标 0 1
手术类型 开放手术 腔镜手术
指标 1 2
性别

Result:

Index Rsquare r
age 0.018 0.134
sex 0.025 0.158
operation 0.063 0.251
days in hospital 不显著
age=c(data2$年龄,data3$年龄)
sex=c(data2$`性别(女1,男2)`,data3$`性别(女1,男2)`)
day=c(data2$住院天数d,data3$住院天数d)
lymph=c(data2$中央区清扫淋巴结总数,data3$中央区清扫淋巴结总数)
operation=c(rep(0,127),rep(1,155))
da=data.frame(age,sex,day,operation,lymph)
for(i in 1:4){
  print(colnames(da)[i])
  print('#############################')
  print(summary(lm(da$lymph~da[,i])))
}
## [1] "age"
## [1] "#############################"
## 
## Call:
## lm(formula = da$lymph ~ da[, i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6076 -1.9404 -0.2905  1.4387 11.1587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.15438    0.51608   4.175 3.99e-05 ***
## da[, i]      0.02642    0.01155   2.288   0.0229 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.349 on 280 degrees of freedom
## Multiple R-squared:  0.01835,    Adjusted R-squared:  0.01484 
## F-statistic: 5.233 on 1 and 280 DF,  p-value: 0.0229
## 
## [1] "sex"
## [1] "#############################"
## 
## Call:
## lm(formula = da$lymph ~ da[, i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4015 -2.0435 -0.4015  1.5985 10.5985 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.7596     0.5684   8.374 2.71e-15 ***
## da[, i]      -1.3581     0.5095  -2.666  0.00813 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.342 on 280 degrees of freedom
## Multiple R-squared:  0.02475,    Adjusted R-squared:  0.02127 
## F-statistic: 7.106 on 1 and 280 DF,  p-value: 0.00813
## 
## [1] "day"
## [1] "#############################"
## 
## Call:
## lm(formula = da$lymph ~ da[, i])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.366 -2.225 -0.323  1.677 10.699 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.49676    0.55878   6.258 1.46e-09 ***
## da[, i]     -0.02172    0.05702  -0.381    0.704    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.37 on 280 degrees of freedom
## Multiple R-squared:  0.0005181,  Adjusted R-squared:  -0.003051 
## F-statistic: 0.1452 on 1 and 280 DF,  p-value: 0.7035
## 
## [1] "operation"
## [1] "#############################"
## 
## Call:
## lm(formula = da$lymph ~ da[, i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8258 -1.6378 -0.6378  1.3622 11.3622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6378     0.2037  12.949  < 2e-16 ***
## da[, i]       1.1880     0.2748   4.324 2.14e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.296 on 280 degrees of freedom
## Multiple R-squared:  0.06259,    Adjusted R-squared:  0.05924 
## F-statistic: 18.69 on 1 and 280 DF,  p-value: 2.136e-05

通过简单线性回归结果可以知道,sex,age,operation对淋巴清扫总数具有显著性影响。其中以手术类型Rsquare最大,影响关系最强,但是Rsquare只有0.062。性别回归系数为负,说明男性的病人淋巴清扫总数偏少,年龄和手术类型回归系数为正,说明年纪大和腔镜手术的病人淋巴清扫总数偏多,将以上指标运用多元线性模型进一步分析。

2.5 Multi-linear regression

pre=lm(lymph~age+sex+operation,data=da)
summary(pre)
## 
## Call:
## lm(formula = lymph ~ age + sex + operation, data = da)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1600 -1.6693 -0.2676  1.2680 11.1612 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.441814   0.800182   5.551 6.63e-08 ***
## age         -0.009291   0.014106  -0.659 0.510657    
## sex         -1.361419   0.498194  -2.733 0.006684 ** 
## operation    1.311837   0.341566   3.841 0.000152 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.273 on 278 degrees of freedom
## Multiple R-squared:  0.08747,    Adjusted R-squared:  0.07762 
## F-statistic: 8.882 on 3 and 278 DF,  p-value: 1.219e-05

通过多元统计分析结果可以看出,性别和手术类型对淋巴清扫总数具有显著性影响,其中手术类型影响最大,但总Rsquare仅为0.087。

2.6 Step regression

通过逐步回归进一步寻求最佳模型

step(pre)
## Start:  AIC=467.11
## lymph ~ age + sex + operation
## 
##             Df Sum of Sq    RSS    AIC
## - age        1     2.242 1438.7 465.55
## <none>                   1436.5 467.11
## - sex        1    38.587 1475.0 472.58
## - operation  1    76.219 1512.7 479.69
## 
## Step:  AIC=465.55
## lymph ~ sex + operation
## 
##             Df Sum of Sq    RSS    AIC
## <none>                   1438.7 465.55
## - sex        1    36.927 1475.6 470.69
## - operation  1    96.487 1535.2 481.85
## 
## Call:
## lm(formula = lymph ~ sex + operation, data = da)
## 
## Coefficients:
## (Intercept)          sex    operation  
##       4.075       -1.322        1.176

从逐步回归结果可以看出,模型lymph ~ sex + operation为最佳模型,与多元统计分析结果相同。

pre2=lm(lymph~sex+operation,data=da)
summary(pre2)
## 
## Call:
## lm(formula = lymph ~ sex + operation, data = da)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9282 -1.7523 -0.1791  1.2477 11.2477 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.0747     0.5735   7.105 1.01e-11 ***
## sex          -1.3223     0.4941  -2.676  0.00789 ** 
## operation     1.1759     0.2718   4.326 2.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.271 on 279 degrees of freedom
## Multiple R-squared:  0.08604,    Adjusted R-squared:  0.07949 
## F-statistic: 13.13 on 2 and 279 DF,  p-value: 3.54e-06

三 Distribution test

3.1 Normal distribution test

ks.test(target1,"pnorm",
        
        alternative = c("two.sided", "less", "greater"),
        
        exact = NULL)
## Warning in ks.test(target1, "pnorm", alternative = c("two.sided", "less", : ties
## should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  target1
## D = 0.63662, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(target2,"pnorm",
        
        alternative = c("two.sided", "less", "greater"),
        
        exact = NULL)
## Warning in ks.test(target2, "pnorm", alternative = c("two.sided", "less", : ties
## should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  target2
## D = 0.83489, p-value < 2.2e-16
## alternative hypothesis: two-sided

3.2 QQ plot for normal distribution test

par(mfcol=c(1,2))
qqnorm(target1,col="green",main="Normal Q-Q Plot-y1")
qqline(target1,col="green")
qqnorm(target2,col="red",main="Normal Q-Q Plot-y2")
qqline(target2,col="red")

3.3 Poisson distribution test

ks.test(target1,"pgamma",1)
## Warning in ks.test(target1, "pgamma", 1): ties should not be present for the
## Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  target1
## D = 0.46309, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(target2,"pgamma",1)
## Warning in ks.test(target2, "pgamma", 1): ties should not be present for the
## Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  target2
## D = 0.7115, p-value < 2.2e-16
## alternative hypothesis: two-sided

四 Sample size calculation—assuming normality

4.1 Preliminary analysis of “target1” and “target2”

组别 开放手术 腔镜手术
样本量 127 155
均值 2.64 3.83
方差 6.66 4.12
标准差 2.58 2.03

4.2 Sample size

Set: \(\delta\)非劣性临界值

4.2.1 Test for Equality

Set: \(\delta=0\)

两样本样本量计算公式:

\[n_1=\frac{(z_{\alpha/2}+z_{\beta})^2\sigma^2(1+1/k)}{(\mu_1-\mu_2)^2}\]

\[n_2=kn_1\]

Note: When \(\sigma^2\) is unknown,it can replace by \(s^2\) :

\[s^2=\frac{(n_1-1)s_1^2+(n_2-1)s_2^2}{(n_1+n_2-2)}\]

where \(s_1^2\),\(s_2^2\) are the variances of two examples.

4.2.2 Test for Non-Inferority/Superiority

Set: \(\delta\not=0\)

两样本样本量计算公式:

\[n_1=\frac{(z_{\alpha}+z_{\beta})^2\sigma^2(1+1/k)}{(\mu_1-\mu_2-\delta)^2}\]

\[n_2=kn_1\]

Note: When \(\sigma^2\) is unknown,it can replace by \(s^2\) :

\[s^2=\frac{(n_1-1)s_1^2+(n_2-1)s_2^2}{(n_1+n_2-2)}\]

where \(s_1^2\),\(s_2^2\) are the variances of two examples.


计算样本量前提设定:\(\alpha=0.05\); \(1-\beta=0.8\); \(\delta=0.12\)

组别 开放手术 腔镜手术
等价性检验 202 247
非劣性检验 247 247