test_design

k;x

2021/12/3

一 Data preprocessing

1.1 Load data

## import dataset
library(readxl)
library(ggplot2)
path1 = "C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/腔镜手术150例.xlsx"
path2='C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/开放手术133例.xlsx'
data1 <- readxl::read_excel(path1, sheet = 1)
data2 <- readxl::read_excel(path2, sheet = 1)

1.2 Data clean

开放手术

data1$中央区清扫淋巴结总数
##   [1]  4  1  3  1  1  3  1  3  3  4  2  5  8  2  4  7  4  2  5  4  3  5  6  4  6
##  [26]  3  2  6 12  5  5  6  2  3  3  4  4  5  7  2  4  7  4  2  5  4  1  5  6  6
##  [51]  3  6  3  2  6 12  5  5  4  1  5  6  6  3  6  3  1  1  3  1  3  3  1  5  6
##  [76]  6  0  6  3  1  1  3  3  6  3  1  4  3  1  3  3  1  5  1  5  6  6  2  6  3
## [101]  4  1  3  1  1  3  4  3  3  3  5  6  2  4  6  3  1  1  3  6  3  3  4  5  6
## [126]  6  6  2  6  3  1  4  3  6  3  3  5  6  6  6  4  6  3  4  1  3  4  7  4  2
## [151]  5  4  1  6  3

腔镜手术

data2$中央区清扫淋巴结总数
##   [1] "5"       "1"       "1"       "0"       "4"       "5"       "4"      
##   [8] "2"       "未 清扫" "2"       "未 清扫" "2"       "7"       "3"      
##  [15] "0"       "6"       "未 清扫" "0"       "2"       "1"       "4"      
##  [22] "5"       "4"       "5"       "2"       "1"       "3"       "2"      
##  [29] "6"       "1"       "0"       "0"       "5"       "11"      "3"      
##  [36] "0"       "1"       "1"       "0"       "4"       "0"       "3"      
##  [43] "0"       "未 清扫" "1"       "1"       "14"      "2"       "2"      
##  [50] "2"       "4"       "3"       "0"       "1"       "3"       "1"      
##  [57] "0"       "2"       "0"       "2"       "3"       "1"       "5"      
##  [64] "1"       "3"       "10"      "7"       "4"       "0"       "3"      
##  [71] "5"       "7"       "2"       "2"       "1"       "5"       "6"      
##  [78] "1"       "2"       "2"       "0"       "2"       "6"       "1"      
##  [85] "4"       "1"       "3"       "1"       "3"       "9"       "0"      
##  [92] "3"       "3"       "0"       "0"       "1"       "0"       "2"      
##  [99] "3"       "2"       "1"       "3"       "2"       "2"       "1"      
## [106] "0"       "1"       "6"       "3"       "0"       "1"       "4"      
## [113] "0"       "1"       "2"       "4"       "0"       "3"       "4"      
## [120] "5"       "3"       "2"       "4"       "未 清扫" "6"       "未 清扫"
## [127] "0"       "11"      "7"       "未 清扫" "0"       "0"       "1"      
## [134] "0"
w=which(data2$中央区清扫淋巴结总数=="未 清扫")
data3=data2[-w,]
data3$中央区清扫淋巴结总数=as.numeric(data3$中央区清扫淋巴结总数)
data3$中央区清扫淋巴结总数
##   [1]  5  1  1  0  4  5  4  2  2  2  7  3  0  6  0  2  1  4  5  4  5  2  1  3  2
##  [26]  6  1  0  0  5 11  3  0  1  1  0  4  0  3  0  1  1 14  2  2  2  4  3  0  1
##  [51]  3  1  0  2  0  2  3  1  5  1  3 10  7  4  0  3  5  7  2  2  1  5  6  1  2
##  [76]  2  0  2  6  1  4  1  3  1  3  9  0  3  3  0  0  1  0  2  3  2  1  3  2  2
## [101]  1  0  1  6  3  0  1  4  0  1  2  4  0  3  4  5  3  2  4  6  0 11  7  0  0
## [126]  1  0

二 Descriptive statistical analysis

2.1 Histogram lymph

par(mfrow=c(1,2))
target1=data1$中央区清扫淋巴结总数
target2=data3$中央区清扫淋巴结总数
hist(target1,main='开放手术',breaks=14,xlab='',ylab='')
hist(target2,main='腔镜手术',breaks=14,xlab='',ylab='')

2.2 Sex

data4=data.frame(data1$`性别(女1,男2)`,data1$年龄,data1$住院天数d)
data5=data.frame(data3$`性别(女1,男2)`,data3$年龄,data3$住院天数d)
data4<-data.frame(Sample<-c('Woman','Man'), 
                  sex=c('Woman','Man'),
                  value<-c(table(data1$`性别(女1,男2)`)[1]/length(data1$`性别(女1,男2)`),
                           table(data1$`性别(女1,男2)`)[2]/length(data1$`性别(女1,男2)`)))
p1=ggplot(data4,mapping = aes(Sample,value,fill=sex))+
  geom_bar(stat='identity',position='dodge') +
  geom_text(aes(label=round(value,3),y=value+0.01), position=position_dodge(0.9), vjust=0,family='serif')+
  labs(x = '',y = '',family='serif') +
  theme(axis.title =element_text(size = 12),
        axis.text =element_text(size = 12, color = 'black'))+
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
  theme(text=element_text(family='serif'))+
  ggtitle('开放手术')+
  guides(fill=F)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
data5<-data.frame(Sample1<-c('Woman','Man'), 
                  sex=c('Woman','Man'),
                  value1<-c(table(data3$`性别(女1,男2)`)[1]/length(data3$`性别(女1,男2)`),
                           table(data3$`性别(女1,男2)`)[2]/length(data3$`性别(女1,男2)`)))

p2=ggplot(data5,mapping = aes(Sample1,value1,fill=sex))+
  geom_bar(stat='identity',position='dodge') +
  geom_text(aes(label=round(value1,3),y=value1+0.01), 
            position=position_dodge(0.9), vjust=0,family='serif')+
  labs(x = '',y = '',family='serif') +
  theme(axis.title =element_text(size = 12),
        axis.text =element_text(size = 12, color = 'black'))+
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
  theme(text=element_text(family='serif'))+
  ggtitle('腔镜手术')+
  guides(fill=F)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
library(ggpubr)
ggarrange(p1,p2,labels=c('A','B'),ncol=2,nrow=1)

2.2.1 Chi-square test

chisq.test(matrix(c(143,12,116,11),nrow=2))#p-value = 0.9505
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  matrix(c(143, 12, 116, 11), nrow = 2)
## X-squared = 0.0038477, df = 1, p-value = 0.9505

2.3 Age

p3=ggplot(data=data1)+
  geom_boxplot(aes(x=年龄,y=年龄))+labs(x='开放手术',y='Age')+
  theme(axis.title =element_text(size = 12),
        axis.text =element_text(size = 12, color = 'black'))+
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
  theme(text=element_text(family='serif'))

p4=ggplot(data=data3)+
  geom_boxplot(aes(x=年龄,y=年龄))+
  labs(x='腔镜手术',y='Age')+
  theme(axis.title =element_text(size = 12),
        axis.text =element_text(size = 12, color = 'black'))+
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
  theme(text=element_text(family='serif'))

ggarrange(p3,p4,labels=c('A','B'),ncol=2,nrow=1)
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

2.3.1 T-test

t.test(data1$年龄,data3$年龄)#p-value < 2.2e-16
## 
##  Welch Two Sample t-test
## 
## data:  data1$年龄 and data3$年龄
## t = 12.783, df = 277.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  12.41471 16.93423
## sample estimates:
## mean of x mean of y 
##  49.61935  34.94488

2.4 Linear regression

Note:classification index setting

指标 0 1
性别
手术类型 开放手术 腔镜手术

Result:

Index Rsquare r
age 0.018 0.134
sex 0.025 0.158
operation 0.063 0.251
days in hospital 不显著
age=c(data1$年龄,data3$年龄)
sex=c(data1$`性别(女1,男2)`,data3$`性别(女1,男2)`)
day=c(data1$住院天数d,data3$住院天数d)
lymph=c(data1$中央区清扫淋巴结总数,data3$中央区清扫淋巴结总数)
operation=c(rep(0,155),rep(1,127))
da=data.frame(age,sex,day,operation,lymph)
for(i in 1:4){
  print(colnames(da)[i])
  print('#############################')
  print(summary(lm(da$lymph~da[,i])))
}
## [1] "age"
## [1] "#############################"
## 
## Call:
## lm(formula = da$lymph ~ da[, i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6076 -1.9404 -0.2905  1.4387 11.1587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.15438    0.51608   4.175 3.99e-05 ***
## da[, i]      0.02642    0.01155   2.288   0.0229 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.349 on 280 degrees of freedom
## Multiple R-squared:  0.01835,    Adjusted R-squared:  0.01484 
## F-statistic: 5.233 on 1 and 280 DF,  p-value: 0.0229
## 
## [1] "sex"
## [1] "#############################"
## 
## Call:
## lm(formula = da$lymph ~ da[, i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4015 -2.0435 -0.4015  1.5985 10.5985 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.7596     0.5684   8.374 2.71e-15 ***
## da[, i]      -1.3581     0.5095  -2.666  0.00813 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.342 on 280 degrees of freedom
## Multiple R-squared:  0.02475,    Adjusted R-squared:  0.02127 
## F-statistic: 7.106 on 1 and 280 DF,  p-value: 0.00813
## 
## [1] "day"
## [1] "#############################"
## 
## Call:
## lm(formula = da$lymph ~ da[, i])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.366 -2.225 -0.323  1.677 10.699 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.49676    0.55878   6.258 1.46e-09 ***
## da[, i]     -0.02172    0.05702  -0.381    0.704    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.37 on 280 degrees of freedom
## Multiple R-squared:  0.0005181,  Adjusted R-squared:  -0.003051 
## F-statistic: 0.1452 on 1 and 280 DF,  p-value: 0.7035
## 
## [1] "operation"
## [1] "#############################"
## 
## Call:
## lm(formula = da$lymph ~ da[, i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8258 -1.6378 -0.6378  1.3622 11.3622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.8258     0.1844  20.748  < 2e-16 ***
## da[, i]      -1.1880     0.2748  -4.324 2.14e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.296 on 280 degrees of freedom
## Multiple R-squared:  0.06259,    Adjusted R-squared:  0.05924 
## F-statistic: 18.69 on 1 and 280 DF,  p-value: 2.136e-05

2.5 Multi-linear regression

sex operation 均显著

pre=lm(lymph~age+sex+operation,data=da)
summary(pre)
## 
## Call:
## lm(formula = lymph ~ age + sex + operation, data = da)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1600 -1.6693 -0.2676  1.2680 11.1612 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.753651   0.949135   6.062 4.37e-09 ***
## age         -0.009291   0.014106  -0.659 0.510657    
## sex         -1.361419   0.498194  -2.733 0.006684 ** 
## operation   -1.311837   0.341566  -3.841 0.000152 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.273 on 278 degrees of freedom
## Multiple R-squared:  0.08747,    Adjusted R-squared:  0.07762 
## F-statistic: 8.882 on 3 and 278 DF,  p-value: 1.219e-05

2.6 Step regression

step(pre)
## Start:  AIC=467.11
## lymph ~ age + sex + operation
## 
##             Df Sum of Sq    RSS    AIC
## - age        1     2.242 1438.7 465.55
## <none>                   1436.5 467.11
## - sex        1    38.587 1475.0 472.58
## - operation  1    76.219 1512.7 479.69
## 
## Step:  AIC=465.55
## lymph ~ sex + operation
## 
##             Df Sum of Sq    RSS    AIC
## <none>                   1438.7 465.55
## - sex        1    36.927 1475.6 470.69
## - operation  1    96.487 1535.2 481.85
## 
## Call:
## lm(formula = lymph ~ sex + operation, data = da)
## 
## Coefficients:
## (Intercept)          sex    operation  
##       5.251       -1.322       -1.176
pre2=lm(lymph~sex+operation,data=da)
summary(pre2)
## 
## Call:
## lm(formula = lymph ~ sex + operation, data = da)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9282 -1.7523 -0.1791  1.2477 11.2477 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.2505     0.5628   9.330  < 2e-16 ***
## sex          -1.3223     0.4941  -2.676  0.00789 ** 
## operation    -1.1759     0.2718  -4.326 2.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.271 on 279 degrees of freedom
## Multiple R-squared:  0.08604,    Adjusted R-squared:  0.07949 
## F-statistic: 13.13 on 2 and 279 DF,  p-value: 3.54e-06

三 Distribution test

3.1 Normal distribution test

ks.test(target1,"pnorm",
        
        alternative = c("two.sided", "less", "greater"),
        
        exact = NULL)
## Warning in ks.test(target1, "pnorm", alternative = c("two.sided", "less", : ties
## should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  target1
## D = 0.83489, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(target2,"pnorm",
        
        alternative = c("two.sided", "less", "greater"),
        
        exact = NULL)
## Warning in ks.test(target2, "pnorm", alternative = c("two.sided", "less", : ties
## should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  target2
## D = 0.63662, p-value < 2.2e-16
## alternative hypothesis: two-sided

3.2 QQ plot for normal distribution test

par(mfcol=c(1,2))
qqnorm(target1,col="green",main="Normal Q-Q Plot-y1")
qqline(target1,col="green")
qqnorm(target2,col="red",main="Normal Q-Q Plot-y2")
qqline(target2,col="red")

3.3 Poisson distribution test

ks.test(target1,"pgamma",1)
## Warning in ks.test(target1, "pgamma", 1): ties should not be present for the
## Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  target1
## D = 0.7115, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(target2,"pgamma",1)
## Warning in ks.test(target2, "pgamma", 1): ties should not be present for the
## Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  target2
## D = 0.46309, p-value < 2.2e-16
## alternative hypothesis: two-sided

四 Sample size calculation—assuming normality

4.1 Preliminary analysis of “target1” and “target2”

组别 开放手术 腔镜手术
样本量 127 155
均值 2.64 3.83
方差 6.66 4.12
标准差 2.58 2.03

4.2 Sample size

两样本单侧检验样本量计算公式:

\[n_1=\frac{(z_{1-\alpha}+z_{1-\beta})(\sigma_1^2+\sigma2^2/k)}{(\mu_1-\mu_2)^2}\]

\[n_2=kn_1\]

两样本双侧检验样本量计算公式:

\[n_1=\frac{(z_{1-\alpha/2}+z_{1-\beta})(\sigma_1^2+\sigma2^2/k)}{(\mu_1-\mu_2)^2}\]

\[n_2=kn_1\]


计算样本量前提设定:\(\alpha=0.05\);\(1-\beta=0.8\)

组别 开放手术 腔镜手术
单侧检验 44 54
双侧检验 56 69