一 Data preprocessing
1.1 Load data
## import dataset
library(readxl)
library(ggplot2)
path1 = "C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/开放手术133例.xlsx"
path2='C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/腔镜手术150例.xlsx'
data1 <- readxl::read_excel(path1, sheet = 1)
data3 <- readxl::read_excel(path2, sheet = 1)## New names:
## * `` -> ...12
## * `` -> ...13
1.2 Data clean
开放手术
## [1] "5" "1" "1" "0" "4" "5" "4"
## [8] "2" "未 清扫" "2" "未 清扫" "2" "7" "3"
## [15] "0" "6" "未 清扫" "0" "2" "1" "4"
## [22] "5" "4" "5" "2" "1" "3" "2"
## [29] "6" "1" "0" "0" "5" "11" "3"
## [36] "0" "1" "1" "0" "4" "0" "3"
## [43] "0" "未 清扫" "1" "1" "14" "2" "2"
## [50] "2" "4" "3" "0" "1" "3" "1"
## [57] "0" "2" "0" "2" "3" "1" "5"
## [64] "1" "3" "10" "7" "4" "0" "3"
## [71] "5" "7" "2" "2" "1" "5" "6"
## [78] "1" "2" "2" "0" "2" "6" "1"
## [85] "4" "1" "3" "1" "3" "9" "0"
## [92] "3" "3" "0" "0" "1" "0" "2"
## [99] "3" "2" "1" "3" "2" "2" "1"
## [106] "0" "1" "6" "3" "0" "1" "4"
## [113] "0" "1" "2" "4" "0" "3" "4"
## [120] "5" "3" "2" "4" "未 清扫" "6" "未 清扫"
## [127] "0" "11" "7" "未 清扫" "0" "0" "1"
## [134] "0"
w=which(data1$中央区清扫淋巴结总数=="未 清扫")
data2=data1[-w,]
data2$中央区清扫淋巴结总数=as.numeric(data2$中央区清扫淋巴结总数)
data2$中央区清扫淋巴结总数## [1] 5 1 1 0 4 5 4 2 2 2 7 3 0 6 0 2 1 4 5 4 5 2 1 3 2
## [26] 6 1 0 0 5 11 3 0 1 1 0 4 0 3 0 1 1 14 2 2 2 4 3 0 1
## [51] 3 1 0 2 0 2 3 1 5 1 3 10 7 4 0 3 5 7 2 2 1 5 6 1 2
## [76] 2 0 2 6 1 4 1 3 1 3 9 0 3 3 0 0 1 0 2 3 2 1 3 2 2
## [101] 1 0 1 6 3 0 1 4 0 1 2 4 0 3 4 5 3 2 4 6 0 11 7 0 0
## [126] 1 0
腔镜手术
## [1] 4 1 3 1 1 3 1 3 3 4 2 5 8 2 4 7 4 2 5 4 3 5 6 4 6
## [26] 3 2 6 12 5 5 6 2 3 3 4 4 5 7 2 4 7 4 2 5 4 1 5 6 6
## [51] 3 6 3 2 6 12 5 5 4 1 5 6 6 3 6 3 1 1 3 1 3 3 1 5 6
## [76] 6 0 6 3 1 1 3 3 6 3 1 4 3 1 3 3 1 5 1 5 6 6 2 6 3
## [101] 4 1 3 1 1 3 4 3 3 3 5 6 2 4 6 3 1 1 3 6 3 3 4 5 6
## [126] 6 6 2 6 3 1 4 3 6 3 3 5 6 6 6 4 6 3 4 1 3 4 7 4 2
## [151] 5 4 1 6 3
二 Descriptive statistical analysis
2.1 Histogram lymph
par(mfrow=c(1,2))
target1=data2$中央区清扫淋巴结总数
target2=data3$中央区清扫淋巴结总数
hist(target1,main='开放手术',breaks=14,xlab='',ylab='')
abline(v=mean(target1),lwd=1,col='red')
text(x=4.5,y=45,round(mean(target1),3))
hist(target2,main='腔镜手术',breaks=14,xlab='',ylab='')
abline(v=mean(target2),lwd=1,col='red')
text(x=5.5,y=34,round(mean(target2),3))2.2 Sex
data4=data.frame(data2$`性别(女1,男2)`,data2$年龄,data2$住院天数d)
data5=data.frame(data3$`性别(女1,男2)`,data3$年龄,data3$住院天数d)
data4<-data.frame(Sample<-c('Woman','Man'),
sex=c('Woman','Man'),
value<-c(table(data2$`性别(女1,男2)`)[1]/length(data2$`性别(女1,男2)`),
table(data2$`性别(女1,男2)`)[2]/length(data2$`性别(女1,男2)`)))
p1=ggplot(data4,mapping = aes(Sample,value,fill=sex))+
geom_bar(stat='identity',position='dodge') +
geom_text(aes(label=paste0(round(value*length(data2$`性别(女1,男2)`)),' ','(',round(value,3),')'),y=value+0.01), position=position_dodge(0.9), vjust=0,family='serif')+
labs(x = '',y = '',family='serif') +
theme(axis.title =element_text(size = 12),
axis.text =element_text(size = 12, color = 'black'))+
theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
theme(text=element_text(family='serif'))+
ggtitle('开放手术')+
guides(fill=F)## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
data5<-data.frame(Sample1<-c('Woman','Man'),
sex=c('Woman','Man'),
value1<-c(table(data3$`性别(女1,男2)`)[1]/length(data3$`性别(女1,男2)`),
table(data3$`性别(女1,男2)`)[2]/length(data3$`性别(女1,男2)`)))
p2=ggplot(data5,mapping = aes(Sample1,value1,fill=sex))+
geom_bar(stat='identity',position='dodge') +
geom_text(aes(label=paste0(round(value1*length(data3$`性别(女1,男2)`)),' ','(',round(value1,3),')'),y=value1+0.01),
position=position_dodge(0.9), vjust=0,family='serif')+
labs(x = '',y = '',family='serif') +
theme(axis.title =element_text(size = 12),
axis.text =element_text(size = 12, color = 'black'))+
theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
theme(text=element_text(family='serif'))+
ggtitle('腔镜手术')+
guides(fill=F)## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
2.3 Age
p3=ggplot(data=data2)+
geom_boxplot(aes(x=年龄,y=年龄))+labs(x='开放手术',y='Age')+
theme(axis.title =element_text(size = 12),
axis.text =element_text(size = 12, color = 'black'))+
theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
theme(text=element_text(family='serif'))
p4=ggplot(data=data3)+
geom_boxplot(aes(x=年龄,y=年龄))+
labs(x='腔镜手术',y='Age')+
theme(axis.title =element_text(size = 12),
axis.text =element_text(size = 12, color = 'black'))+
theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
theme(text=element_text(family='serif'))
ggarrange(p3,p4,labels=c('A','B'),ncol=2,nrow=1)## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
2.3.1 T-test
##
## Welch Two Sample t-test
##
## data: data1$年龄 and data3$年龄
## t = -13.14, df = 286.65, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -17.13528 -12.67059
## sample estimates:
## mean of x mean of y
## 34.71642 49.61935
2.4 Linear regression
Note:classification index setting
| 指标 | 0 | 1 |
|---|---|---|
| 手术类型 | 开放手术 | 腔镜手术 |
| 指标 | 1 | 2 |
|---|---|---|
| 性别 | 女 | 男 |
Result:
| Index | Rsquare | r |
|---|---|---|
| age | 0.018 | 0.134 |
| sex | 0.025 | 0.158 |
| operation | 0.063 | 0.251 |
| days in hospital | 不显著 |
age=c(data2$年龄,data3$年龄)
sex=c(data2$`性别(女1,男2)`,data3$`性别(女1,男2)`)
day=c(data2$住院天数d,data3$住院天数d)
lymph=c(data2$中央区清扫淋巴结总数,data3$中央区清扫淋巴结总数)
operation=c(rep(0,127),rep(1,155))
da=data.frame(age,sex,day,operation,lymph)
for(i in 1:4){
print(colnames(da)[i])
print('#############################')
print(summary(lm(da$lymph~da[,i])))
}## [1] "age"
## [1] "#############################"
##
## Call:
## lm(formula = da$lymph ~ da[, i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6076 -1.9404 -0.2905 1.4387 11.1587
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.15438 0.51608 4.175 3.99e-05 ***
## da[, i] 0.02642 0.01155 2.288 0.0229 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.349 on 280 degrees of freedom
## Multiple R-squared: 0.01835, Adjusted R-squared: 0.01484
## F-statistic: 5.233 on 1 and 280 DF, p-value: 0.0229
##
## [1] "sex"
## [1] "#############################"
##
## Call:
## lm(formula = da$lymph ~ da[, i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4015 -2.0435 -0.4015 1.5985 10.5985
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7596 0.5684 8.374 2.71e-15 ***
## da[, i] -1.3581 0.5095 -2.666 0.00813 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.342 on 280 degrees of freedom
## Multiple R-squared: 0.02475, Adjusted R-squared: 0.02127
## F-statistic: 7.106 on 1 and 280 DF, p-value: 0.00813
##
## [1] "day"
## [1] "#############################"
##
## Call:
## lm(formula = da$lymph ~ da[, i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.366 -2.225 -0.323 1.677 10.699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.49676 0.55878 6.258 1.46e-09 ***
## da[, i] -0.02172 0.05702 -0.381 0.704
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.37 on 280 degrees of freedom
## Multiple R-squared: 0.0005181, Adjusted R-squared: -0.003051
## F-statistic: 0.1452 on 1 and 280 DF, p-value: 0.7035
##
## [1] "operation"
## [1] "#############################"
##
## Call:
## lm(formula = da$lymph ~ da[, i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8258 -1.6378 -0.6378 1.3622 11.3622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6378 0.2037 12.949 < 2e-16 ***
## da[, i] 1.1880 0.2748 4.324 2.14e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.296 on 280 degrees of freedom
## Multiple R-squared: 0.06259, Adjusted R-squared: 0.05924
## F-statistic: 18.69 on 1 and 280 DF, p-value: 2.136e-05
通过简单线性回归结果可以知道,sex,age,operation对淋巴清扫总数具有显著性影响。其中以手术类型Rsquare最大,影响关系最强,但是Rsquare只有0.062。性别回归系数为负,说明男性的病人淋巴清扫总数偏少,年龄和手术类型回归系数为正,说明年纪大和腔镜手术的病人淋巴清扫总数偏多,将以上指标运用多元线性模型进一步分析。
2.5 Multi-linear regression
##
## Call:
## lm(formula = lymph ~ age + sex + operation, data = da)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1600 -1.6693 -0.2676 1.2680 11.1612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.441814 0.800182 5.551 6.63e-08 ***
## age -0.009291 0.014106 -0.659 0.510657
## sex -1.361419 0.498194 -2.733 0.006684 **
## operation 1.311837 0.341566 3.841 0.000152 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.273 on 278 degrees of freedom
## Multiple R-squared: 0.08747, Adjusted R-squared: 0.07762
## F-statistic: 8.882 on 3 and 278 DF, p-value: 1.219e-05
通过多元统计分析结果可以看出,性别和手术类型对淋巴清扫总数具有显著性影响,其中手术类型影响最大,但总Rsquare仅为0.087。
2.6 Step regression
通过逐步回归进一步寻求最佳模型
## Start: AIC=467.11
## lymph ~ age + sex + operation
##
## Df Sum of Sq RSS AIC
## - age 1 2.242 1438.7 465.55
## <none> 1436.5 467.11
## - sex 1 38.587 1475.0 472.58
## - operation 1 76.219 1512.7 479.69
##
## Step: AIC=465.55
## lymph ~ sex + operation
##
## Df Sum of Sq RSS AIC
## <none> 1438.7 465.55
## - sex 1 36.927 1475.6 470.69
## - operation 1 96.487 1535.2 481.85
##
## Call:
## lm(formula = lymph ~ sex + operation, data = da)
##
## Coefficients:
## (Intercept) sex operation
## 4.075 -1.322 1.176
从逐步回归结果可以看出,模型lymph ~ sex + operation为最佳模型,与多元统计分析结果相同。
##
## Call:
## lm(formula = lymph ~ sex + operation, data = da)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9282 -1.7523 -0.1791 1.2477 11.2477
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0747 0.5735 7.105 1.01e-11 ***
## sex -1.3223 0.4941 -2.676 0.00789 **
## operation 1.1759 0.2718 4.326 2.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.271 on 279 degrees of freedom
## Multiple R-squared: 0.08604, Adjusted R-squared: 0.07949
## F-statistic: 13.13 on 2 and 279 DF, p-value: 3.54e-06
三 Distribution test
3.1 Normal distribution test
## Warning in ks.test(target1, "pnorm", alternative = c("two.sided", "less", : ties
## should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: target1
## D = 0.63662, p-value < 2.2e-16
## alternative hypothesis: two-sided
## Warning in ks.test(target2, "pnorm", alternative = c("two.sided", "less", : ties
## should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: target2
## D = 0.83489, p-value < 2.2e-16
## alternative hypothesis: two-sided
3.2 QQ plot for normal distribution test
par(mfcol=c(1,2))
qqnorm(target1,col="green",main="Normal Q-Q Plot-y1")
qqline(target1,col="green")
qqnorm(target2,col="red",main="Normal Q-Q Plot-y2")
qqline(target2,col="red")3.3 Poisson distribution test
## Warning in ks.test(target1, "pgamma", 1): ties should not be present for the
## Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: target1
## D = 0.46309, p-value < 2.2e-16
## alternative hypothesis: two-sided
## Warning in ks.test(target2, "pgamma", 1): ties should not be present for the
## Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: target2
## D = 0.7115, p-value < 2.2e-16
## alternative hypothesis: two-sided
四 Sample size calculation—assuming normality
4.1 Preliminary analysis of “target1” and “target2”
| 组别 | 开放手术 | 腔镜手术 |
|---|---|---|
| 样本量 | 127 | 155 |
| 均值 | 2.64 | 3.83 |
| 方差 | 6.66 | 4.12 |
| 标准差 | 2.58 | 2.03 |
4.2 Sample size
Set: \(\delta\)非劣性临界值
4.2.1 Test for Equality
Set: \(\delta=0\)
两样本样本量计算公式:
\[n_1=\frac{(z_{\alpha/2}+z_{\beta})^2\sigma^2(1+1/k)}{(\mu_1-\mu_2)^2}\]
\[n_2=kn_1\]
Note: When \(\sigma^2\) is unknown,it can replace by \(s^2\) :
\[s^2=\frac{(n_1-1)s_1^2+(n_2-1)s_2^2}{(n_1+n_2-2)}\]
where \(s_1^2\),\(s_2^2\) are the variances of two examples.
4.2.2 Test for Non-Inferority/Superiority
Set: \(\delta\not=0\)
两样本样本量计算公式:
\[n_1=\frac{(z_{\alpha}+z_{\beta})^2\sigma^2(1+1/k)}{(\mu_1-\mu_2-\delta)^2}\]
\[n_2=kn_1\]
Note: When \(\sigma^2\) is unknown,it can replace by \(s^2\) :
\[s^2=\frac{(n_1-1)s_1^2+(n_2-1)s_2^2}{(n_1+n_2-2)}\]
where \(s_1^2\),\(s_2^2\) are the variances of two examples.
计算样本量前提设定:\(\alpha=0.05\); \(1-\beta=0.8\); \(\delta=0.12\)
| 组别 | 开放手术 | 腔镜手术 |
|---|---|---|
| 等价性检验 | 202 | 247 |
| 非劣性检验 | 247 | 247 |