一 Data
1.1 Load data
library(readxl)
library(ggplot2)
# Source workbooks. According to the original sheet annotations, path1 holds
# the new data and path2 holds the old data.
path1 <- "C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/腔镜开放甲状腺癌.xlsx"
path2 <- "C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/腔镜开放甲状腺癌-郑组2.xlsx"

# New data: sheet 1 = open surgery, sheet 2 = endoscopic surgery.
data1 <- readxl::read_excel(path = path1, sheet = 1)
data3 <- readxl::read_excel(path = path1, sheet = 2)

# Old data: sheet 2 = open surgery, sheet 1 = endoscopic surgery.
data2 <- readxl::read_excel(path = path2, sheet = 2)
data4 <- readxl::read_excel(path = path2, sheet = 1)
1.2 Data comparison
两份数据开放手术的重复数
# Find the patients of the old open-surgery sheet (data2) that also appear in
# the new open-surgery sheet (data1), matching on the patient name (病人姓名).
#
#   count : number of matching (data1 row, data2 row) pairs.
#   dat   : one row per data2 row, holding the data1 record of the matched
#           patient, or NA in every column when there is no match.
#
# Compared with the earlier nested loop, `dat` is sized from data2 instead of
# a hard-coded 140 rows, empty sheets are handled, a missing name no longer
# aborts the comparison (`if (NA)` is an error), and the lookup is vectorised.
old_names <- data2$病人姓名
new_names <- data1$病人姓名

# Number of data1 rows sharing each data2 name; NA names never match.
hits_per_old <- vapply(
  old_names,
  function(nm) sum(new_names == nm, na.rm = TRUE),
  integer(1),
  USE.NAMES = FALSE
)
count <- sum(hits_per_old)

# The loop overwrote earlier hits, so the LAST matching data1 row wins.
# Searching the reversed vector and mirroring the index reproduces that.
last_hit <- length(new_names) + 1L -
  match(old_names, rev(new_names), incomparables = NA)

dat <- data.frame(
  "病案号" = data1$病案号[last_hit],
  "病人姓名" = data1$病人姓名[last_hit],
  "性别" = data1$性别[last_hit],
  "年龄" = data1$年龄[last_hit],
  "肿瘤大小" = data1$肿瘤大小[last_hit],
  "淋巴结清扫数目" = data1$淋巴结清扫数目[last_hit],
  "转移数目" = data1$转移数目[last_hit],
  stringsAsFactors = FALSE
)
count
## [1] 140
重复数据示例
## 病案号 性别 年龄 肿瘤大小 淋巴结清扫数目 转移数目
## 1 90771743 男 18 6 3 3
## 2 90783934 男 25 5 1 0
## 3 90806059 男 27 18 2 2
## 4 90767374 男 27 6 8 4
## 5 91120807 男 27 7 1 0
## 6 90698622 男 28 12 6 5
## # A tibble: 6 x 6
## 病案号 `性别(女1男2)` 年龄 肿瘤大小 淋巴结清扫数目 转移数目
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 90771743 2 18 6 3 3
## 2 90783934 2 25 5 1 0
## 3 90806059 2 27 18 2 2
## 4 90767374 2 27 6 8 4
## 5 91120807 2 27 7 1 0
## 6 90698622 2 28 12 6 5
两份数据腔镜手术的重复数
# Find the patients of the old endoscopic-surgery sheet (data4) that also
# appear in the new endoscopic-surgery sheet (data3), matching on the patient
# name (病人姓名).
#
#   count2 : number of matching (data3 row, data4 row) pairs.
#   dat2   : one row per data4 row, holding the data3 record of the matched
#            patient, or NA in every column when there is no match.
#
# Compared with the earlier nested loop, `dat2` is sized from data4 instead of
# a hard-coded 140 rows, empty sheets are handled, a missing name no longer
# aborts the comparison (`if (NA)` is an error), and the lookup is vectorised.
old_names2 <- data4$病人姓名
new_names2 <- data3$病人姓名

# Number of data3 rows sharing each data4 name; NA names never match.
hits_per_old2 <- vapply(
  old_names2,
  function(nm) sum(new_names2 == nm, na.rm = TRUE),
  integer(1),
  USE.NAMES = FALSE
)
count2 <- sum(hits_per_old2)

# The loop overwrote earlier hits, so the LAST matching data3 row wins.
# Searching the reversed vector and mirroring the index reproduces that.
last_hit2 <- length(new_names2) + 1L -
  match(old_names2, rev(new_names2), incomparables = NA)

dat2 <- data.frame(
  "病案号" = data3$病案号[last_hit2],
  "病人姓名" = data3$病人姓名[last_hit2],
  "性别" = data3$性别[last_hit2],
  "年龄" = data3$年龄[last_hit2],
  "肿瘤大小" = data3$肿瘤大小[last_hit2],
  "淋巴结清扫数目" = data3$淋巴结清扫数目[last_hit2],
  "转移数目" = data3$转移数目[last_hit2],
  stringsAsFactors = FALSE
)
count2
## [1] 140
重复数据示例
## 病案号 性别 年龄 肿瘤大小 淋巴结清扫数目 转移数目
## 1 90688679 男 23 6 4 1
## 2 91089533 男 24 6 10 0
## 3 90619613 男 24 8 6 0
## 4 90626144 男 25 25 2 0
## 5 91199032 男 25 11 1 0
## 6 20555206 男 26 7 4 3
## # A tibble: 6 x 6
## 病案号 `性别(女1男2)` 年龄 肿瘤大小 淋巴结清扫数目 转移数目
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 90688679 2 23 6 4 1
## 2 91089533 2 24 6 10 0
## 3 90619613 2 24 8 6 0
## 4 90626144 2 25 25 2 0
## 5 91199032 2 25 11 1 0
## 6 20555206 2 26 7 4 3
二 Descriptive statistical analysis
2.1 Sex comparison
性别指标解释
| 指标 | 1 | 2 |
|---|---|---|
| 性别 | 女 | 男 |
# NOTE(review): this silences every warning for the rest of the session;
# consider the chunk option `warning = FALSE` instead.
options(warn = -1)

# Summarise a sex column as a two-row data frame (Woman first, then Man).
#
# sex        : vector holding the sex of each patient.
# woman_code : value in `sex` that denotes a woman.
# man_code   : value in `sex` that denotes a man.
#
# Returns columns Sample and sex (both the labels "Woman"/"Man"), n (count)
# and value (count divided by the full column length, NAs included in the
# denominator as before). Counting by value rather than by position in
# `table()` keeps the result independent of how the locale sorts the labels.
sex_summary <- function(sex, woman_code, man_code) {
  n_by_sex <- c(
    sum(sex == woman_code, na.rm = TRUE),
    sum(sex == man_code, na.rm = TRUE)
  )
  data.frame(
    Sample = c("Woman", "Man"),
    sex = c("Woman", "Man"),
    n = n_by_sex,
    value = n_by_sex / length(sex),
    stringsAsFactors = FALSE
  )
}

# Bar chart of the proportion of each sex, labelled "count (proportion)".
#
# summary_df : data frame produced by sex_summary().
# title      : plot title (the surgery type).
sex_bar_plot <- function(summary_df, title) {
  ggplot(summary_df, mapping = aes(Sample, value, fill = sex)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_text(
      aes(
        label = paste0(n, " ", "(", round(value, 3), ")"),
        y = value + 0.01
      ),
      position = position_dodge(0.9), vjust = 0, family = "serif"
    ) +
    ylim(0, 1) +
    labs(x = "", y = "") +
    theme(
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 12, color = "black"),
      axis.text.x = element_text(angle = 0, hjust = 0.3),
      text = element_text(family = "serif"),
      plot.title = element_text(hjust = 0.5)
    ) +
    ggtitle(title) +
    # `guides(fill = F)` is deprecated; "none" is the supported spelling.
    guides(fill = "none")
}

# New data: 性别 is stored as text ("男" appears in the printed sample; "女"
# is assumed to be the matching label for women -- TODO confirm).
data5 <- sex_summary(data1$`性别`, woman_code = "女", man_code = "男")
p1 <- sex_bar_plot(data5, "开放手术")
data6 <- sex_summary(data3$`性别`, woman_code = "女", man_code = "男")
p2 <- sex_bar_plot(data6, "腔镜手术")

# Old data: 性别(女1男2) is coded 1 = woman, 2 = man.
data7 <- sex_summary(data2$`性别(女1男2)`, woman_code = 1, man_code = 2)
p3 <- sex_bar_plot(data7, "开放手术")
data8 <- sex_summary(data4$`性别(女1男2)`, woman_code = 1, man_code = 2)
p4 <- sex_bar_plot(data8, "腔镜手术")

library(ggpubr)
ggarrange(
  p3, p4, p1, p2,
  labels = c("Old_1", "Old_2", "New_1", "New_2"),
  ncol = 2, nrow = 2
)
假定以年龄45岁为分界线
| | Old_data | | New_data | |
|---|---|---|---|---|
| 性别(年龄) 手术类型 | 开放 | 腔镜 | 开放 | 腔镜 |
| 男(<45) | 38 | 32 | 54 | 43 |
| 男(>=45) | 0 | 6 | 64 | 6 |
| 女(<45) | 48 | 57 | 59 | 227 |
| 女(>=45) | 54 | 45 | 121 | 51 |
2.2 Fisher test and chi-square test between sex and type of surgery
理论基础:
卡方检验可以做两个及两个以上分类变量的显著性检验
Fisher精确检验是一种显著性检验,当样本量较小时,可以用它来代替2×2表的卡方检验
旧数据:
| | Old_data | |
|---|---|---|
| 性别 手术类型 | 开放 | 腔镜 |
| 男 | 38 | 38 |
| 女 | 102 | 102 |
##
## Pearson's Chi-squared test
##
## data: old_age
## X-squared = 0, df = 1, p-value = 1
##
## Fisher's Exact Test for Count Data
##
## data: old_age
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.5703536 1.7532982
## sample estimates:
## odds ratio
## 1
结果1:\(p_{\chi^2}=1\)、\(p_{f}=1\)均大于默认\(\alpha=0.05\),表明旧数据性别和手术类型之间没有显著性关系。
新数据:
| | New_data | |
|---|---|---|
| 性别 手术类型 | 开放 | 腔镜 |
| 男 | 116 | 49 |
| 女 | 180 | 278 |
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: new_age
## X-squared = 45.514, df = 1, p-value = 1.516e-11
##
## Fisher's Exact Test for Count Data
##
## data: new_age
## p-value = 9.168e-12
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 2.454679 5.480355
## sample estimates:
## odds ratio
## 3.648186
结果2:\(p_{\chi^2}=1.516\times10^{-11}\)、\(p_{f}=9.168\times10^{-12}\)均小于默认\(\alpha=0.05\),表明新数据性别和手术类型之间有显著性关系。
2.3 Age comparison
箱线图对比
# NOTE(review): this silences every warning for the rest of the session;
# consider the chunk option `warning = FALSE` instead.
options(warn = -1)

# Single box plot of patient age (column 年龄) for one surgery group.
#
# df            : data frame with a numeric 年龄 column.
# surgery_label : x-axis title naming the surgery type.
#
# Two changes from the earlier per-plot code:
#   * x is a constant, so one ordinary box is drawn; mapping the continuous
#     age to x as well made ggplot2 warn about a continuous x aesthetic.
#   * the 10-70 window is applied with coord_cartesian(), which only zooms;
#     ylim() removed ages outside the window BEFORE the quartiles were
#     computed, silently changing the box statistics.
age_boxplot <- function(df, surgery_label) {
  ggplot(data = df) +
    geom_boxplot(aes(x = "", y = 年龄)) +
    labs(x = surgery_label, y = "Age") +
    theme(
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 12, color = "black"),
      axis.text.x = element_text(angle = 0, hjust = 0.3),
      text = element_text(family = "serif")
    ) +
    coord_cartesian(ylim = c(10, 70))
}

# New data: open (data1) and endoscopic (data3) surgery.
p11 <- age_boxplot(data1, "开放手术")
p21 <- age_boxplot(data3, "腔镜手术")
# Old data: open (data2) and endoscopic (data4) surgery.
p31 <- age_boxplot(data2, "开放手术")
p41 <- age_boxplot(data4, "腔镜手术")

ggarrange(
  p31, p41, p11, p21,
  labels = c("Old_1", "Old_2", "New_1", "New_2"),
  ncol = 2, nrow = 2
)
直方图对比
# Density histograms of age on a 2 x 2 grid: old data on the top row, new
# data on the bottom row.
par(mfrow = c(2, 2))

# Draw one panel: density-scaled histogram, blue kernel density curve, red
# vertical line at the mean, and the rounded mean printed at x = label_x.
draw_age_hist <- function(ages, panel_title, label_x) {
  avg <- mean(ages)
  hist(
    ages,
    freq = FALSE, main = panel_title, breaks = 14,
    xlab = "年龄", ylab = "",
    xlim = c(0, 80), ylim = c(0, 0.05)
  )
  lines(density(ages), col = "blue")
  abline(v = avg, lwd = 1, col = "red")
  text(x = label_x, y = 0.045, round(avg, 2))
  invisible(NULL)
}

# Old_data
draw_age_hist(age1, "开放手术(Old_data)", 48)
draw_age_hist(age2, "腔镜手术(Old_data)", 48)
# New_data
draw_age_hist(age3, "开放手术(New_data)", 55)
draw_age_hist(age4, "腔镜手术(New_data)", 42)
2.4 Basic situations of numerical indicators
| | | Old_data | | | | New_data | | | |
|---|---|---|---|---|---|---|---|---|---|
| 指标 | 手术类型 | 年龄 | 肿瘤大小 | 淋巴结清扫数目 | 转移数目 | 年龄 | 肿瘤大小 | 淋巴结清扫数目 | 转移数目 |
| 均值 | 开放 | 41.25 | 6.82 | 4.14 | 0.73 | 48.30 | 8.45 | 4.43 | 0.80 |
| | 腔镜 | 41.25 | 6.39 | 3.72 | 0.53 | 35.53 | 7.03 | 3.94 | 0.72 |
| 方差 | 开放 | 80.66 | 15.98 | 6.15 | 2.07 | 128.25 | 53.39 | 11.06 | 2.41 |
| | 腔镜 | 81.56 | 12.38 | 7.58 | 1.22 | 83.69 | 30.04 | 9.12 | 2.75 |
| 标准差 | 开放 | 8.95 | 3.98 | 2.47 | 1.43 | 11.32 | 7.31 | 3.33 | 1.55 |
| | 腔镜 | 9.00 | 3.51 | 2.74 | 1.10 | 9.15 | 5.48 | 3.02 | 1.66 |
三 Comparison of important indicators
3.1 Number of lymph node dissection
# Frequency histograms of the number of dissected lymph nodes on a 2 x 2
# grid: old data on the top row, new data on the bottom row.
par(mfrow = c(2, 2))

# Draw one panel: histogram, red vertical line at the mean, and the rounded
# mean printed at x = label_x.
draw_node_hist <- function(node_counts, panel_title, label_x) {
  avg <- mean(node_counts)
  hist(
    node_counts,
    main = panel_title, breaks = 14,
    xlab = "淋巴结清扫数目", ylab = "",
    xlim = c(0, 14), ylim = c(0, 100)
  )
  abline(v = avg, lwd = 1, col = "red")
  text(x = label_x, y = 60, round(avg, 2))
  invisible(NULL)
}

# Old_data
draw_node_hist(target1, "开放手术(Old_data)", 6)
draw_node_hist(target2, "腔镜手术(Old_data)", 5.5)
# New_data
draw_node_hist(target3, "开放手术(New_data)", 6)
draw_node_hist(target4, "腔镜手术(New_data)", 5.5)
3.2 Tumor size
# Frequency histograms of tumour size (column 肿瘤大小) on a 2 x 2 grid: old
# data on the top row, new data on the bottom row.
par(mfrow = c(2, 2))

# Draw one panel: histogram, red vertical line at the mean, and the rounded
# mean printed at x = label_x.
#
# The line and the label are both computed from the SAME vector that is
# plotted. The earlier code drew the old-data mean lines from data1/data3
# (the new data), so the red line disagreed with the printed mean.
draw_tumor_hist <- function(sizes, panel_title, label_x) {
  avg <- mean(sizes)
  hist(
    sizes,
    main = panel_title, breaks = 25,
    xlab = "Tumor size", ylab = "",
    xlim = c(0, 25), ylim = c(0, 80)
  )
  abline(v = avg, lwd = 1, col = "red")
  text(x = label_x, y = 60, round(avg, 2))
  invisible(NULL)
}

# Old_data
draw_tumor_hist(data2$肿瘤大小, "开放手术(Old_data)", 11)
draw_tumor_hist(data4$肿瘤大小, "腔镜手术(Old_data)", 10)
# New_data
draw_tumor_hist(data1$肿瘤大小, "开放手术(New_data)", 11)
draw_tumor_hist(data3$肿瘤大小, "腔镜手术(New_data)", 10)