Old data vs New_data

xxy

2021/12/25

一 Data

1.1 Load data

library(readxl)
library(ggplot2)
# Workbook locations: path1 holds the new data, path2 the old data.
path1 <- "C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/腔镜开放甲状腺癌.xlsx"
path2 <- "C:/Users/xuxiyun/Desktop/zherenyi/淋巴手术/腔镜开放甲状腺癌-郑组2.xlsx"
# New data: sheet 1 is open surgery, sheet 2 is endoscopic surgery.
data1 <- readxl::read_excel(path = path1, sheet = 1)
data3 <- readxl::read_excel(path = path1, sheet = 2)
# Old data: sheet 2 is open surgery, sheet 1 is endoscopic surgery.
data2 <- readxl::read_excel(path = path2, sheet = 2)
data4 <- readxl::read_excel(path = path2, sheet = 1)

1.2 Data comparison

两份数据开放手术的重复数

# Find the open-surgery patients of the old data (data2) that also appear in
# the new data (data1), matching on patient name. `dat` gets one row per row
# of data2 holding the matching data1 record (NA where nothing matches), and
# `count` is the number of (data2 row, data1 row) pairs with the same name.
# The row count follows data2 instead of a hard-coded 140, empty inputs are
# handled, and NA names never match (they used to abort the `if`).
# NOTE(review): names are not guaranteed to be unique; 病案号 may be a safer
# matching key -- confirm with the data owner.
old_names <- data2$病人姓名
new_names <- data1$病人姓名

# Every pair of equal names counts once, exactly like the former nested loop.
count <- sum(outer(old_names, new_names, "=="), na.rm = TRUE)

# Row of data1 to copy for each row of data2. Searching the reversed vector
# selects the LAST occurrence of a repeated name, which is the record the
# former loop ended up keeping.
matched_row <- length(new_names) + 1L -
  match(old_names, rev(new_names), incomparables = NA)

dat <- data.frame(
  "病案号" = data1$病案号[matched_row],
  "病人姓名" = data1$病人姓名[matched_row],
  "性别" = data1$性别[matched_row],
  "年龄" = data1$年龄[matched_row],
  "肿瘤大小" = data1$肿瘤大小[matched_row],
  "淋巴结清扫数目" = data1$淋巴结清扫数目[matched_row],
  "转移数目" = data1$转移数目[matched_row],
  stringsAsFactors = FALSE
)
count
## [1] 140

重复数据示例

# New data: drop column 2 (the patient name) and show the first rows.
dat <- dat[-2]
head(dat)
##     病案号 性别 年龄 肿瘤大小 淋巴结清扫数目 转移数目
## 1 90771743   男   18        6              3        3
## 2 90783934   男   25        5              1        0
## 3 90806059   男   27       18              2        2
## 4 90767374   男   27        6              8        4
## 5 91120807   男   27        7              1        0
## 6 90698622   男   28       12              6        5
# Old data: drop column 2 (presumably the patient name -- confirm against the
# workbook) and show the first rows for comparison.
old_dat <- data2[-2]
head(old_dat)
## # A tibble: 6 x 6
##     病案号 `性别(女1男2)`  年龄 肿瘤大小 淋巴结清扫数目 转移数目
##      <dbl>          <dbl> <dbl>    <dbl>          <dbl>    <dbl>
## 1 90771743              2    18        6              3        3
## 2 90783934              2    25        5              1        0
## 3 90806059              2    27       18              2        2
## 4 90767374              2    27        6              8        4
## 5 91120807              2    27        7              1        0
## 6 90698622              2    28       12              6        5

两份数据腔镜手术的重复数

# Find the endoscopic-surgery patients of the old data (data4) that also
# appear in the new data (data3), matching on patient name. `dat2` gets one
# row per row of data4 holding the matching data3 record (NA where nothing
# matches), and `count2` is the number of (data4 row, data3 row) pairs with
# the same name. The row count follows data4 instead of a hard-coded 140,
# empty inputs are handled, and NA names never match (they used to abort the
# `if`).
# NOTE(review): names are not guaranteed to be unique; 病案号 may be a safer
# matching key -- confirm with the data owner.
old_names2 <- data4$病人姓名
new_names2 <- data3$病人姓名

# Every pair of equal names counts once, exactly like the former nested loop.
count2 <- sum(outer(old_names2, new_names2, "=="), na.rm = TRUE)

# Row of data3 to copy for each row of data4. Searching the reversed vector
# selects the LAST occurrence of a repeated name, which is the record the
# former loop ended up keeping.
matched_row2 <- length(new_names2) + 1L -
  match(old_names2, rev(new_names2), incomparables = NA)

dat2 <- data.frame(
  "病案号" = data3$病案号[matched_row2],
  "病人姓名" = data3$病人姓名[matched_row2],
  "性别" = data3$性别[matched_row2],
  "年龄" = data3$年龄[matched_row2],
  "肿瘤大小" = data3$肿瘤大小[matched_row2],
  "淋巴结清扫数目" = data3$淋巴结清扫数目[matched_row2],
  "转移数目" = data3$转移数目[matched_row2],
  stringsAsFactors = FALSE
)
count2
## [1] 140

重复数据示例

# New data: drop column 2 (the patient name) and show the first rows.
dat2 <- dat2[-2]
head(dat2)
##     病案号 性别 年龄 肿瘤大小 淋巴结清扫数目 转移数目
## 1 90688679   男   23        6              4        1
## 2 91089533   男   24        6             10        0
## 3 90619613   男   24        8              6        0
## 4 90626144   男   25       25              2        0
## 5 91199032   男   25       11              1        0
## 6 20555206   男   26        7              4        3
# Old data: drop column 2 (presumably the patient name -- confirm against the
# workbook) and show the first rows for comparison.
old_dat1 <- data4[-2]
head(old_dat1)
## # A tibble: 6 x 6
##     病案号 `性别(女1男2)`  年龄 肿瘤大小 淋巴结清扫数目 转移数目
##      <dbl>          <dbl> <dbl>    <dbl>          <dbl>    <dbl>
## 1 90688679              2    23        6              4        1
## 2 91089533              2    24        6             10        0
## 3 90619613              2    24        8              6        0
## 4 90626144              2    25       25              2        0
## 5 91199032              2    25       11              1        0
## 6 20555206              2    26        7              4        3

二 Descriptive statistical analysis

2.1 Sex comparison

性别指标解释

指标 1 2
性别 女 男
# NOTE(review): this silences every warning for the rest of the session. It is
# kept so the remaining chunks behave as before, but consider removing it.
options(warn = -1)

# Share of women and men in a vector of sex codes.
#   sex:    vector with one sex code per patient
#   female: code that marks women in `sex`
#   male:   code that marks men in `sex`
# Returns a two-row data frame (Woman, Man) with columns Sample and sex
# (labels), n (count) and value (count divided by length(sex)).
sex_share <- function(sex, female, male) {
  # Counts are looked up by code rather than by position in table(): for
  # character codes the order of table() depends on the locale's collation.
  counts <- as.vector(table(factor(sex, levels = c(female, male))))
  data.frame(
    Sample = c("Woman", "Man"),
    sex = c("Woman", "Man"),
    n = counts,
    value = counts / length(sex),
    stringsAsFactors = FALSE
  )
}

# Bar chart of the shares returned by sex_share(); each bar is labelled
# "count (share)". `title` is the plot title.
sex_bar_plot <- function(shares, title) {
  ggplot(shares, mapping = aes(Sample, value, fill = sex)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_text(
      aes(label = paste0(n, " (", round(value, 3), ")"), y = value + 0.01),
      position = position_dodge(0.9), vjust = 0, family = "serif"
    ) +
    ylim(0, 1) +
    labs(x = "", y = "") +
    theme(
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 12, color = "black"),
      axis.text.x = element_text(angle = 0, hjust = 0.3),
      text = element_text(family = "serif"),
      plot.title = element_text(hjust = 0.5)
    ) +
    ggtitle(title) +
    # "none" is the supported way to hide a legend; a logical is deprecated.
    guides(fill = "none")
}

# New data: sex is stored as the characters 女 / 男.
data5 <- sex_share(data1$`性别`, female = "女", male = "男")
p1 <- sex_bar_plot(data5, "开放手术")

data6 <- sex_share(data3$`性别`, female = "女", male = "男")
p2 <- sex_bar_plot(data6, "腔镜手术")

# Old data: sex is coded 1 = female, 2 = male (see the column name).
data7 <- sex_share(data2$`性别(女1男2)`, female = 1, male = 2)
p3 <- sex_bar_plot(data7, "开放手术")

data8 <- sex_share(data4$`性别(女1男2)`, female = 1, male = 2)
p4 <- sex_bar_plot(data8, "腔镜手术")

# Old data in the top row, new data in the bottom row.
library(ggpubr)
ggarrange(p3, p4, p1, p2,
          labels = c("Old_1", "Old_2", "New_1", "New_2"),
          ncol = 2, nrow = 2)

假定以年龄45岁为分界线

Old_data New_data
性别(年龄)  手术类型 开放 腔镜 开放 腔镜
男(<45) 38 32 54 43
男(>=45) 0 6 64 6
女(<45) 48 57 59 227
女(>=45) 54 45 121 51

2.2 Fisher test and chi-square test between sex and type of surgery

理论基础:

卡方检验可以做两个及两个以上分类变量的显著性检验
Fisher精确检验是一种显著性检验,当样本量较小时,可以用它来代替2×2表的卡方检验

旧数据:

Old_data
性别  手术类型 开放 腔镜
男 38 38
女 102 102
# Old data: counts filled column by column, i.e. column 1 = open surgery
# (38, 102) and column 2 = endoscopic surgery (38, 102).
# NOTE(review): despite the name `old_age`, the table above labels these
# counts as sex, not age.
y <- c(38, 102, 38, 102)
old_age <- matrix(data = y, nrow = 2)
chisq.test(x = old_age)
## 
##  Pearson's Chi-squared test
## 
## data:  old_age
## X-squared = 0, df = 1, p-value = 1
# Exact test on the same 2 x 2 table of old-data counts.
fisher.test(x = old_age)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  old_age
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.5703536 1.7532982
## sample estimates:
## odds ratio 
##          1

结果1:\(p_{\chi^2}=1\),\(p_{f}=1\),均大于默认显著性水平 \(\alpha=0.05\),表明旧数据中性别和手术类型之间没有显著性关系。

新数据:

New_data
性别  手术类型 开放 腔镜
男 116 49
女 180 278
# New data: counts filled column by column, i.e. column 1 = open surgery
# (116, 180) and column 2 = endoscopic surgery (49, 278).
# NOTE(review): despite the name `new_age`, the table above labels these
# counts as sex, not age. The age-stratified table lists 54 + 64 = 118 men
# for open surgery, not 116 -- verify which figure is correct.
y1 <- c(116, 180, 49, 278)
new_age <- matrix(data = y1, nrow = 2)
chisq.test(x = new_age)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  new_age
## X-squared = 45.514, df = 1, p-value = 1.516e-11
# Exact test on the same 2 x 2 table of new-data counts.
fisher.test(x = new_age)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  new_age
## p-value = 9.168e-12
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  2.454679 5.480355
## sample estimates:
## odds ratio 
##   3.648186

结果2:\(p_{\chi^2}=1.516\times10^{-11}\),\(p_{f}=9.168\times10^{-12}\),均小于默认显著性水平 \(\alpha=0.05\),表明新数据中性别和手术类型之间有显著性关系。

2.3 Age comparison

箱线图对比

# NOTE(review): this silences every warning for the rest of the session. It is
# kept so the remaining chunks behave as before, but consider removing it.
options(warn = -1)

# Common age axis for the four panels: at least 10-70, widened when any data
# set holds an age outside that range, so the panels stay comparable and no
# observation falls off the plot.
age_limits <- range(
  c(10, 70, data1$年龄, data2$年龄, data3$年龄, data4$年龄),
  na.rm = TRUE
)

# Box plot of the 年龄 column of `data`.
#   data:          data frame with a numeric 年龄 column
#   surgery_label: text shown under the box (the surgery type)
#   y_limits:      visible range of the age axis
age_boxplot <- function(data, surgery_label, y_limits = age_limits) {
  ggplot(data = data) +
    # A constant x draws a single box. Mapping x to the continuous age column
    # only worked through a suppressed "continuous x aesthetic" warning.
    geom_boxplot(aes(x = "", y = 年龄)) +
    labs(x = surgery_label, y = "Age") +
    theme(
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 12, color = "black"),
      axis.text.x = element_text(angle = 0, hjust = 0.3),
      text = element_text(family = "serif")
    ) +
    # Zoom instead of xlim()/ylim(): scale limits drop out-of-range rows
    # before the quartiles are computed, which distorts the box.
    coord_cartesian(ylim = y_limits)
}

# New data: open (data1) and endoscopic (data3) surgery.
p11 <- age_boxplot(data1, "开放手术")
p21 <- age_boxplot(data3, "腔镜手术")

# Old data: open (data2) and endoscopic (data4) surgery.
p31 <- age_boxplot(data2, "开放手术")
p41 <- age_boxplot(data4, "腔镜手术")

# Old data in the top row, new data in the bottom row.
ggarrange(p31, p41, p11, p21,
          labels = c("Old_1", "Old_2", "New_1", "New_2"),
          ncol = 2, nrow = 2)

直方图对比

# Age vectors: age1/age2 come from the old data, age3/age4 from the new data.
age1 <- data2$年龄
age2 <- data4$年龄
age3 <- data1$年龄
age4 <- data3$年龄

# Density-scaled histogram of `ages` with a kernel density curve (blue), a
# vertical line at the mean (red) and the rounded mean printed at x = label_x.
draw_age_hist <- function(ages, title, label_x) {
  hist(ages, freq = FALSE, main = title, breaks = 14, xlab = '年龄',
       ylab = '', xlim = range(0, 80), ylim = range(0, 0.05))
  lines(density(ages), col = 'blue')
  abline(v = mean(ages), lwd = 1, col = 'red')
  text(x = label_x, y = 0.045, round(mean(ages), 2))
  invisible(NULL)
}

# 2 x 2 grid: old data in the top row, new data in the bottom row.
par(mfrow = c(2, 2))
draw_age_hist(age1, '开放手术(Old_data)', 48)
draw_age_hist(age2, '腔镜手术(Old_data)', 48)
draw_age_hist(age3, '开放手术(New_data)', 55)
draw_age_hist(age4, '腔镜手术(New_data)', 42)

2.4 Basic situations of numerical indicators

Old_data New_data
指标 手术类型 年龄 肿瘤大小 淋巴结清扫数目 转移数目 年龄 肿瘤大小 淋巴结清扫数目 转移数目
均值 开放 41.25 6.82 4.14 0.73 48.30 8.45 4.43 0.80
腔镜 41.25 6.39 3.72 0.53 35.53 7.03 3.94 0.72
方差 开放 80.66 15.98 6.15 2.07 128.25 53.39 11.06 2.41
腔镜 81.56 12.38 7.58 1.22 83.69 30.04 9.12 2.75
标准差 开放 8.95 3.98 2.47 1.43 11.32 7.31 3.33 1.55
腔镜 9.00 3.51 2.74 1.10 9.15 5.48 3.02 1.66

三 Comparison of important indicators

3.1 Number of lymph node dissection

# Dissected lymph-node counts: target1/target2 come from the old data,
# target3/target4 from the new data.
target1 <- data2$淋巴结清扫数目
target2 <- data4$淋巴结清扫数目
target3 <- data1$淋巴结清扫数目
target4 <- data3$淋巴结清扫数目

# One entry per panel: the values, the title and where to print the mean.
panels <- list(
  list(values = target1, title = '开放手术(Old_data)', label_x = 6),
  list(values = target2, title = '腔镜手术(Old_data)', label_x = 5.5),
  list(values = target3, title = '开放手术(New_data)', label_x = 6),
  list(values = target4, title = '腔镜手术(New_data)', label_x = 5.5)
)

# 2 x 2 grid: old data in the top row, new data in the bottom row. Each panel
# is a count histogram with a red line at the mean and the rounded mean as
# text.
par(mfrow = c(2, 2))
for (panel in panels) {
  hist(panel$values, main = panel$title, breaks = 14,
       xlab = '淋巴结清扫数目', ylab = '',
       xlim = range(0, 14), ylim = range(0, 100))
  abline(v = mean(panel$values), lwd = 1, col = 'red')
  text(x = panel$label_x, y = 60, round(mean(panel$values), 2))
}

3.2 Tumor size

# Count histogram of tumor sizes with a red vertical line at the mean and the
# rounded mean printed at x = label_x.
#   sizes:   numeric vector of tumor sizes
#   title:   panel title
#   label_x: horizontal position of the printed mean
draw_tumor_hist <- function(sizes, title, label_x) {
  # Line and label share one value, so they cannot disagree. Missing sizes
  # are skipped, as hist() already does.
  size_mean <- mean(sizes, na.rm = TRUE)
  hist(sizes, main = title, breaks = 25, xlab = 'Tumor size', ylab = '',
       xlim = range(0, 25), ylim = range(0, 80))
  abline(v = size_mean, lwd = 1, col = 'red')
  text(x = label_x, y = 60, round(size_mean, 2))
  invisible(NULL)
}

par(mfrow = c(2, 2))
# Old data. The mean line now comes from the plotted old data set (data2 /
# data4); it used to be drawn from the new data (data1 / data3).
draw_tumor_hist(data2$肿瘤大小, '开放手术(Old_data)', 11)
draw_tumor_hist(data4$肿瘤大小, '腔镜手术(Old_data)', 10)
# New data.
draw_tumor_hist(data1$肿瘤大小, '开放手术(New_data)', 11)
draw_tumor_hist(data3$肿瘤大小, '腔镜手术(New_data)', 10)