Loading packages

Reading data

# Load the raw screening extract.
# stringsAsFactors = FALSE keeps text columns as character, so the in-place
# recodes later in this script (assigning values such as "Fat" or 0 into a
# column) cannot fail with "invalid factor level" on R versions before 4.0,
# where read.csv() converted strings to factors by default.
dat <- read.csv(
  "~/Dropbox/Temp Files/Thao Quyen/AB476TH.csv",
  stringsAsFactors = FALSE
)

# Rename the raw extract columns to the short names used in the analysis.
# Each entry reads "old name" = "new name", which is the plyr::rename()
# argument form. The call is namespace-qualified because dplyr::rename()
# expects new = old and would be used instead if dplyr masked plyr's version.
ca <- plyr::rename(dat, c(
  "age_c" = "age",
  "assmfnl_c" = "assmfinal",
  "cancfu1yr_c" = "canceryr1",
  "cancfu2yr_c" = "canceryr2",
  "cancscrfu1yr_c" = "cancscryr1",
  "density_c" = "density",
  "prvmam_c" = "prvmam",
  "prvmos_c" = "prvmos",
  "prvtim_c" = "prvtim",
  "resfnl_c" = "res.final",
  "resinit_c" = "res.init",
  "scrmam_c" = "scrmam",
  "timsnc_c" = "timsnc",
  "menopht_c" = "menopht",
  "examyear" = "year",
  "samp1cens_srmam" = "sampl1cens",
  "FP_Recall" = "fp.recall",
  "FP_BxRecom" = "fp.bxrecom",
  "FP_Sifu" = "fp.sifu",
  "Mamm_1st_vs_Subs" = "mamm1.subs",
  "womanid" = "id",
  "racenci_c" = "race"
))

# Ten-year age bands, labelled "40-49" through "70-79".
# Rows whose age falls in none of the bands keep ageg = NA.
for (band_start in c(40, 50, 60, 70)) {
  band_end <- band_start + 9
  in_band <- ca$age >= band_start & ca$age <= band_end
  ca$ageg[in_band] <- paste0(band_start, "-", band_end)
}

# Short labels for the final assessment, stored as a factor in code order.
# "5: Highly sggstv of malignancy" occurs in assmfinal but previously had no
# mapping, so those exams silently became NA; it now has its own level.
# "8: Structural missing", "9: Unknown" and any other unlisted value are not
# in the lookup and therefore come out as NA.
assm_labels <- c(
  "0: Need addtl imagng eval" = "Need Imaging",
  "1: Negative" = "Negative",
  "2: Benign finding" = "Benign",
  "3: Probably benign fndng" = "Probably benign",
  "4: Suspiciously abnormal" = "Susp abnormal",
  "5: Highly sggstv of malignancy" = "Highly suggestive"
)
ca$ass.final <- factor(
  unname(assm_labels[as.character(ca$assmfinal)]),
  levels = unname(assm_labels)
)

# Recode the three cancer follow-up flags from "0: No" / "1: Yes" to 0 / 1.
# Blank cells are set to NA so they are reported as Missing rather than as an
# unnamed category, in line with how the other recodes treat unknown values.
# Any other value is left unchanged.
for (flag_col in c("canceryr1", "canceryr2", "cancscryr1")) {
  flag <- ca[[flag_col]]
  flag[flag == "0: No"] <- 0
  flag[flag == "1: Yes"] <- 1
  flag[!is.na(flag) & trimws(flag) == ""] <- NA
  ca[[flag_col]] <- flag
}

# Collapse the four density descriptions to short labels, then store the
# column as a factor in that order. Values not listed here become NA when the
# factor is built.
density_labels <- c(
  "1: Almost entirely fat(<25% fibrglandular)" = "Fat",
  "2: Scttrd fibroglandular tiss(25%-50%)" = "Scattered",
  "3: Heterogeneously dense(50%-75%)" = "Hetero",
  "4: Extremely dense(>75%)" = "Dense"
)
for (raw_density in names(density_labels)) {
  ca$density[ca$density == raw_density] <- density_labels[[raw_density]]
}
ca$density <- factor(ca$density, levels = unname(density_labels))

# Previous mammogram: No/Yes -> 0/1, "9: Unknown" -> NA.
ca$prvmam <- replace(ca$prvmam, ca$prvmam == "0: No", 0)
ca$prvmam <- replace(ca$prvmam, ca$prvmam == "1: Yes", 1)
ca$prvmam <- replace(ca$prvmam, ca$prvmam == "9: Unknown", NA)

# Initial result: Negative/Positive -> 0/1, "9" -> NA.
ca$res.init <- replace(ca$res.init, ca$res.init == "0: Negative", 0)
ca$res.init <- replace(ca$res.init, ca$res.init == "1: Positive", 1)
ca$res.init <- replace(ca$res.init, ca$res.init == "9", NA)

# Final result: Negative/Positive -> 0/1, "8: Structural missing" and "9" -> NA.
ca$res.final <- replace(ca$res.final, ca$res.final == "0: Negative", 0)
ca$res.final <- replace(ca$res.final, ca$res.final == "1: Positive", 1)
ca$res.final <- replace(ca$res.final, ca$res.final == "8: Structural missing", NA)
ca$res.final <- replace(ca$res.final, ca$res.final == "9", NA)

# Numeric menopausal-status code taken from the menopht text category.
# The two "Missing" categories map to NA; unlisted values also stay NA.
meno_codes <- c(
  "01: Natural" = 1,
  "02: ooph" = 2,
  "03: Age 60+" = 3,
  "04: HRT" = 4,
  "05: PERI" = 5,
  "06: PRE" = 6,
  "95: LMP 365+" = 95,
  "96: Surgical" = 96,
  "97: Prd Stop Oth" = 97,
  "98: All Missing" = NA,
  "99: Some Missing" = NA
)
for (meno_label in names(meno_codes)) {
  ca$meno[ca$menopht == meno_label] <- meno_codes[[meno_label]]
}

# Ever had a mammogram: No/Yes -> 0/1; structural-missing and unknown -> NA.
ca$evermamm[ca$evermamm == "0: No"] <- 0
ca$evermamm[ca$evermamm == "1: Yes"] <- 1
ca$evermamm[ca$evermamm == "8: Structural missing"] <- NA
ca$evermamm[ca$evermamm == "9: Unknown"] <- NA
# Blank cells were previously left as "" and showed up as an unnamed category;
# treat them as missing like the other unknown codes.
ca$evermamm[!is.na(ca$evermamm) & trimws(ca$evermamm) == ""] <- NA

# Screening interval: shorten the category text, send code "9" to NA, and
# store the result as a factor ordered from first mammogram to longest gap.
interval_from <- c(
  "0:1st Mamm", "1:Annual (9-18m)", "2:Biennial (19-30m)", "3:>30months", "9"
)
interval_to <- c("First", "Annual", "Biennial", "30+months", NA)
for (k in seq_along(interval_from)) {
  ca$mammintvl[ca$mammintvl == interval_from[k]] <- interval_to[k]
}
ca$mammintvl <- factor(
  ca$mammintvl,
  levels = c("First", "Annual", "Biennial", "30+months")
)

# Short modality label derived from mammtype; any other value leaves type NA.
is_digital <- ca$mammtype %in% "2: Digital(2-D only)"
is_tomo <- ca$mammtype %in% "5: Tomosynthesis"
ca$type[is_digital] <- "Digital"
ca$type[is_tomo] <- "Tomosynthesis"

# Key data

# dur = age at this exam minus the woman's earliest recorded age.
# The earliest age is taken with sort(age)[1] instead of age[1], so the result
# no longer depends on each woman's rows arriving in chronological order, and
# a missing age in her first row does not blank out dur for all her exams.
# sort() drops NA, so an id with no recorded age gets dur = NA.
ca <- plyr::ddply(ca, "id", transform, dur = age - sort(age)[1])

# Columns needed for the false-positive analyses.
ca1 <- ca[, c(
  "id", "age", "ageg", "dur", "type", "density",
  "fp.recall", "fp.bxrecom", "fp.sifu"
)]

Table 1

# Age group, density, screening round and initial/final result,
# with one column per mammography type.
table1(
  ~ ageg + density + factor(screenround) + res.init + res.final | type,
  data = ca
)
Digital
(N=1223840)
Tomosynthesis
(N=106329)
Overall
(N=1330169)
ageg
40-49 153718 (12.6%) 18761 (17.6%) 172479 (13.0%)
50-59 54169 (4.4%) 6577 (6.2%) 60746 (4.6%)
60-69 669451 (54.7%) 55922 (52.6%) 725373 (54.5%)
70-79 346502 (28.3%) 25069 (23.6%) 371571 (27.9%)
density
Fat 157017 (12.8%) 12739 (12.0%) 169756 (12.8%)
Scattered 599967 (49.0%) 51941 (48.8%) 651908 (49.0%)
Hetero 409255 (33.4%) 35480 (33.4%) 444735 (33.4%)
Dense 57601 (4.7%) 6169 (5.8%) 63770 (4.8%)
factor(screenround)
1 110211 (9.0%) 8699 (8.2%) 118910 (8.9%)
2 48410 (4.0%) 5073 (4.8%) 53483 (4.0%)
3 24631 (2.0%) 3614 (3.4%) 28245 (2.1%)
4 12910 (1.1%) 2644 (2.5%) 15554 (1.2%)
5 6566 (0.5%) 2000 (1.9%) 8566 (0.6%)
6 3138 (0.3%) 1397 (1.3%) 4535 (0.3%)
7 1346 (0.1%) 924 (0.9%) 2270 (0.2%)
8 472 (0.0%) 594 (0.6%) 1066 (0.1%)
9 166 (0.0%) 283 (0.3%) 449 (0.0%)
10 37 (0.0%) 110 (0.1%) 147 (0.0%)
Missing 1015953 (83.0%) 80991 (76.2%) 1096944 (82.5%)
res.init
0 1096821 (89.6%) 97773 (92.0%) 1194594 (89.8%)
1 125196 (10.2%) 8544 (8.0%) 133740 (10.1%)
Missing 1823 (0.1%) 12 (0.0%) 1835 (0.1%)
res.final
0 1191140 (97.3%) 103983 (97.8%) 1295123 (97.4%)
1 21893 (1.8%) 1849 (1.7%) 23742 (1.8%)
Missing 10807 (0.9%) 497 (0.5%) 11304 (0.8%)

Table 4

# Age group and density by mammography type.
# NOTE(review): the printed result lists ageg and density as separate rows,
# the same as `ageg + density` would; if a cross-classification was intended
# this formula does not produce it. Confirm the intent.
table1(
  ~ ageg * density | type,
  data = ca
)
Digital
(N=1223840)
Tomosynthesis
(N=106329)
Overall
(N=1330169)
ageg
40-49 153718 (12.6%) 18761 (17.6%) 172479 (13.0%)
50-59 54169 (4.4%) 6577 (6.2%) 60746 (4.6%)
60-69 669451 (54.7%) 55922 (52.6%) 725373 (54.5%)
70-79 346502 (28.3%) 25069 (23.6%) 371571 (27.9%)
density
Fat 157017 (12.8%) 12739 (12.0%) 169756 (12.8%)
Scattered 599967 (49.0%) 51941 (48.8%) 651908 (49.0%)
Hetero 409255 (33.4%) 35480 (33.4%) 444735 (33.4%)
Dense 57601 (4.7%) 6169 (5.8%) 63770 (4.8%)

Descriptive stat of data

# Overall descriptive table of the recoded variables (no stratification).
# `sample` was listed twice in the original formula; the duplicate term is
# dropped. The printed table only ever showed it once.
# NOTE(review): the second `sample` may have been meant as `sampl1cens`.
# Confirm before adding it.
table1(
  ~ assmfinal + canceryr1 + canceryr2 + cancscryr1 + density +
    prvmam + prvmos + prvtim + res.init + res.final +
    scrmam + timsnc + meno + mammtype + year +
    evermamm + screenround + mammintvl + sample +
    fp.recall + fp.bxrecom + fp.sifu + mamm1.subs,
  data = ca
)
Overall
(N=1330169)
assmfinal
0: Need addtl imagng eval 8628 (0.6%)
1: Negative 813695 (61.2%)
2: Benign finding 449899 (33.8%)
3: Probably benign fndng 31529 (2.4%)
4: Suspiciously abnormal 21994 (1.7%)
5: Highly sggstv of malignancy 1748 (0.1%)
8: Structural missing 841 (0.1%)
9: Unknown 1835 (0.1%)
canceryr1
802242 (60.3%)
0 524697 (39.4%)
1 3230 (0.2%)
canceryr2
802242 (60.3%)
0 522148 (39.3%)
1 5779 (0.4%)
cancscryr1
802242 (60.3%)
0 524720 (39.4%)
1 3207 (0.2%)
density
Fat 169756 (12.8%)
Scattered 651908 (49.0%)
Hetero 444735 (33.4%)
Dense 63770 (4.8%)
prvmam
0 143006 (10.8%)
1 1175974 (88.4%)
Missing 11189 (0.8%)
prvmos
Mean (SD) 19.0 (15.6)
Median [Min, Max] 14.0 [9.00, 481]
Missing 194033 (14.6%)
prvtim
Mean (SD) 2.07 (1.43)
Median [Min, Max] 2.00 [0, 9.00]
Missing 18 (0.0%)
res.init
0 1194594 (89.8%)
1 133740 (10.1%)
Missing 1835 (0.1%)
res.final
0 1295123 (97.4%)
1 23742 (1.8%)
Missing 11304 (0.8%)
scrmam
1: Yes 1330169 (100%)
timsnc
Mean (SD) 3.26 (3.00)
Median [Min, Max] 2.00 [0, 9.00]
Missing 12436 (0.9%)
meno
1 542341 (40.8%)
2 51567 (3.9%)
3 530615 (39.9%)
4 1901 (0.1%)
5 4584 (0.3%)
6 171418 (12.9%)
95 82 (0.0%)
96 15376 (1.2%)
97 4993 (0.4%)
Missing 7292 (0.5%)
mammtype
2: Digital(2-D only) 1223840 (92.0%)
5: Tomosynthesis 106329 (8.0%)
year
Mean (SD) 2010 (2.77)
Median [Min, Max] 2010 [2010, 2020]
evermamm
12436 (0.9%)
0 158852 (11.9%)
1 1034174 (77.7%)
Missing 124707 (9.4%)
screenround
Mean (SD) 2.03 (1.44)
Median [Min, Max] 1.00 [1.00, 10.0]
Missing 1096944 (82.5%)
mammintvl
First 143006 (10.8%)
Annual 854896 (64.3%)
Biennial 178889 (13.4%)
30+months 102351 (7.7%)
Missing 51027 (3.8%)
sample
1:1 Only 233225 (17.5%)
2:2 only 1096944 (82.5%)
fp.recall
Mean (SD) 0.0945 (0.293)
Median [Min, Max] 0 [0, 1.00]
fp.bxrecom
Mean (SD) 0.0124 (0.110)
Median [Min, Max] 0 [0, 1.00]
fp.sifu
Mean (SD) 0.0234 (0.151)
Median [Min, Max] 0 [0, 1.00]
mamm1.subs
Mean (SD) 2.00 (0)
Median [Min, Max] 2.00 [2.00, 2.00]

Initial and final assessment

# Initial result tabulated within each final-result column.
table1(
  ~ res.init | res.final,
  data = ca
)
0
(N=1295123)
1
(N=23742)
Overall
(N=1330169)
res.init
0 1193850 (92.2%) 0 (0%) 1194594 (89.8%)
1 101273 (7.8%) 23742 (100%) 133740 (10.1%)
Missing 0 (0%) 0 (0%) 1835 (0.1%)

Interval of mammography

# Bar chart of exam counts per screening-interval category.
ggplot(ca, aes(x = mammintvl, fill = mammintvl)) +
  geom_bar() +
  labs(x = "Mammo interval", y = "Number of women") +
  theme(legend.position = "none")

The number of rounds

# Bar chart of screening round, using only rows where screenround is recorded.
ca_rounds <- drop_na(ca, screenround)
ggplot(ca_rounds, aes(x = factor(screenround), fill = factor(screenround))) +
  geom_bar() +
  labs(x = "Screen round", y = "Number of women") +
  theme(legend.position = "none")

Distribution of age, menopausal status, and final assessment

# Histogram of age at exam.
# bins = 30 is the value stat_bin() was already choosing implicitly; stating
# it keeps the same plot and stops the "Pick better value with `binwidth`"
# message from being emitted on every run.
ggplot(data = ca, aes(x = age)) +
  geom_histogram(bins = 30, col = "white", fill = "blue") +
  labs(x = "Age", y = "Number of women")

# Bar chart of menopausal-status codes.
# meno holds numeric category codes (1-6 and 95-97), so it is wrapped in
# factor() to get one evenly spaced, separately coloured bar per code, the
# same way the screen-round and FP plots in this file treat their codes.
ggplot(data = ca, aes(x = factor(meno), fill = factor(meno))) +
  geom_bar() +
  theme(legend.position = "none") +
  labs(x = "Meno", y = "Number of women")

# Bar chart of the recoded final assessment categories.
ggplot(ca, aes(x = ass.final, fill = ass.final)) +
  geom_bar() +
  labs(x = "ASSM Final", y = "Number of women") +
  theme(legend.position = "none")

FP results

# The three false-positive indicators as categories, by mammography type.
table1(
  ~ factor(fp.recall) + factor(fp.bxrecom) + factor(fp.sifu) | type,
  data = ca
)
Digital
(N=1223840)
Tomosynthesis
(N=106329)
Overall
(N=1330169)
factor(fp.recall)
0 1105904 (90.4%) 98498 (92.6%) 1204402 (90.5%)
1 117936 (9.6%) 7831 (7.4%) 125767 (9.5%)
factor(fp.bxrecom)
0 1208589 (98.8%) 105145 (98.9%) 1313734 (98.8%)
1 15251 (1.2%) 1184 (1.1%) 16435 (1.2%)
factor(fp.sifu)
0 1194370 (97.6%) 104660 (98.4%) 1299030 (97.7%)
1 29470 (2.4%) 1669 (1.6%) 31139 (2.3%)
# One bar chart per false-positive indicator, drawn side by side.
hide_legend <- theme(legend.position = "none")

p1 <- ggplot(ca, aes(x = factor(fp.recall), fill = factor(fp.recall))) +
  geom_bar() +
  hide_legend +
  labs(x = "FP Recall", y = "Number of women")

p2 <- ggplot(ca, aes(x = factor(fp.bxrecom), fill = factor(fp.bxrecom))) +
  geom_bar() +
  hide_legend +
  labs(x = "FP Recom", y = "Number of women")

p3 <- ggplot(ca, aes(x = factor(fp.sifu), fill = factor(fp.sifu))) +
  geom_bar() +
  hide_legend +
  labs(x = "FP Sifu", y = "Number of women")

grid.arrange(p1, p2, p3, ncol = 3)

Cumulative probability of false positive

# Kaplan-Meier fits of each false-positive indicator against dur, stratified
# by mammography type and drawn as cumulative-event curves with survminer.
# NOTE(review): ca1 appears to hold several rows per id (dur is computed
# within id); confirm that fitting on every row is the intended analysis.

# FP recall
km <- survfit(Surv(dur, fp.recall == 1) ~ type, data = ca1)
ggsurvplot(km, xlab = "Time (Years)", fun = "event") +
  labs(x = "Time (years)", y = "Cumulative risk of FP - Recall")

# FP biopsy recommendation
km <- survfit(Surv(dur, fp.bxrecom == 1) ~ type, data = ca1)
ggsurvplot(km, xlab = "Time (Years)", fun = "event") +
  labs(x = "Time (years)", y = "Cumulative risk of FP BxRecom")

# FP sifu
km <- survfit(Surv(dur, fp.sifu == 1) ~ type, data = ca1)
ggsurvplot(km, xlab = "Time (Years)", fun = "event") +
  labs(x = "Time (years)", y = "Cumulative risk of FP - Sifu")