Reading in the inital data

Per Shubham (email 1.29.25), The initial query was very broad with selection criteria of patients older than 18 with microscopically confirmed non-small cell carcinoma of the lung and bronchus (that was not identified on autopsy or death certificate) who received surgery.

library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("dplyr")
library("readxl")
library("compareGroups")
library("readxl")
library("lubridate")
library("sas7bdat") 
library("compareGroups")
library("survival")
library("survminer")
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## 
## The following object is masked from 'package:survival':
## 
##     myeloma
unclean <- read.sas7bdat("C:/Users/tumins01/Downloads/uncleaned.sas7bdat") #n=29322

Patient Selection

Selection step 1: surgical procedure codes, excluding anyone who did not have a lobectomy or pnumonectomy or wedge; n=28594 (excluded = 728)

Useful links: https://apps.naaccr.org/data-dictionary/data-dictionary/version=22/data-item-view/item-number=1290/ https://seer.cancer.gov/archive/manuals/2018/AppendixC/Surgery_Codes_Lung_2018.pdf

# Recode RX_Summ__Surg_Prim_Site__1998__ to `primsite`
seer <- unclean %>%
  mutate(
    surg2 = case_when(
      RX_Summ__Surg_Prim_Site__1998__ %in% c(20, 21, 22, 23) ~ "Wedge",  #removing 24, 25, "Laser excision & Bronchial sleeve resection ONLY
      RX_Summ__Surg_Prim_Site__1998__ %in% c(30, 33, 45, 46, 47, 48) ~ "Lobectomy",
      RX_Summ__Surg_Prim_Site__1998__ %in% c(55, 56, 65, 66, 70) ~ "Pneumonectomy",
      TRUE ~ NA_character_    # Ensure NA is a character type
    )
  )

table(seer$RX_Summ__Surg_Prim_Site__1998__)
## 
##    12    13    15    19    20    21    22    23    24    25    30    33    45 
##   230    72    72    51   142  5090  1799   176   106    34  1746 18492   404 
##    46    47    48    55    56    65    66    70    80    90 
##   137    15    13   112   445     4     5    14    31   132
table(seer$surg2)
## 
##     Lobectomy Pneumonectomy         Wedge 
##         20807           580          7207
seer <- seer %>% filter(surg2 == "Lobectomy" | surg2 == "Pneumonectomy" | surg2 == "Wedge" )

  
#creating a new variable for specified mediastinal lymph node resection yes (1) or no (0)
seer <- seer %>%
  mutate(
    mediastinal = ifelse(RX_Summ__Surg_Prim_Site__1998__ %in% c(33, 56, 70), "yes", "no"))


table(seer$mediastinal)
## 
##    no   yes 
##  9643 18951
table(seer$mediastinal, seer$surg2)
##      
##       Lobectomy Pneumonectomy Wedge
##   no       2315           121  7207
##   yes     18492           459     0
table(seer$Derived_EOD_2018_N__2018__)
## 
##    88    N0    N1    N2    N3    NX 
##   238 22647  2759  2508   171   271

Selection step 2: Regional lymph nodes examined, removing when unknown or 90 (cause >90 number unknown); n=27381 (excluded = 1213)

Useful links: https://seer.cancer.gov/data-software/documentation/seerstat/nov2017/TextData.FileDescription.pdf#REGIONAL_NODES_EXAMINED

table(seer$Regional_nodes_examined__1988__)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2723  814 1027 1185 1251 1410 1357 1365 1387 1312 1433 1267 1187 1004  946  886 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##  714  747  603  579  513  436  436  342  268  265  248  207  180  141  129  109 
##   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47 
##  116   91   73   74   56   44   44   47   46   39   32   27   27   24   20   16 
##   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63 
##   15   22    9   17    7   13    3    3    2    2    2    5    4    5    4    3 
##   64   65   66   68   69   70   71   72   73   75   76   83   86   90   95   96 
##    3    1    1    2    1    1    1    2    1    1    2    1    1    3  218   53 
##   97   98   99 
##  587  230  122
seer <- seer %>%
  filter(!Regional_nodes_examined__1988__ %in% c("90", "95", "96", "97", "98", "99", NA)) 

Selection step 3: Regional lymph nodes were positive, removing when unknown or 90 (cause >90 number unknown); n=26876 (excluded = 505)

table(seer$Regional_nodes_positive__1988__)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
## 19966  1947   909   577   346   231   162   111    77    47    35    39    29 
##    13    14    15    16    17    18    19    21    22    23    24    25    27 
##    20    14    17     6     6     4     6     4     4     1     1     3     1 
##    29    30    33    34    39    95    97    98    99 
##     1     2     1     1     1    81     6  2723     2
table(seer$Derived_EOD_2018_N__2018__)
## 
##    88    N0    N1    N2    N3    NX 
##   219 21828  2652  2329   150   203
seer <- seer %>%
  filter(!Regional_nodes_positive__1988__ %in% c("90", "95", "96", "97", "99", NA)) 

seer <- seer %>%
  mutate(
    Regional_nodes_positive__1988__ = case_when(
      Regional_nodes_positive__1988__ == "98" ~ NA_real_,
      TRUE ~ Regional_nodes_positive__1988__  # Keep other values unchanged
    )
  )

table(seer$Derived_EOD_2018_N__2018__)
## 
##    88    N0    N1    N2    N3    NX 
##   213 21825  2619  2287   145   203
seer <- seer %>%
  filter(!seer$Derived_EOD_2018_N__2018__ %in% c("88", "NX")) 

Selection step 4: Stage, only I, II, III, removing if unknown OR if grouped stage is distant; n= 25153 (excluded = 1723)

table(seer$Derived_EOD_2018_Stage_Group__20)
## 
##    0  1A1  1A2  1A3   1B   2A   2B    3   3A   3B   3C    4   4A   4B   99   OC 
##    4 3866 6047 3227 4449  913 3594   26 2966  686   32   71  687  190  117    1
seer <- seer %>% 
  mutate(stage = case_when(
    Derived_EOD_2018_Stage_Group__20 %in% c("1A1", "1A2", "1A3", "1B") ~ "Stage 1",
    Derived_EOD_2018_Stage_Group__20 %in% c("2A", "2B") ~ "Stage 2",
    Derived_EOD_2018_Stage_Group__20 %in% c("3", "3A", "3B", "3C") ~ "Stage 3",
    Derived_EOD_2018_Stage_Group__20 %in% c("4", "4A", "4B") ~ "Stage 4",
    TRUE ~ NA_character_  # Assign NA to unmatched cases
  ))

table(seer$stage)
## 
## Stage 1 Stage 2 Stage 3 Stage 4 
##   17589    4507    3710     948
table(seer$Combined_Summary_Stage__2004__) 
## 
##          Distant          In situ        Localized         Regional 
##             1599                4            15997             9245 
## Unknown/unstaged 
##               31
table(seer$Derived_EOD_2018_N__2018__)
## 
##    N0    N1    N2    N3 
## 21825  2619  2287   145
seer <- seer %>% filter(stage == "Stage 1" | stage == "Stage 2" | stage == "Stage 3") %>% filter(Combined_Summary_Stage__2004__ != "Distant" )

table(seer$EOD_Mets__2018__) #0, good, no mets
## 
##     0 
## 25153
table(seer$Derived_EOD_2018_N__2018__)
## 
##    N0    N1    N2 
## 20798  2402  1953

Selection step 5: Tumor Size, removing missing, unknown, or abnormally large tumor size; n=25028 (excluded = 125)

table(seer$Tumor_Size_Summary__2016__)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##    1   47   84  107  129  153  228  293  430  461  838  720 1068  767  776 1685 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##  726  932 1035  557 1176  699  845  536  503 1175  381  432  524  224  734  312 
##   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47 
##  448  204  188  701  185  211  234  130  449  136  191  122   96  415   98   83 
##   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63 
##  119   49  255   59  106   65   47  234   48   74   64   19  209   35   46   35 
##   64   65   66   67   68   69   70   71   72   73   74   75   76   77   78   79 
##   26  139   20   38   31   24  102   17   32   24   16   88   14   15   19    5 
##   80   81   82   83   84   85   86   87   88   89   90   91   92   93   94   95 
##   68   15   13    7    6   46    9    8   11    7   47    2    3    4    3   28 
##   96   97   98   99  100  101  102  103  104  105  106  107  108  109  110  111 
##    6    8    6    4   37    4    3    4    3   17    3    2    4    2   19    2 
##  114  115  116  117  118  119  120  122  124  125  128  130  131  133  135  136 
##    3    6    2    2    2    1   18    1    1    2    1    6    1    1    2    1 
##  137  139  140  145  150  152  153  160  163  165  170  175  180  190  195  200 
##    1    1    5    3    3    1    2    2    1    2    2    2    4    2    2    1 
##  203  205  210  225  260  280  301  310  430  500  750  990  999 
##    1    1    1    1    1    1    1    1    1    1    1    2  112
seer <- seer %>%
  filter(
    !Tumor_Size_Summary__2016__ %in% c(990, 998, 999, 301, 307, 310, 430, 500, 750, NA)
  )

table(seer$Derived_EOD_2018_T__2018__)
## 
##   T0  T1a  T1b  T1c T1mi  T2a  T2b   T3   T4   TX 
##    1 2386 6632 3812 1711 5672 1352 2639  824    5
seer <- seer %>%
  filter(
    !Derived_EOD_2018_T__2018__ %in% c("T0", "TX")
  )

Selection step 6: good survival data, n=24021 (excluded = 1007)

Useful links:https://seer.cancer.gov/survivaltime/SurvivalTimeCalculation.pdf; Survival Months = floor((date last contact – date dx) / days in a month)

table(seer$Survival_months)
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
## 712 770 808 862 751 757 778 528 509 809 816 795 759 723 802 719 746 698 787 683 
##  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35 
## 814 722 758 681 596 626 663 568 650 611 586 574 659 558 601 549
table(seer$Months_from_diagnosis_to_treatme)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 8119 7086 5587 2261  927  418  225  153   83   42   29   20   18    8    5    5 
##   16   17   19   20   21   23 
##    5    6    1    1    1    1
table(seer$COD_to_site_rec_KM)
## 
##                           Accidents and Adverse Effects 
##                                                      35 
##                              Acute Lymphocytic Leukemia 
##                                                       1 
##                                  Acute Myeloid Leukemia 
##                                                       5 
##                          Aleukemic, Subleukemic and NOS 
##                                                       1 
##                                                   Alive 
##                                                   22743 
##                          Anus, Anal Canal and Anorectum 
##                                                       1 
##                          Aortic Aneurysm and Dissection 
##                                                       1 
##                                         Atherosclerosis 
##                                                       1 
##                                        Bones and Joints 
##                                                       2 
##                          Brain and Other Nervous System 
##                                                       3 
##                                                  Breast 
##                                                      21 
##                                Cerebrovascular Diseases 
##                                                      43 
##                     Chronic Liver Disease and Cirrhosis 
##                                                       7 
##                            Chronic Lymphocytic Leukemia 
##                                                       4 
##   Chronic Obstructive Pulmonary Disease and Allied Cond 
##                                                      93 
##                                  Colon excluding Rectum 
##                                                      15 
##                                            Corpus Uteri 
##                                                       2 
##                                       Diabetes Mellitus 
##                                                      17 
##                                       Diseases of Heart 
##                                                     190 
##                                               Esophagus 
##                                                      11 
##                         Homicide and Legal Intervention 
##                                                       1 
##                      Hypertension without Heart Disease 
##                                                       3 
##            In situ, benign or unknown behavior neoplasm 
##                                                      13 
##                                  Intrahepatic Bile Duct 
##                                                       2 
##                                 Kidney and Renal Pelvis 
##                                                       7 
##                                                  Larynx 
##                                                       6 
##                                                   Liver 
##                                                       8 
##                                       Lung and Bronchus 
##                                                    1298 
##                                    Melanoma of the Skin 
##                                                       3 
##                          Miscellaneous Malignant Cancer 
##                                                      53 
##                                                 Myeloma 
##                                                       8 
##                                             Nasopharynx 
##                                                       1 
##             Nephritis, Nephrotic Syndrome and Nephrosis 
##                                                      15 
##                                    Non-Hodgkin Lymphoma 
##                                                      12 
##                                              Oropharynx 
##                                                       1 
##                                           Other Biliary 
##                                                       2 
##                                    Other Cause of Death 
##                                                     221 
##                                  Other Digestive Organs 
##                                                       1 
##     Other Diseases of Arteries, Arterioles, Capillaries 
##                                                       5 
##   Other Infectious and Parasitic Diseases including HIV 
##                                                       9 
##                        Other Myeloid/Monocytic Leukemia 
##                                                       1 
##                           Other Oral Cavity and Pharynx 
##                                                       2 
##                                                   Ovary 
##                                                       3 
##                                                Pancreas 
##                                                      19 
##                                 Pneumonia and Influenza 
##                                                      27 
##                                                Prostate 
##                                                      12 
##                        Rectum and Rectosigmoid Junction 
##                                                       1 
##                                              Septicemia 
##                                                      29 
##                             Soft Tissue including Heart 
##                                                       3 
## State DC not available or state DC available but no COD 
##                                                      28 
##                                                 Stomach 
##                                                       2 
##                             Stomach and Duodenal Ulcers 
##                                                       1 
##                       Suicide and Self-Inflicted Injury 
##                                                      11 
##              Symptoms, Signs and Ill-Defined Conditions 
##                                                      10 
##                                                  Tongue 
##                                                       5 
##                                                  Tonsil 
##                                                       2 
##                                         Urinary Bladder 
##                                                       4 
##                                             Uterus, NOS 
##                                                       3
seer <- seer %>% mutate(dead_any = ifelse(COD_to_site_rec_KM == "Alive", 0, 1))
table(seer$dead_any)
## 
##     0     1 
## 22743  2285
seer <- seer %>% mutate(dead_lung = ifelse(COD_to_site_rec_KM == "Lung and Bronchus", 1, 0))
table(seer$dead_lung)
## 
##     0     1 
## 23730  1298
#if survival months is time from dx to event/censoring, then need to subtract time to treatment, so survival is time of treatment to event/censor
seer <- seer %>% mutate(sur_time = Survival_months - Months_from_diagnosis_to_treatme )
table(seer$sur_time)
## 
## -15 -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5 
##   1   2   1   2   5   1   6   8  16  17  52 148 234 467 906 782 793 778 676 681 
##   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25 
## 702 637 704 777 798 813 720 719 763 783 746 731 664 715 764 663 698 692 569 625 
##  26  27  28  29  30  31  32  33  34  35 
## 656 564 656 543 568 600 535 501 338 181
test <- seer %>% select(Survival_months, Months_from_diagnosis_to_treatme, sur_time)

seer <- seer %>% filter(sur_time >= 0)

Data Cleaning

simple_stats <- function(variable) {
  # Calculate stats
  mean_val <- mean(variable, na.rm = TRUE)
  sd_val <- sd(variable, na.rm = TRUE)
  range_val <- range(variable, na.rm = TRUE)
  
  # Print results
  cat("Mean:", mean_val, "\n")
  cat("Standard Deviation:", sd_val, "\n")
  cat("Range:", range_val, "\n")
}
#lymph nodes yes subset
seer_nodes_examined <- seer %>% filter(Regional_nodes_examined__1988__ != 0) # n=15694

summary(seer$Regional_nodes_examined__1988__)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    5.00   10.00   11.26   16.00   86.00
hist(seer$Regional_nodes_examined__1988__)

simple_stats(seer$Regional_nodes_examined__1988__)
## Mean: 11.25714 
## Standard Deviation: 9.071014 
## Range: 0 86
table(seer$Regional_nodes_examined__1988__)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 1910  703  920 1063 1126 1282 1225 1240 1254 1159 1282 1137 1061  911  850  785 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##  654  666  533  537  457  395  393  307  245  233  226  184  156  121  113  100 
##   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47 
##  108   82   66   66   52   34   40   41   45   32   30   23   21   23   18   16 
##   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63 
##   15   15    9   14    6   11    3    3    2    2    2    4    4    4    3    3 
##   64   65   68   69   70   71   72   73   75   76   86 
##    3    1    2    1    1    1    2    1    1    2    1
summary(seer_nodes_examined$Regional_nodes_positive__1988__)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5093  0.0000 39.0000
hist(seer_nodes_examined$Regional_nodes_positive__1988__)

simple_stats(seer_nodes_examined$Regional_nodes_positive__1988__)
## Mean: 0.5092856 
## Standard Deviation: 1.694691 
## Range: 0 39
table(seer_nodes_examined$Regional_nodes_positive__1988__)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
## 18205  1707   769   487   297   193   126    90    66    34    28    31    26 
##    13    14    15    16    17    18    19    21    22    24    25    27    29 
##    17    10    13     4     4     4     5     2     3     1     2     1     1 
##    30    33    34    39 
##     2     1     1     1
table(seer$Derived_EOD_2018_T__2018__)
## 
##  T1a  T1b  T1c T1mi  T2a  T2b   T3   T4 
## 2307 6370 3642 1656 5440 1301 2523  802
table(seer$Derived_EOD_2018_N__2018__)
## 
##    N0    N1    N2 
## 19890  2315  1836
table(seer$Derived_EOD_2018_M__2018__)
## 
##    M0 
## 24041
table(seer$Regional_nodes_examined__1988__, seer$Derived_EOD_2018_N__2018__)
##     
##        N0   N1   N2
##   0  1840   29   41
##   1   643   29   31
##   2   832   38   50
##   3   946   62   55
##   4  1001   74   51
##   5  1135   88   59
##   6  1043   91   91
##   7  1042  123   75
##   8  1053  113   88
##   9   985  100   74
##   10 1049  127  106
##   11  910  130   97
##   12  857  112   92
##   13  737   94   80
##   14  674  103   73
##   15  629   82   74
##   16  502   96   56
##   17  515   85   66
##   18  408   73   52
##   19  404   69   64
##   20  355   53   49
##   21  288   68   39
##   22  295   52   46
##   23  214   48   45
##   24  172   48   25
##   25  166   36   31
##   26  163   33   30
##   27  137   31   16
##   28  116   18   22
##   29   84   21   16
##   30   82   22    9
##   31   73   13   14
##   32   76   17   15
##   33   47   24   11
##   34   45   10   11
##   35   52    8    6
##   36   35   12    5
##   37   27    4    3
##   38   28    6    6
##   39   24    7   10
##   40   30   10    5
##   41   18    4   10
##   42   26    2    2
##   43   17    3    3
##   44   14    3    4
##   45   11    8    4
##   46    9    3    6
##   47    7    4    5
##   48   11    3    1
##   49    9    3    3
##   50    8    1    0
##   51   10    3    1
##   52    6    0    0
##   53    7    3    1
##   54    2    1    0
##   55    2    0    1
##   56    1    1    0
##   57    2    0    0
##   58    0    2    0
##   59    3    0    1
##   60    2    1    1
##   61    0    2    2
##   62    1    2    0
##   63    1    2    0
##   64    2    1    0
##   65    1    0    0
##   68    0    2    0
##   69    0    0    1
##   70    1    0    0
##   71    0    1    0
##   72    2    0    0
##   73    1    0    0
##   75    1    0    0
##   76    1    0    1
##   86    0    1    0
seer <- seer %>% mutate(node_any = ifelse(Regional_nodes_examined__1988__ == 0, "no", "yes"))
table(seer$Regional_nodes_examined__1988__, seer$node_any)
##     
##        no  yes
##   0  1910    0
##   1     0  703
##   2     0  920
##   3     0 1063
##   4     0 1126
##   5     0 1282
##   6     0 1225
##   7     0 1240
##   8     0 1254
##   9     0 1159
##   10    0 1282
##   11    0 1137
##   12    0 1061
##   13    0  911
##   14    0  850
##   15    0  785
##   16    0  654
##   17    0  666
##   18    0  533
##   19    0  537
##   20    0  457
##   21    0  395
##   22    0  393
##   23    0  307
##   24    0  245
##   25    0  233
##   26    0  226
##   27    0  184
##   28    0  156
##   29    0  121
##   30    0  113
##   31    0  100
##   32    0  108
##   33    0   82
##   34    0   66
##   35    0   66
##   36    0   52
##   37    0   34
##   38    0   40
##   39    0   41
##   40    0   45
##   41    0   32
##   42    0   30
##   43    0   23
##   44    0   21
##   45    0   23
##   46    0   18
##   47    0   16
##   48    0   15
##   49    0   15
##   50    0    9
##   51    0   14
##   52    0    6
##   53    0   11
##   54    0    3
##   55    0    3
##   56    0    2
##   57    0    2
##   58    0    2
##   59    0    4
##   60    0    4
##   61    0    4
##   62    0    3
##   63    0    3
##   64    0    3
##   65    0    1
##   68    0    2
##   69    0    1
##   70    0    1
##   71    0    1
##   72    0    2
##   73    0    1
##   75    0    1
##   76    0    2
##   86    0    1
seer <- seer %>%
  mutate(node_group = ifelse(Regional_nodes_examined__1988__ == 0, "0 nodes",
                             ifelse(Regional_nodes_examined__1988__ >= 1 & Regional_nodes_examined__1988__ <= 4, "1-4 nodes",
                                    ifelse(Regional_nodes_examined__1988__ > 4 & Regional_nodes_examined__1988__ <= 10, "5-10 nodes", 
                                           "10+ nodes")))) 

table(seer$Regional_nodes_examined__1988__, seer$node_group)
##     
##      0 nodes 1-4 nodes 10+ nodes 5-10 nodes
##   0     1910         0         0          0
##   1        0       703         0          0
##   2        0       920         0          0
##   3        0      1063         0          0
##   4        0      1126         0          0
##   5        0         0         0       1282
##   6        0         0         0       1225
##   7        0         0         0       1240
##   8        0         0         0       1254
##   9        0         0         0       1159
##   10       0         0         0       1282
##   11       0         0      1137          0
##   12       0         0      1061          0
##   13       0         0       911          0
##   14       0         0       850          0
##   15       0         0       785          0
##   16       0         0       654          0
##   17       0         0       666          0
##   18       0         0       533          0
##   19       0         0       537          0
##   20       0         0       457          0
##   21       0         0       395          0
##   22       0         0       393          0
##   23       0         0       307          0
##   24       0         0       245          0
##   25       0         0       233          0
##   26       0         0       226          0
##   27       0         0       184          0
##   28       0         0       156          0
##   29       0         0       121          0
##   30       0         0       113          0
##   31       0         0       100          0
##   32       0         0       108          0
##   33       0         0        82          0
##   34       0         0        66          0
##   35       0         0        66          0
##   36       0         0        52          0
##   37       0         0        34          0
##   38       0         0        40          0
##   39       0         0        41          0
##   40       0         0        45          0
##   41       0         0        32          0
##   42       0         0        30          0
##   43       0         0        23          0
##   44       0         0        21          0
##   45       0         0        23          0
##   46       0         0        18          0
##   47       0         0        16          0
##   48       0         0        15          0
##   49       0         0        15          0
##   50       0         0         9          0
##   51       0         0        14          0
##   52       0         0         6          0
##   53       0         0        11          0
##   54       0         0         3          0
##   55       0         0         3          0
##   56       0         0         2          0
##   57       0         0         2          0
##   58       0         0         2          0
##   59       0         0         4          0
##   60       0         0         4          0
##   61       0         0         4          0
##   62       0         0         3          0
##   63       0         0         3          0
##   64       0         0         3          0
##   65       0         0         1          0
##   68       0         0         2          0
##   69       0         0         1          0
##   70       0         0         1          0
##   71       0         0         1          0
##   72       0         0         2          0
##   73       0         0         1          0
##   75       0         0         1          0
##   76       0         0         2          0
##   86       0         0         1          0
#how is TN distributed by stage

seer <- seer %>% 
  mutate(TN_Stage = case_when(
    Derived_EOD_2018_T__2018__ %in% c("T1a", "T1b", "T1c", "T1mi") & Derived_EOD_2018_N__2018__ == "N0" ~ "T1N0", 
    Derived_EOD_2018_T__2018__ %in% c("T1a", "T1b", "T1c", "T1mi") & Derived_EOD_2018_N__2018__ == "N1" ~ "T1N1",
    Derived_EOD_2018_T__2018__ %in% c("T1a", "T1b", "T1c", "T1mi") & Derived_EOD_2018_N__2018__ == "N2" ~ "T1N2",
    Derived_EOD_2018_T__2018__ %in% c("T1a", "T1b", "T1c", "T1mi") & Derived_EOD_2018_N__2018__ == "N3" ~ "T1N3",
    
    Derived_EOD_2018_T__2018__ %in% c("T2a", "T2b") & Derived_EOD_2018_N__2018__ == "N0" ~ "T2N0",
    Derived_EOD_2018_T__2018__ %in% c("T2a", "T2b") & Derived_EOD_2018_N__2018__ == "N1" ~ "T2N1",
    Derived_EOD_2018_T__2018__ %in% c("T2a", "T2b") & Derived_EOD_2018_N__2018__ == "N2" ~ "T2N2",
    Derived_EOD_2018_T__2018__ %in% c("T2a", "T2b") & Derived_EOD_2018_N__2018__ == "N3" ~ "T2N3",
    
    Derived_EOD_2018_T__2018__ %in% c("T3") & Derived_EOD_2018_N__2018__ == "N0" ~ "T3N0",
    Derived_EOD_2018_T__2018__ %in% c("T3") & Derived_EOD_2018_N__2018__ == "N1" ~ "T3N1",
    Derived_EOD_2018_T__2018__ %in% c("T3") & Derived_EOD_2018_N__2018__ == "N2" ~ "T3N2",
    Derived_EOD_2018_T__2018__ %in% c("T3") & Derived_EOD_2018_N__2018__ == "N3" ~ "T3N3",
    
    Derived_EOD_2018_T__2018__ %in% c("T4") & Derived_EOD_2018_N__2018__ == "N0" ~ "T4N0",
    Derived_EOD_2018_T__2018__ %in% c("T4") & Derived_EOD_2018_N__2018__ == "N1" ~ "T4N1",
    Derived_EOD_2018_T__2018__ %in% c("T4") & Derived_EOD_2018_N__2018__ == "N2" ~ "T4N2",
    Derived_EOD_2018_T__2018__ %in% c("T4") & Derived_EOD_2018_N__2018__ == "N3" ~ "T4N3",
    
    TRUE ~ NA_character_  # Assign NA to unmatched cases
  ))

table(seer$TN_Stage)
## 
##  T1N0  T1N1  T1N2  T2N0  T2N1  T2N2  T3N0  T3N1  T3N2  T4N0  T4N1  T4N2 
## 12597   786   592  5119   891   731  1714   452   357   460   186   156
seer$TN_Stage <- as.factor(seer$TN_Stage)

table(seer$Derived_EOD_2018_T__2018__)
## 
##  T1a  T1b  T1c T1mi  T2a  T2b   T3   T4 
## 2307 6370 3642 1656 5440 1301 2523  802
seer <- seer %>% 
  mutate(T_Stage = case_when(
    Derived_EOD_2018_T__2018__ %in% c("T1a", "T1b", "T1c", "T1mi")  ~ "T1", 
    Derived_EOD_2018_T__2018__ %in% c("T2a", "T2b")  ~ "T2",
    Derived_EOD_2018_T__2018__ %in% c("T3")  ~ "T3",
    Derived_EOD_2018_T__2018__ %in% c("T4")  ~ "T4",
    TRUE ~ NA_character_  # Assign NA to unmatched cases
    ))

table(seer$T_Stage)
## 
##    T1    T2    T3    T4 
## 13975  6741  2523   802
table(seer$TN_Stage, seer$mediastinal)
##       
##          no  yes
##   T1N0 4847 7750
##   T1N1  169  617
##   T1N2  124  468
##   T2N0 1462 3657
##   T2N1  144  747
##   T2N2  123  608
##   T3N0  416 1298
##   T3N1   75  377
##   T3N2   67  290
##   T4N0  111  349
##   T4N1   31  155
##   T4N2   33  123
#age
table(seer$age)
## 
##   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33 
##    4    2    5    4    5    5    5    5    3    8   11    7    4   15    9   13 
##   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49 
##   11   20   18   15   22   23   28   35   35   35   43   60   54   86   85  118 
##   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64   65 
##  118  163  159  240  267  339  404  464  455  551  625  669  727  752  827  976 
##   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80   81 
##  942  997 1005 1064 1063 1106 1121 1044  977  965  899  846  652  613  478  430 
##   82   83   84   85   86   87   88   89   90 
##  379  293  201  155  102   67   46   34   33
seer$age2 <- as.numeric(seer$age)

#race
table(seer$Race_recode__W__B__AI__API_)
## 
## American Indian/Alaska Native     Asian or Pacific Islander 
##                            98                          2023 
##                         Black                       Unknown 
##                          1993                           130 
##                         White 
##                         19797
seer <- seer %>% mutate(race2 = ifelse(Race_recode__W__B__AI__API_ == "American Indian/Alaska Native" | Race_recode__W__B__AI__API_ == "Unknown", "Other/Unknown", Race_recode__W__B__AI__API_))
table(seer$race2)
## 
## Asian or Pacific Islander                     Black             Other/Unknown 
##                      2023                      1993                       228 
##                     White 
##                     19797
#sex
table(seer$Sex)
## 
## Female   Male 
##  13714  10327
#marital status
table(seer$Marital_status_at_diagnosis)
## 
##                       Divorced Married (including common law) 
##                           2887                          13531 
##                      Separated         Single (never married) 
##                            230                           3353 
##                        Unknown  Unmarried or Domestic Partner 
##                            915                            149 
##                        Widowed 
##                           2976
seer <- seer %>% mutate(married2 = ifelse(Marital_status_at_diagnosis == "Divorced" | Marital_status_at_diagnosis == "Separated" | Marital_status_at_diagnosis == "Single (never married)", "single/seperated",
                                          ifelse(Marital_status_at_diagnosis == "Married (including common law)" | Marital_status_at_diagnosis == "Unmarried or Domestic Partner", "married/partner",  Marital_status_at_diagnosis)))
table(seer$married2)
## 
##  married/partner single/seperated          Unknown          Widowed 
##            13680             6470              915             2976
#histology
table(seer$Histologic_Type_ICD_O_3)
## 
## 8010 8012 8013 8022 8023 8031 8032 8033 8070 8071 8072 8082 8083 8140 8144 8200 
##   45   90  265  103    1    4   25   28 2393 1200 1056    8   92 6305   24   31 
## 8230 8240 8246 8249 8250 8253 8254 8255 8256 8257 8260 8265 8333 8430 8480 8481 
##  809 1604   87  262 1244  562  169  637  158   20  667  257    4   58  391    9 
## 8551 8560 8562 
## 5030  401    2
seer <- seer %>%
  mutate(
    hist2 = case_when(
      Histologic_Type_ICD_O_3 %in% c(8070, 8071, 8072, 8073, 8074, 8075) ~ "Squamous cell", 
      Histologic_Type_ICD_O_3 %in% c(8140, 8144) ~ "Adenocarcinoma",                       
      Histologic_Type_ICD_O_3 %in% c(8240, 8244, 8245, 8246, 8249) ~ "Carcinoid",     
      Histologic_Type_ICD_O_3 %in% c(8250, 8251, 8252, 8253, 8254, 8255, 
                                     8256, 8257) ~ "Bronchioalveolar",                       
      Histologic_Type_ICD_O_3 %in% c(8012, 8013, 8014) ~ "Large cell",                 
      TRUE ~ "Other"                                                             
    )
  )

table(seer$hist2)
## 
##   Adenocarcinoma Bronchioalveolar        Carcinoid       Large cell 
##             6329             2790             1953              355 
##            Other    Squamous cell 
##             7965             4649
#Grade
table(seer$Grade_Pathological__2018__)
## 
##    1    2    3    4    9 
## 4346 9544 5369   99 4683
seer <- seer %>% mutate(
  grade2 = ifelse(Grade_Pathological__2018__ == 1, "G1: Well differentiated",
                  ifelse(Grade_Pathological__2018__ == 2, "G2: Moderately differentiated",
                         ifelse(Grade_Pathological__2018__ == 3, "G3: Poorly differentiated", "G4: Undifferentiated/Unknown")))
)

table(seer$grade2)
## 
##       G1: Well differentiated G2: Moderately differentiated 
##                          4346                          9544 
##     G3: Poorly differentiated  G4: Undifferentiated/Unknown 
##                          5369                          4782
#tumor size
summary(seer$Tumor_Size_Summary__2016__)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   15.00   21.00   25.97   32.00  280.00
hist(seer$Tumor_Size_Summary__2016__)

simple_stats(seer$Tumor_Size_Summary__2016__)
## Mean: 25.97396 
## Standard Deviation: 18.18596 
## Range: 1 280
#tumor site
table(seer$Primary_Site___labeled)
## 
##              C34.0-Main bronchus           C34.1-Upper lobe, lung 
##                              119                            13526 
##          C34.2-Middle lobe, lung           C34.3-Lower lobe, lung 
##                             1518                             8450 
## C34.8-Overlapping lesion of lung                  C34.9-Lung, NOS 
##                              205                              223
seer <- seer %>% mutate(site2 = ifelse(Primary_Site___labeled == "C34.1-Upper lobe, lung", "Upper lobe",
                                       ifelse(Primary_Site___labeled == "C34.2-Middle lobe, lung", "Middle lobe",
                                              ifelse(Primary_Site___labeled == "C34.3-Lower lobe, lung", "Lower lobe", "Other,NOS" ))))

table(seer$site2)
## 
##  Lower lobe Middle lobe   Other,NOS  Upper lobe 
##        8450        1518         547       13526
#systemic treatment
table(seer$RX_Summ__Systemic_Sur_Seq__2007_)
## 
## No systemic therapy and/or surgical procedures 
##                                          18735 
##                               Sequence unknown 
##                                              1 
## Surgery both before and after systemic therapy 
##                                             89 
##                 Systemic therapy after surgery 
##                                           4598 
##                Systemic therapy before surgery 
##                                            440 
## Systemic therapy both before and after surgery 
##                                            178
test <- seer %>% filter(is.na(RX_Summ__Systemic_Sur_Seq__2007_))

seer <- seer %>% mutate(treat2 = ifelse(RX_Summ__Systemic_Sur_Seq__2007_ == "No systemic therapy and/or surgical procedures", "no", "yes"))

table(seer$treat2)
## 
##    no   yes 
## 18735  5306
#year of dx
table(seer$Year_of_diagnosis)
## 
## 2018 2019 2020 
## 8528 8873 6640
seer$Year_of_diagnosis <- as.character(seer$Year_of_diagnosis)

Univariate Analysis

#Nodes
table1 <- compareGroups(surg2 ~ Regional_nodes_examined__1988__ + node_any + node_group + mediastinal, data = seer)
createTable(table1)
## 
## --------Summary descriptives table by 'surg2'---------
## 
## __________________________________________________________________________________ 
##                                   Lobectomy   Pneumonectomy    Wedge     p.overall 
##                                    N=17897        N=437        N=5707              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Regional_nodes_examined__1988__  12.9 (8.85)   18.4 (12.2)  5.54 (6.66)    0.000   
## node_any:                                                                  0.000   
##     no                           399 (2.23%)   10 (2.29%)   1501 (26.3%)           
##     yes                         17498 (97.8%)  427 (97.7%)  4206 (73.7%)           
## node_group:                                                                0.000   
##     0 nodes                      399 (2.23%)   10 (2.29%)   1501 (26.3%)           
##     1-4 nodes                   2056 (11.5%)   23 (5.26%)   1733 (30.4%)           
##     10+ nodes                   9596 (53.6%)   316 (72.3%)  965 (16.9%)            
##     5-10 nodes                  5846 (32.7%)   88 (20.1%)   1508 (26.4%)           
## mediastinal:                                                               0.000   
##     no                          1806 (10.1%)   89 (20.4%)   5707 (100%)            
##     yes                         16091 (89.9%)  348 (79.6%)   0 (0.00%)             
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
#Clinical and demographic factors
table2 <- compareGroups(surg2 ~ age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage +  treat2 + Year_of_diagnosis, data = seer)
createTable(table2)
## 
## --------Summary descriptives table by 'surg2'---------
## 
## ____________________________________________________________________________________ 
##                                     Lobectomy   Pneumonectomy    Wedge     p.overall 
##                                      N=17897        N=437        N=5707              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age2                               67.8 (9.35)   62.8 (11.7)  69.5 (8.91)   <0.001   
## Sex:                                                                        <0.001   
##     Female                        10128 (56.6%)  193 (44.2%)  3393 (59.5%)           
##     Male                          7769 (43.4%)   244 (55.8%)  2314 (40.5%)           
## race2:                                                                         .     
##     Asian or Pacific Islander     1606 (8.97%)   32 (7.32%)   385 (6.75%)            
##     Black                         1510 (8.44%)   41 (9.38%)   442 (7.74%)            
##     Other/Unknown                  182 (1.02%)    1 (0.23%)    45 (0.79%)            
##     White                         14599 (81.6%)  363 (83.1%)  4835 (84.7%)           
## married2:                                                                   <0.001   
##     married/partner               10270 (57.4%)  254 (58.1%)  3156 (55.3%)           
##     single/seperated              4842 (27.1%)   131 (30.0%)  1497 (26.2%)           
##     Unknown                        624 (3.49%)   20 (4.58%)   271 (4.75%)            
##     Widowed                       2161 (12.1%)   32 (7.32%)   783 (13.7%)            
## hist2:                                                                      <0.001   
##     Adenocarcinoma                4826 (27.0%)   70 (16.0%)   1433 (25.1%)           
##     Bronchioalveolar              1981 (11.1%)   25 (5.72%)   784 (13.7%)            
##     Carcinoid                     1381 (7.72%)   49 (11.2%)   523 (9.16%)            
##     Large cell                     274 (1.53%)    8 (1.83%)    73 (1.28%)            
##     Other                         5989 (33.5%)   99 (22.7%)   1877 (32.9%)           
##     Squamous cell                 3446 (19.3%)   186 (42.6%)  1017 (17.8%)           
## site2:                                                                      <0.001   
##     Lower lobe                    6254 (34.9%)   128 (29.3%)  2068 (36.2%)           
##     Middle lobe                   1285 (7.18%)   15 (3.43%)   218 (3.82%)            
##     Other,NOS                      325 (1.82%)   107 (24.5%)  115 (2.02%)            
##     Upper lobe                    10033 (56.1%)  187 (42.8%)  3306 (57.9%)           
## grade2:                                                                     <0.001   
##     G1: Well differentiated       3065 (17.1%)   52 (11.9%)   1229 (21.5%)           
##     G2: Moderately differentiated 7248 (40.5%)   115 (26.3%)  2181 (38.2%)           
##     G3: Poorly differentiated     4140 (23.1%)   146 (33.4%)  1083 (19.0%)           
##     G4: Undifferentiated/Unknown  3444 (19.2%)   124 (28.4%)  1214 (21.3%)           
## T_Stage:                                                                    <0.001   
##     T1                            9719 (54.3%)   77 (17.6%)   4179 (73.2%)           
##     T2                            5413 (30.2%)   160 (36.6%)  1168 (20.5%)           
##     T3                            2130 (11.9%)   94 (21.5%)   299 (5.24%)            
##     T4                             635 (3.55%)   106 (24.3%)   61 (1.07%)            
## treat2:                                                                     <0.001   
##     no                            13421 (75.0%)  200 (45.8%)  5114 (89.6%)           
##     yes                           4476 (25.0%)   237 (54.2%)  593 (10.4%)            
## Year_of_diagnosis:                                                           0.002   
##     2018                          6384 (35.7%)   189 (43.2%)  1955 (34.3%)           
##     2019                          6617 (37.0%)   145 (33.2%)  2111 (37.0%)           
##     2020                          4896 (27.4%)   103 (23.6%)  1641 (28.8%)           
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
seer$any2 <- as.factor(seer$dead_any)
seer$lung2 <- as.factor(seer$dead_lung)

table3 <- compareGroups(surg2 ~ sur_time + any2 + lung2, data = seer)
createTable(table3)
## 
## --------Summary descriptives table by 'surg2'---------
## 
## ___________________________________________________________ 
##            Lobectomy   Pneumonectomy    Wedge     p.overall 
##             N=17897        N=437        N=5707              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## sur_time  15.9 (9.87)   15.4 (10.2)  15.9 (9.99)    0.558   
## any2:                                              <0.001   
##     0    16267 (90.9%)  332 (76.0%)  5189 (90.9%)           
##     1    1630 (9.11%)   105 (24.0%)  518 (9.08%)            
## lung2:                                             <0.001   
##     0    16938 (94.6%)  368 (84.2%)  5456 (95.6%)           
##     1     959 (5.36%)   69 (15.8%)   251 (4.40%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Survival Analysis

##All cause mortality

#any nodes examined
km_fit <- survfit(Surv(sur_time, dead_any) ~ node_any, data = seer)
ggsurvplot(km_fit, data = seer,
           pval = TRUE,              # Adds the p-value
           conf.int = TRUE,          # Adds confidence intervals
           risk.table = TRUE,        # Adds a risk table below the plot
           risk.table.col = "strata",# Colors risk table by groups
           legend.title = "Nodes",   # Label for the legend
           legend.labs = c("0 Nodes", "1+ Nodes"), # Custom legend labels
           xlab = "Time (months)",   # X-axis label
           ylab = "Survival Probability", # Y-axis label
           surv.median.line = "hv",  # Adds median survival lines
           ggtheme = theme_minimal()) # Applies a minimalistic theme
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

cox_model1 <- coxph(Surv(sur_time, dead_any) ~ node_any + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model1)
## Call:
## coxph(formula = Surv(sur_time, dead_any) ~ node_any + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 2253 
## 
##                                          coef exp(coef)  se(coef)      z
## node_anyyes                         -0.426588  0.652733  0.068667 -6.212
## age2                                 0.030539  1.031011  0.002669 11.442
## SexMale                              0.426833  1.532397  0.044939  9.498
## race2Black                           0.435445  1.545650  0.113971  3.821
## race2Other/Unknown                   0.543083  1.721306  0.227317  2.389
## race2White                           0.339692  1.404514  0.091537  3.711
## married2single/seperated             0.177247  1.193926  0.051576  3.437
## married2Unknown                      0.187867  1.206673  0.101559  1.850
## married2Widowed                      0.277651  1.320025  0.064045  4.335
## hist2Bronchioalveolar               -0.187584  0.828959  0.085805 -2.186
## hist2Carcinoid                      -0.914041  0.400901  0.154410 -5.920
## hist2Large cell                      0.638590  1.893809  0.130731  4.885
## hist2Other                          -0.082682  0.920644  0.056705 -1.458
## hist2Squamous cell                   0.221851  1.248385  0.056693  3.913
## site2Middle lobe                     0.078584  1.081754  0.096012  0.818
## site2Other,NOS                       0.094234  1.098816  0.124867  0.755
## site2Upper lobe                     -0.111702  0.894311  0.045347 -2.463
## grade2G2: Moderately differentiated  0.369531  1.447056  0.082537  4.477
## grade2G3: Poorly differentiated      0.672683  1.959487  0.085619  7.857
## grade2G4: Undifferentiated/Unknown   0.377012  1.457922  0.089711  4.203
## T_StageT2                            0.443147  1.557602  0.050953  8.697
## T_StageT3                            0.791138  2.205906  0.065946 11.997
## T_StageT4                            1.205427  3.338184  0.085593 14.083
## treat2yes                           -0.025059  0.975252  0.053101 -0.472
## Year_of_diagnosis2019               -0.061986  0.939896  0.049041 -1.264
## Year_of_diagnosis2020               -0.131279  0.876973  0.083370 -1.575
##                                     Pr(>|z|)    
## node_anyyes                         5.22e-10 ***
## age2                                 < 2e-16 ***
## SexMale                              < 2e-16 ***
## race2Black                          0.000133 ***
## race2Other/Unknown                  0.016890 *  
## race2White                          0.000206 ***
## married2single/seperated            0.000589 ***
## married2Unknown                     0.064338 .  
## married2Widowed                     1.46e-05 ***
## hist2Bronchioalveolar               0.028803 *  
## hist2Carcinoid                      3.23e-09 ***
## hist2Large cell                     1.04e-06 ***
## hist2Other                          0.144814    
## hist2Squamous cell                  9.11e-05 ***
## site2Middle lobe                    0.413085    
## site2Other,NOS                      0.450446    
## site2Upper lobe                     0.013766 *  
## grade2G2: Moderately differentiated 7.57e-06 ***
## grade2G3: Poorly differentiated     3.94e-15 ***
## grade2G4: Undifferentiated/Unknown  2.64e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           0.636987    
## Year_of_diagnosis2019               0.206248    
## Year_of_diagnosis2020               0.115337    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## node_anyyes                            0.6527     1.5320    0.5705    0.7468
## age2                                   1.0310     0.9699    1.0256    1.0364
## SexMale                                1.5324     0.6526    1.4032    1.6735
## race2Black                             1.5457     0.6470    1.2362    1.9325
## race2Other/Unknown                     1.7213     0.5810    1.1025    2.6875
## race2White                             1.4045     0.7120    1.1738    1.6805
## married2single/seperated               1.1939     0.8376    1.0791    1.3209
## married2Unknown                        1.2067     0.8287    0.9889    1.4724
## married2Widowed                        1.3200     0.7576    1.1643    1.4966
## hist2Bronchioalveolar                  0.8290     1.2063    0.7006    0.9808
## hist2Carcinoid                         0.4009     2.4944    0.2962    0.5426
## hist2Large cell                        1.8938     0.5280    1.4657    2.4469
## hist2Other                             0.9206     1.0862    0.8238    1.0289
## hist2Squamous cell                     1.2484     0.8010    1.1171    1.3951
## site2Middle lobe                       1.0818     0.9244    0.8962    1.3057
## site2Other,NOS                         1.0988     0.9101    0.8603    1.4035
## site2Upper lobe                        0.8943     1.1182    0.8183    0.9774
## grade2G2: Moderately differentiated    1.4471     0.6911    1.2309    1.7011
## grade2G3: Poorly differentiated        1.9595     0.5103    1.6568    2.3175
## grade2G4: Undifferentiated/Unknown     1.4579     0.6859    1.2228    1.7382
## T_StageT2                              1.5576     0.6420    1.4096    1.7212
## T_StageT3                              2.2059     0.4533    1.9384    2.5103
## T_StageT4                              3.3382     0.2996    2.8226    3.9479
## treat2yes                              0.9753     1.0254    0.8789    1.0822
## Year_of_diagnosis2019                  0.9399     1.0639    0.8538    1.0347
## Year_of_diagnosis2020                  0.8770     1.1403    0.7448    1.0326
## 
## Concordance= 0.696  (se = 0.006 )
## Likelihood ratio test= 1125  on 26 df,   p=<2e-16
## Wald test            = 1086  on 26 df,   p=<2e-16
## Score (logrank) test = 1181  on 26 df,   p=<2e-16
#any mediastinal examined
km_fit <- survfit(Surv(sur_time, dead_any) ~ mediastinal, data = seer)
ggsurvplot(km_fit, data = seer,
           pval = TRUE,              # Adds the p-value
           conf.int = TRUE,          # Adds confidence intervals
           risk.table = TRUE,        # Adds a risk table below the plot
           risk.table.col = "strata",# Colors risk table by groups
           legend.title = "Nodes",   # Label for the legend
           legend.labs = c("No-mediastinal", "Yes-mediastinal"), # Custom legend labels
           xlab = "Time (months)",   # X-axis label
           ylab = "Survival Probability", # Y-axis label
           surv.median.line = "hv",  # Adds median survival lines
           ggtheme = theme_minimal()) # Applies a minimalistic theme
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

cox_model2 <- coxph(Surv(sur_time, dead_any) ~ mediastinal + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model2)
## Call:
## coxph(formula = Surv(sur_time, dead_any) ~ mediastinal + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 2253 
## 
##                                          coef exp(coef)  se(coef)      z
## mediastinalyes                      -0.205284  0.814416  0.045492 -4.513
## age2                                 0.030511  1.030981  0.002674 11.408
## SexMale                              0.429490  1.536474  0.044944  9.556
## race2Black                           0.453221  1.573372  0.113935  3.978
## race2Other/Unknown                   0.523892  1.688586  0.227367  2.304
## race2White                           0.343511  1.409889  0.091543  3.752
## married2single/seperated             0.172604  1.188395  0.051604  3.345
## married2Unknown                      0.182602  1.200336  0.101583  1.798
## married2Widowed                      0.280307  1.323536  0.064074  4.375
## hist2Bronchioalveolar               -0.198731  0.819770  0.085788 -2.317
## hist2Carcinoid                      -0.909578  0.402694  0.154368 -5.892
## hist2Large cell                      0.633641  1.884460  0.130707  4.848
## hist2Other                          -0.091138  0.912892  0.056663 -1.608
## hist2Squamous cell                   0.218926  1.244739  0.056678  3.863
## site2Middle lobe                     0.097629  1.102553  0.096030  1.017
## site2Other,NOS                       0.098523  1.103540  0.124836  0.789
## site2Upper lobe                     -0.109701  0.896102  0.045351 -2.419
## grade2G2: Moderately differentiated  0.370893  1.449029  0.082546  4.493
## grade2G3: Poorly differentiated      0.673370  1.960834  0.085608  7.866
## grade2G4: Undifferentiated/Unknown   0.390899  1.478310  0.089643  4.361
## T_StageT2                            0.440578  1.553605  0.050960  8.646
## T_StageT3                            0.796047  2.216762  0.066049 12.052
## T_StageT4                            1.212390  3.361509  0.085712 14.145
## treat2yes                           -0.019862  0.980334  0.053206 -0.373
## Year_of_diagnosis2019               -0.069006  0.933321  0.049010 -1.408
## Year_of_diagnosis2020               -0.139351  0.869923  0.083332 -1.672
##                                     Pr(>|z|)    
## mediastinalyes                      6.40e-06 ***
## age2                                 < 2e-16 ***
## SexMale                              < 2e-16 ***
## race2Black                          6.95e-05 ***
## race2Other/Unknown                  0.021213 *  
## race2White                          0.000175 ***
## married2single/seperated            0.000824 ***
## married2Unknown                     0.072246 .  
## married2Widowed                     1.22e-05 ***
## hist2Bronchioalveolar               0.020529 *  
## hist2Carcinoid                      3.81e-09 ***
## hist2Large cell                     1.25e-06 ***
## hist2Other                          0.107742    
## hist2Squamous cell                  0.000112 ***
## site2Middle lobe                    0.309324    
## site2Other,NOS                      0.429981    
## site2Upper lobe                     0.015566 *  
## grade2G2: Moderately differentiated 7.02e-06 ***
## grade2G3: Poorly differentiated     3.67e-15 ***
## grade2G4: Undifferentiated/Unknown  1.30e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           0.708922    
## Year_of_diagnosis2019               0.159132    
## Year_of_diagnosis2020               0.094480 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## mediastinalyes                         0.8144     1.2279    0.7449    0.8904
## age2                                   1.0310     0.9699    1.0256    1.0364
## SexMale                                1.5365     0.6508    1.4069    1.6780
## race2Black                             1.5734     0.6356    1.2585    1.9670
## race2Other/Unknown                     1.6886     0.5922    1.0814    2.6367
## race2White                             1.4099     0.7093    1.1783    1.6870
## married2single/seperated               1.1884     0.8415    1.0741    1.3149
## married2Unknown                        1.2003     0.8331    0.9836    1.4648
## married2Widowed                        1.3235     0.7556    1.1673    1.5006
## hist2Bronchioalveolar                  0.8198     1.2199    0.6929    0.9699
## hist2Carcinoid                         0.4027     2.4833    0.2976    0.5450
## hist2Large cell                        1.8845     0.5307    1.4586    2.4347
## hist2Other                             0.9129     1.0954    0.8169    1.0201
## hist2Squamous cell                     1.2447     0.8034    1.1139    1.3910
## site2Middle lobe                       1.1026     0.9070    0.9134    1.3309
## site2Other,NOS                         1.1035     0.9062    0.8640    1.4094
## site2Upper lobe                        0.8961     1.1159    0.8199    0.9794
## grade2G2: Moderately differentiated    1.4490     0.6901    1.2326    1.7035
## grade2G3: Poorly differentiated        1.9608     0.5100    1.6579    2.3191
## grade2G4: Undifferentiated/Unknown     1.4783     0.6764    1.2401    1.7623
## T_StageT2                              1.5536     0.6437    1.4059    1.7168
## T_StageT3                              2.2168     0.4511    1.9476    2.5231
## T_StageT4                              3.3615     0.2975    2.8417    3.9764
## treat2yes                              0.9803     1.0201    0.8833    1.0881
## Year_of_diagnosis2019                  0.9333     1.0714    0.8478    1.0274
## Year_of_diagnosis2020                  0.8699     1.1495    0.7388    1.0243
## 
## Concordance= 0.694  (se = 0.006 )
## Likelihood ratio test= 1110  on 26 df,   p=<2e-16
## Wald test            = 1067  on 26 df,   p=<2e-16
## Score (logrank) test = 1163  on 26 df,   p=<2e-16
#nodes examined by group
km_fit <- survfit(Surv(sur_time, dead_any) ~ node_group, data = seer)
ggsurvplot(km_fit, data = seer,
           pval = TRUE,              # Adds the p-value
           conf.int = TRUE,          # Adds confidence intervals
           risk.table = TRUE,        # Adds a risk table below the plot
           risk.table.col = "strata",# Colors risk table by groups
           legend.title = "Nodes",   # Label for the legend
           legend.labs = c("0 Nodes", "1-4 Nodes", "10+ nodes", "5-10 nodes"), # Custom legend labels
           xlab = "Time (months)",   # X-axis label
           ylab = "Survival Probability", # Y-axis label
           surv.median.line = "hv",  # Adds median survival lines
           ggtheme = theme_minimal()) # Applies a minimalistic theme
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

cox_model3 <- coxph(Surv(sur_time, dead_any) ~ node_group + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model3)
## Call:
## coxph(formula = Surv(sur_time, dead_any) ~ node_group + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 2253 
## 
##                                          coef exp(coef)  se(coef)      z
## node_group1-4 nodes                 -0.238580  0.787746  0.082055 -2.908
## node_group10+ nodes                 -0.504462  0.603830  0.072910 -6.919
## node_group5-10 nodes                -0.425695  0.653315  0.075932 -5.606
## age2                                 0.030226  1.030688  0.002669 11.324
## SexMale                              0.428073  1.534298  0.044952  9.523
## race2Black                           0.430434  1.537925  0.113994  3.776
## race2Other/Unknown                   0.543853  1.722632  0.227361  2.392
## race2White                           0.346295  1.413819  0.091552  3.782
## married2single/seperated             0.173338  1.189268  0.051597  3.359
## married2Unknown                      0.187954  1.206777  0.101601  1.850
## married2Widowed                      0.278771  1.321504  0.064051  4.352
## hist2Bronchioalveolar               -0.185851  0.830397  0.085799 -2.166
## hist2Carcinoid                      -0.912950  0.401339  0.154415 -5.912
## hist2Large cell                      0.640993  1.898364  0.130754  4.902
## hist2Other                          -0.074553  0.928159  0.056737 -1.314
## hist2Squamous cell                   0.227855  1.255903  0.056715  4.018
## site2Middle lobe                     0.070912  1.073487  0.096044  0.738
## site2Other,NOS                       0.102521  1.107960  0.124957  0.820
## site2Upper lobe                     -0.108867  0.896849  0.045360 -2.400
## grade2G2: Moderately differentiated  0.377580  1.458751  0.082555  4.574
## grade2G3: Poorly differentiated      0.681880  1.977592  0.085627  7.963
## grade2G4: Undifferentiated/Unknown   0.385292  1.470044  0.089734  4.294
## T_StageT2                            0.448373  1.565762  0.050983  8.794
## T_StageT3                            0.807456  2.242196  0.066060 12.223
## T_StageT4                            1.234743  3.437495  0.086030 14.352
## treat2yes                           -0.005106  0.994907  0.053385 -0.096
## Year_of_diagnosis2019               -0.060252  0.941527  0.049037 -1.229
## Year_of_diagnosis2020               -0.127915  0.879928  0.083369 -1.534
##                                     Pr(>|z|)    
## node_group1-4 nodes                 0.003643 ** 
## node_group10+ nodes                 4.55e-12 ***
## node_group5-10 nodes                2.07e-08 ***
## age2                                 < 2e-16 ***
## SexMale                              < 2e-16 ***
## race2Black                          0.000159 ***
## race2Other/Unknown                  0.016756 *  
## race2White                          0.000155 ***
## married2single/seperated            0.000781 ***
## married2Unknown                     0.064326 .  
## married2Widowed                     1.35e-05 ***
## hist2Bronchioalveolar               0.030303 *  
## hist2Carcinoid                      3.37e-09 ***
## hist2Large cell                     9.47e-07 ***
## hist2Other                          0.188846    
## hist2Squamous cell                  5.88e-05 ***
## site2Middle lobe                    0.460313    
## site2Other,NOS                      0.411959    
## site2Upper lobe                     0.016393 *  
## grade2G2: Moderately differentiated 4.79e-06 ***
## grade2G3: Poorly differentiated     1.67e-15 ***
## grade2G4: Undifferentiated/Unknown  1.76e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           0.923806    
## Year_of_diagnosis2019               0.219183    
## Year_of_diagnosis2020               0.124948    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## node_group1-4 nodes                    0.7877     1.2694    0.6707    0.9252
## node_group10+ nodes                    0.6038     1.6561    0.5234    0.6966
## node_group5-10 nodes                   0.6533     1.5307    0.5630    0.7582
## age2                                   1.0307     0.9702    1.0253    1.0361
## SexMale                                1.5343     0.6518    1.4049    1.6756
## race2Black                             1.5379     0.6502    1.2300    1.9229
## race2Other/Unknown                     1.7226     0.5805    1.1032    2.6898
## race2White                             1.4138     0.7073    1.1816    1.6917
## married2single/seperated               1.1893     0.8409    1.0749    1.3158
## married2Unknown                        1.2068     0.8287    0.9889    1.4727
## married2Widowed                        1.3215     0.7567    1.1656    1.4983
## hist2Bronchioalveolar                  0.8304     1.2042    0.7019    0.9825
## hist2Carcinoid                         0.4013     2.4917    0.2965    0.5432
## hist2Large cell                        1.8984     0.5268    1.4692    2.4529
## hist2Other                             0.9282     1.0774    0.8305    1.0373
## hist2Squamous cell                     1.2559     0.7962    1.1238    1.4036
## site2Middle lobe                       1.0735     0.9315    0.8893    1.2958
## site2Other,NOS                         1.1080     0.9026    0.8673    1.4154
## site2Upper lobe                        0.8968     1.1150    0.8206    0.9802
## grade2G2: Moderately differentiated    1.4588     0.6855    1.2408    1.7150
## grade2G3: Poorly differentiated        1.9776     0.5057    1.6721    2.3390
## grade2G4: Undifferentiated/Unknown     1.4700     0.6803    1.2330    1.7527
## T_StageT2                              1.5658     0.6387    1.4169    1.7303
## T_StageT3                              2.2422     0.4460    1.9699    2.5521
## T_StageT4                              3.4375     0.2909    2.9041    4.0688
## treat2yes                              0.9949     1.0051    0.8961    1.1046
## Year_of_diagnosis2019                  0.9415     1.0621    0.8552    1.0365
## Year_of_diagnosis2020                  0.8799     1.1365    0.7473    1.0361
## 
## Concordance= 0.696  (se = 0.006 )
## Likelihood ratio test= 1143  on 28 df,   p=<2e-16
## Wald test            = 1103  on 28 df,   p=<2e-16
## Score (logrank) test = 1199  on 28 df,   p=<2e-16
#nodes continuous 
cox_model4 <- coxph(Surv(sur_time, dead_any) ~ Regional_nodes_examined__1988__ + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model4)
## Call:
## coxph(formula = Surv(sur_time, dead_any) ~ Regional_nodes_examined__1988__ + 
##     age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + 
##     T_Stage + treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 2253 
## 
##                                          coef exp(coef)  se(coef)      z
## Regional_nodes_examined__1988__     -0.007231  0.992795  0.002377 -3.042
## age2                                 0.031002  1.031487  0.002674 11.595
## SexMale                              0.431560  1.539657  0.044977  9.595
## race2Black                           0.451046  1.569954  0.113952  3.958
## race2Other/Unknown                   0.538307  1.713104  0.227316  2.368
## race2White                           0.354650  1.425681  0.091532  3.875
## married2single/seperated             0.175773  1.192167  0.051587  3.407
## married2Unknown                      0.200494  1.222007  0.101579  1.974
## married2Widowed                      0.283185  1.327351  0.064078  4.419
## hist2Bronchioalveolar               -0.198438  0.820010  0.085771 -2.314
## hist2Carcinoid                      -0.907468  0.403545  0.154432 -5.876
## hist2Large cell                      0.631082  1.879643  0.130727  4.827
## hist2Other                          -0.089077  0.914775  0.056690 -1.571
## hist2Squamous cell                   0.224956  1.252268  0.056725  3.966
## site2Middle lobe                     0.079282  1.082510  0.096017  0.826
## site2Other,NOS                       0.128306  1.136901  0.124636  1.029
## site2Upper lobe                     -0.109172  0.896576  0.045347 -2.407
## grade2G2: Moderately differentiated  0.365380  1.441061  0.082536  4.427
## grade2G3: Poorly differentiated      0.671683  1.957528  0.085619  7.845
## grade2G4: Undifferentiated/Unknown   0.391091  1.478593  0.089652  4.362
## T_StageT2                            0.430925  1.538680  0.050885  8.469
## T_StageT3                            0.786833  2.196430  0.065976 11.926
## T_StageT4                            1.213980  3.366858  0.086003 14.116
## treat2yes                           -0.021281  0.978944  0.053333 -0.399
## Year_of_diagnosis2019               -0.067319  0.934897  0.049018 -1.373
## Year_of_diagnosis2020               -0.136970  0.871996  0.083341 -1.643
##                                     Pr(>|z|)    
## Regional_nodes_examined__1988__     0.002349 ** 
## age2                                 < 2e-16 ***
## SexMale                              < 2e-16 ***
## race2Black                          7.55e-05 ***
## race2Other/Unknown                  0.017880 *  
## race2White                          0.000107 ***
## married2single/seperated            0.000656 ***
## married2Unknown                     0.048407 *  
## married2Widowed                     9.90e-06 ***
## hist2Bronchioalveolar               0.020691 *  
## hist2Carcinoid                      4.20e-09 ***
## hist2Large cell                     1.38e-06 ***
## hist2Other                          0.116114    
## hist2Squamous cell                  7.32e-05 ***
## site2Middle lobe                    0.408966    
## site2Other,NOS                      0.303272    
## site2Upper lobe                     0.016063 *  
## grade2G2: Moderately differentiated 9.56e-06 ***
## grade2G3: Poorly differentiated     4.33e-15 ***
## grade2G4: Undifferentiated/Unknown  1.29e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           0.689882    
## Year_of_diagnosis2019               0.169645    
## Year_of_diagnosis2020               0.100282    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## Regional_nodes_examined__1988__        0.9928     1.0073    0.9882    0.9974
## age2                                   1.0315     0.9695    1.0261    1.0369
## SexMale                                1.5397     0.6495    1.4097    1.6815
## race2Black                             1.5700     0.6370    1.2557    1.9628
## race2Other/Unknown                     1.7131     0.5837    1.0972    2.6747
## race2White                             1.4257     0.7014    1.1915    1.7058
## married2single/seperated               1.1922     0.8388    1.0775    1.3190
## married2Unknown                        1.2220     0.8183    1.0014    1.4912
## married2Widowed                        1.3274     0.7534    1.1707    1.5050
## hist2Bronchioalveolar                  0.8200     1.2195    0.6931    0.9701
## hist2Carcinoid                         0.4035     2.4780    0.2982    0.5462
## hist2Large cell                        1.8796     0.5320    1.4548    2.4286
## hist2Other                             0.9148     1.0932    0.8186    1.0223
## hist2Squamous cell                     1.2523     0.7986    1.1205    1.3995
## site2Middle lobe                       1.0825     0.9238    0.8968    1.3067
## site2Other,NOS                         1.1369     0.8796    0.8905    1.4515
## site2Upper lobe                        0.8966     1.1154    0.8203    0.9799
## grade2G2: Moderately differentiated    1.4411     0.6939    1.2258    1.6941
## grade2G3: Poorly differentiated        1.9575     0.5108    1.6551    2.3152
## grade2G4: Undifferentiated/Unknown     1.4786     0.6763    1.2403    1.7626
## T_StageT2                              1.5387     0.6499    1.3926    1.7000
## T_StageT3                              2.1964     0.4553    1.9300    2.4996
## T_StageT4                              3.3669     0.2970    2.8446    3.9850
## treat2yes                              0.9789     1.0215    0.8818    1.0868
## Year_of_diagnosis2019                  0.9349     1.0696    0.8493    1.0292
## Year_of_diagnosis2020                  0.8720     1.1468    0.7406    1.0267
## 
## Concordance= 0.693  (se = 0.006 )
## Likelihood ratio test= 1100  on 26 df,   p=<2e-16
## Wald test            = 1058  on 26 df,   p=<2e-16
## Score (logrank) test = 1154  on 26 df,   p=<2e-16
#nodes continuous (log transformed)
seer$node_log <- log(seer$Regional_nodes_examined__1988__ + 1)

cox_model5 <- coxph(Surv(sur_time, dead_any) ~ node_log + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model5)
## Call:
## coxph(formula = Surv(sur_time, dead_any) ~ node_log + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 2253 
## 
##                                          coef exp(coef)  se(coef)      z
## node_log                            -0.147511  0.862853  0.022259 -6.627
## age2                                 0.030500  1.030969  0.002670 11.424
## SexMale                              0.431423  1.539447  0.044955  9.597
## race2Black                           0.436406  1.547136  0.113988  3.829
## race2Other/Unknown                   0.544235  1.723289  0.227360  2.394
## race2White                           0.351194  1.420763  0.091517  3.837
## married2single/seperated             0.173033  1.188906  0.051596  3.354
## married2Unknown                      0.197253  1.218052  0.101590  1.942
## married2Widowed                      0.281356  1.324925  0.064061  4.392
## hist2Bronchioalveolar               -0.188954  0.827824  0.085782 -2.203
## hist2Carcinoid                      -0.910376  0.402373  0.154418 -5.896
## hist2Large cell                      0.639928  1.896345  0.130736  4.895
## hist2Other                          -0.077003  0.925887  0.056731 -1.357
## hist2Squamous cell                   0.229516  1.257990  0.056707  4.047
## site2Middle lobe                     0.071445  1.074059  0.096039  0.744
## site2Other,NOS                       0.109165  1.115347  0.124786  0.875
## site2Upper lobe                     -0.109079  0.896660  0.045349 -2.405
## grade2G2: Moderately differentiated  0.374200  1.453828  0.082552  4.533
## grade2G3: Poorly differentiated      0.680243  1.974358  0.085619  7.945
## grade2G4: Undifferentiated/Unknown   0.386370  1.471628  0.089679  4.308
## T_StageT2                            0.445122  1.560681  0.050964  8.734
## T_StageT3                            0.806391  2.239809  0.066049 12.209
## T_StageT4                            1.243074  3.466253  0.086026 14.450
## treat2yes                           -0.001934  0.998068  0.053366 -0.036
## Year_of_diagnosis2019               -0.062040  0.939845  0.049029 -1.265
## Year_of_diagnosis2020               -0.129828  0.878247  0.083356 -1.558
##                                     Pr(>|z|)    
## node_log                            3.42e-11 ***
## age2                                 < 2e-16 ***
## SexMale                              < 2e-16 ***
## race2Black                          0.000129 ***
## race2Other/Unknown                  0.016679 *  
## race2White                          0.000124 ***
## married2single/seperated            0.000798 ***
## married2Unknown                     0.052178 .  
## married2Widowed                     1.12e-05 ***
## hist2Bronchioalveolar               0.027614 *  
## hist2Carcinoid                      3.73e-09 ***
## hist2Large cell                     9.84e-07 ***
## hist2Other                          0.174669    
## hist2Squamous cell                  5.18e-05 ***
## site2Middle lobe                    0.456929    
## site2Other,NOS                      0.381670    
## site2Upper lobe                     0.016159 *  
## grade2G2: Moderately differentiated 5.82e-06 ***
## grade2G3: Poorly differentiated     1.94e-15 ***
## grade2G4: Undifferentiated/Unknown  1.64e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           0.971095    
## Year_of_diagnosis2019               0.205737    
## Year_of_diagnosis2020               0.119350    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## node_log                               0.8629     1.1589    0.8260    0.9013
## age2                                   1.0310     0.9700    1.0256    1.0364
## SexMale                                1.5394     0.6496    1.4096    1.6812
## race2Black                             1.5471     0.6464    1.2374    1.9344
## race2Other/Unknown                     1.7233     0.5803    1.1036    2.6908
## race2White                             1.4208     0.7038    1.1875    1.6999
## married2single/seperated               1.1889     0.8411    1.0746    1.3154
## married2Unknown                        1.2181     0.8210    0.9981    1.4864
## married2Widowed                        1.3249     0.7548    1.1686    1.5022
## hist2Bronchioalveolar                  0.8278     1.2080    0.6997    0.9794
## hist2Carcinoid                         0.4024     2.4853    0.2973    0.5446
## hist2Large cell                        1.8963     0.5273    1.4677    2.4502
## hist2Other                             0.9259     1.0800    0.8285    1.0348
## hist2Squamous cell                     1.2580     0.7949    1.1257    1.4059
## site2Middle lobe                       1.0741     0.9310    0.8898    1.2965
## site2Other,NOS                         1.1153     0.8966    0.8734    1.4244
## site2Upper lobe                        0.8967     1.1153    0.8204    0.9800
## grade2G2: Moderately differentiated    1.4538     0.6878    1.2366    1.7092
## grade2G3: Poorly differentiated        1.9744     0.5065    1.6693    2.3351
## grade2G4: Undifferentiated/Unknown     1.4716     0.6795    1.2344    1.7544
## T_StageT2                              1.5607     0.6407    1.4123    1.7246
## T_StageT3                              2.2398     0.4465    1.9678    2.5494
## T_StageT4                              3.4663     0.2885    2.9284    4.1029
## treat2yes                              0.9981     1.0019    0.8989    1.1081
## Year_of_diagnosis2019                  0.9398     1.0640    0.8537    1.0346
## Year_of_diagnosis2020                  0.8782     1.1386    0.7459    1.0341
## 
## Concordance= 0.695  (se = 0.006 )
## Likelihood ratio test= 1133  on 26 df,   p=<2e-16
## Wald test            = 1090  on 26 df,   p=<2e-16
## Score (logrank) test = 1186  on 26 df,   p=<2e-16
#nodes positive continuous 
cox_model6 <- coxph(Surv(sur_time, dead_any) ~ Regional_nodes_positive__1988__ + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model6)
## Call:
## coxph(formula = Surv(sur_time, dead_any) ~ Regional_nodes_positive__1988__ + 
##     age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + 
##     T_Stage + treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 22131, number of events= 2008 
##    (1910 observations deleted due to missingness)
## 
##                                          coef exp(coef)  se(coef)      z
## Regional_nodes_positive__1988__      0.099100  1.104177  0.007268 13.635
## age2                                 0.030461  1.030930  0.002826 10.779
## SexMale                              0.396793  1.487049  0.047510  8.352
## race2Black                           0.424926  1.529478  0.119462  3.557
## race2Other/Unknown                   0.442303  1.556287  0.242524  1.824
## race2White                           0.325436  1.384634  0.095148  3.420
## married2single/seperated             0.162260  1.176165  0.054479  2.978
## married2Unknown                      0.119860  1.127339  0.111445  1.076
## married2Widowed                      0.232566  1.261833  0.068602  3.390
## hist2Bronchioalveolar               -0.190115  0.826864  0.090424 -2.102
## hist2Carcinoid                      -0.997162  0.368925  0.173971 -5.732
## hist2Large cell                      0.663105  1.940809  0.137128  4.836
## hist2Other                          -0.125412  0.882134  0.059622 -2.103
## hist2Squamous cell                   0.208253  1.231524  0.060179  3.461
## site2Middle lobe                     0.088511  1.092546  0.101956  0.868
## site2Other,NOS                       0.020816  1.021035  0.137730  0.151
## site2Upper lobe                     -0.127443  0.880344  0.047861 -2.663
## grade2G2: Moderately differentiated  0.406161  1.501045  0.089891  4.518
## grade2G3: Poorly differentiated      0.699140  2.012021  0.093098  7.510
## grade2G4: Undifferentiated/Unknown   0.407548  1.503127  0.098143  4.153
## T_StageT2                            0.425374  1.530163  0.053931  7.887
## T_StageT3                            0.776757  2.174410  0.069155 11.232
## T_StageT4                            1.167576  3.214192  0.088958 13.125
## treat2yes                           -0.195026  0.822813  0.057316 -3.403
## Year_of_diagnosis2019               -0.096803  0.907735  0.051666 -1.874
## Year_of_diagnosis2020               -0.183143  0.832649  0.088953 -2.059
##                                     Pr(>|z|)    
## Regional_nodes_positive__1988__      < 2e-16 ***
## age2                                 < 2e-16 ***
## SexMale                              < 2e-16 ***
## race2Black                          0.000375 ***
## race2Other/Unknown                  0.068190 .  
## race2White                          0.000625 ***
## married2single/seperated            0.002898 ** 
## married2Unknown                     0.282148    
## married2Widowed                     0.000699 ***
## hist2Bronchioalveolar               0.035511 *  
## hist2Carcinoid                      9.94e-09 ***
## hist2Large cell                     1.33e-06 ***
## hist2Other                          0.035427 *  
## hist2Squamous cell                  0.000539 ***
## site2Middle lobe                    0.385326    
## site2Other,NOS                      0.879865    
## site2Upper lobe                     0.007750 ** 
## grade2G2: Moderately differentiated 6.23e-06 ***
## grade2G3: Poorly differentiated     5.93e-14 ***
## grade2G4: Undifferentiated/Unknown  3.29e-05 ***
## T_StageT2                           3.09e-15 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           0.000667 ***
## Year_of_diagnosis2019               0.060981 .  
## Year_of_diagnosis2020               0.039506 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## Regional_nodes_positive__1988__        1.1042     0.9057    1.0886    1.1200
## age2                                   1.0309     0.9700    1.0252    1.0367
## SexMale                                1.4870     0.6725    1.3548    1.6322
## race2Black                             1.5295     0.6538    1.2102    1.9330
## race2Other/Unknown                     1.5563     0.6426    0.9675    2.5034
## race2White                             1.3846     0.7222    1.1491    1.6685
## married2single/seperated               1.1762     0.8502    1.0571    1.3087
## married2Unknown                        1.1273     0.8870    0.9061    1.4025
## married2Widowed                        1.2618     0.7925    1.1031    1.4434
## hist2Bronchioalveolar                  0.8269     1.2094    0.6926    0.9872
## hist2Carcinoid                         0.3689     2.7106    0.2623    0.5188
## hist2Large cell                        1.9408     0.5152    1.4834    2.5393
## hist2Other                             0.8821     1.1336    0.7848    0.9915
## hist2Squamous cell                     1.2315     0.8120    1.0945    1.3857
## site2Middle lobe                       1.0925     0.9153    0.8947    1.3342
## site2Other,NOS                         1.0210     0.9794    0.7795    1.3374
## site2Upper lobe                        0.8803     1.1359    0.8015    0.9669
## grade2G2: Moderately differentiated    1.5010     0.6662    1.2586    1.7902
## grade2G3: Poorly differentiated        2.0120     0.4970    1.6764    2.4148
## grade2G4: Undifferentiated/Unknown     1.5031     0.6653    1.2401    1.8219
## T_StageT2                              1.5302     0.6535    1.3767    1.7008
## T_StageT3                              2.1744     0.4599    1.8988    2.4900
## T_StageT4                              3.2142     0.3111    2.6999    3.8264
## treat2yes                              0.8228     1.2153    0.7354    0.9206
## Year_of_diagnosis2019                  0.9077     1.1016    0.8203    1.0045
## Year_of_diagnosis2020                  0.8326     1.2010    0.6994    0.9912
## 
## Concordance= 0.708  (se = 0.006 )
## Likelihood ratio test= 1107  on 26 df,   p=<2e-16
## Wald test            = 1166  on 26 df,   p=<2e-16
## Score (logrank) test = 1293  on 26 df,   p=<2e-16
#nodes positive yes vs no
seer <- seer %>% mutate(pos_node2 = ifelse(Regional_nodes_positive__1988__ == 0, "no", "yes"))

km_fit <- survfit(Surv(sur_time, dead_any) ~ pos_node2, data = seer)
ggsurvplot(km_fit, data = seer,
           pval = TRUE,              # Adds the p-value
           conf.int = TRUE,          # Adds confidence intervals
           risk.table = TRUE,        # Adds a risk table below the plot
           risk.table.col = "strata",# Colors risk table by groups
           legend.title = "Nodes",   # Label for the legend
           legend.labs = c("0 Nodes Positive", "1+ Nodes Positive"), # Custom legend labels
           xlab = "Time (months)",   # X-axis label
           ylab = "Survival Probability", # Y-axis label
           surv.median.line = "hv",  # Adds median survival lines
           ggtheme = theme_minimal()) # Applies a minimalistic theme
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

cox_model7 <- coxph(Surv(sur_time, dead_any) ~ pos_node2 + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model7)
## Call:
## coxph(formula = Surv(sur_time, dead_any) ~ pos_node2 + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 22131, number of events= 2008 
##    (1910 observations deleted due to missingness)
## 
##                                          coef exp(coef)  se(coef)      z
## pos_node2yes                         0.888487  2.431449  0.057343 15.494
## age2                                 0.029482  1.029921  0.002812 10.484
## SexMale                              0.393348  1.481934  0.047529  8.276
## race2Black                           0.467655  1.596246  0.119251  3.922
## race2Other/Unknown                   0.411430  1.508974  0.242492  1.697
## race2White                           0.345060  1.412074  0.095114  3.628
## married2single/seperated             0.149934  1.161758  0.054390  2.757
## married2Unknown                      0.120517  1.128080  0.111513  1.081
## married2Widowed                      0.225369  1.252785  0.068725  3.279
## hist2Bronchioalveolar               -0.180614  0.834758  0.090331 -1.999
## hist2Carcinoid                      -1.082031  0.338907  0.174361 -6.206
## hist2Large cell                      0.674410  1.962875  0.137096  4.919
## hist2Other                          -0.123817  0.883542  0.059606 -2.077
## hist2Squamous cell                   0.206493  1.229359  0.060099  3.436
## site2Middle lobe                     0.076568  1.079576  0.101807  0.752
## site2Other,NOS                      -0.061178  0.940656  0.138323 -0.442
## site2Upper lobe                     -0.136338  0.872548  0.047748 -2.855
## grade2G2: Moderately differentiated  0.358556  1.431261  0.090103  3.979
## grade2G3: Poorly differentiated      0.629316  1.876326  0.093364  6.740
## grade2G4: Undifferentiated/Unknown   0.369451  1.446940  0.098314  3.758
## T_StageT2                            0.394653  1.483869  0.053943  7.316
## T_StageT3                            0.758394  2.134845  0.068365 11.093
## T_StageT4                            1.141308  3.130860  0.088198 12.940
## treat2yes                           -0.452146  0.636261  0.061260 -7.381
## Year_of_diagnosis2019               -0.085892  0.917693  0.051662 -1.663
## Year_of_diagnosis2020               -0.173982  0.840312  0.088901 -1.957
##                                     Pr(>|z|)    
## pos_node2yes                         < 2e-16 ***
## age2                                 < 2e-16 ***
## SexMale                              < 2e-16 ***
## race2Black                          8.80e-05 ***
## race2Other/Unknown                  0.089757 .  
## race2White                          0.000286 ***
## married2single/seperated            0.005840 ** 
## married2Unknown                     0.279811    
## married2Widowed                     0.001041 ** 
## hist2Bronchioalveolar               0.045558 *  
## hist2Carcinoid                      5.45e-10 ***
## hist2Large cell                     8.69e-07 ***
## hist2Other                          0.037778 *  
## hist2Squamous cell                  0.000591 ***
## site2Middle lobe                    0.451995    
## site2Other,NOS                      0.658284    
## site2Upper lobe                     0.004299 ** 
## grade2G2: Moderately differentiated 6.91e-05 ***
## grade2G3: Poorly differentiated     1.58e-11 ***
## grade2G4: Undifferentiated/Unknown  0.000171 ***
## T_StageT2                           2.55e-13 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           1.57e-13 ***
## Year_of_diagnosis2019               0.096398 .  
## Year_of_diagnosis2020               0.050345 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## pos_node2yes                           2.4314     0.4113    2.1730    2.7207
## age2                                   1.0299     0.9709    1.0243    1.0356
## SexMale                                1.4819     0.6748    1.3501    1.6266
## race2Black                             1.5962     0.6265    1.2636    2.0165
## race2Other/Unknown                     1.5090     0.6627    0.9382    2.4271
## race2White                             1.4121     0.7082    1.1719    1.7014
## married2single/seperated               1.1618     0.8608    1.0443    1.2924
## married2Unknown                        1.1281     0.8865    0.9066    1.4037
## married2Widowed                        1.2528     0.7982    1.0949    1.4334
## hist2Bronchioalveolar                  0.8348     1.1980    0.6993    0.9964
## hist2Carcinoid                         0.3389     2.9507    0.2408    0.4770
## hist2Large cell                        1.9629     0.5095    1.5004    2.5680
## hist2Other                             0.8835     1.1318    0.7861    0.9930
## hist2Squamous cell                     1.2294     0.8134    1.0928    1.3830
## site2Middle lobe                       1.0796     0.9263    0.8843    1.3180
## site2Other,NOS                         0.9407     1.0631    0.7173    1.2336
## site2Upper lobe                        0.8725     1.1461    0.7946    0.9581
## grade2G2: Moderately differentiated    1.4313     0.6987    1.1996    1.7077
## grade2G3: Poorly differentiated        1.8763     0.5330    1.5626    2.2531
## grade2G4: Undifferentiated/Unknown     1.4469     0.6911    1.1933    1.7544
## T_StageT2                              1.4839     0.6739    1.3350    1.6493
## T_StageT3                              2.1348     0.4684    1.8671    2.4410
## T_StageT4                              3.1309     0.3194    2.6338    3.7217
## treat2yes                              0.6363     1.5717    0.5643    0.7174
## Year_of_diagnosis2019                  0.9177     1.0897    0.8293    1.0155
## Year_of_diagnosis2020                  0.8403     1.1900    0.7059    1.0003
## 
## Concordance= 0.714  (se = 0.006 )
## Likelihood ratio test= 1211  on 26 df,   p=<2e-16
## Wald test            = 1241  on 26 df,   p=<2e-16
## Score (logrank) test = 1360  on 26 df,   p=<2e-16

##Lung cancer specific mortality

#any nodes examined
km_fit <- survfit(Surv(sur_time, dead_lung) ~ node_any, data = seer)
ggsurvplot(km_fit, data = seer,
           pval = TRUE,              # Adds the p-value
           conf.int = TRUE,          # Adds confidence intervals
           risk.table = TRUE,        # Adds a risk table below the plot
           risk.table.col = "strata",# Colors risk table by groups
           legend.title = "Nodes",   # Label for the legend
           legend.labs = c("0 Nodes", "1+ Nodes"), # Custom legend labels
           xlab = "Time (months)",   # X-axis label
           ylab = "Survival Probability", # Y-axis label
           surv.median.line = "hv",  # Adds median survival lines
           ggtheme = theme_minimal()) # Applies a minimalistic theme
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

cox_model8 <- coxph(Surv(sur_time, dead_lung) ~ node_any + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model8)
## Call:
## coxph(formula = Surv(sur_time, dead_lung) ~ node_any + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 1279 
## 
##                                          coef exp(coef)  se(coef)      z
## node_anyyes                         -0.409112  0.664240  0.095604 -4.279
## age2                                 0.029765  1.030213  0.003533  8.426
## SexMale                              0.372490  1.451343  0.059448  6.266
## race2Black                           0.244615  1.277130  0.148381  1.649
## race2Other/Unknown                   0.485099  1.624335  0.290712  1.669
## race2White                           0.234761  1.264606  0.115301  2.036
## married2single/seperated             0.174322  1.190439  0.067852  2.569
## married2Unknown                      0.146767  1.158084  0.136582  1.075
## married2Widowed                      0.179263  1.196335  0.087495  2.049
## hist2Bronchioalveolar               -0.215107  0.806455  0.117665 -1.828
## hist2Carcinoid                      -1.180632  0.307085  0.255264 -4.625
## hist2Large cell                      0.706108  2.026090  0.160695  4.394
## hist2Other                          -0.086895  0.916774  0.074496 -1.166
## hist2Squamous cell                   0.148363  1.159934  0.074587  1.989
## site2Middle lobe                    -0.005947  0.994070  0.132882 -0.045
## site2Other,NOS                      -0.123875  0.883490  0.168986 -0.733
## site2Upper lobe                     -0.151304  0.859586  0.059717 -2.534
## grade2G2: Moderately differentiated  0.621010  1.860806  0.126075  4.926
## grade2G3: Poorly differentiated      1.045154  2.843837  0.128291  8.147
## grade2G4: Undifferentiated/Unknown   0.581904  1.789443  0.135346  4.299
## T_StageT2                            0.638704  1.894025  0.069773  9.154
## T_StageT3                            1.008292  2.740916  0.086656 11.636
## T_StageT4                            1.558086  4.749723  0.104964 14.844
## treat2yes                            0.195345  1.215730  0.066844  2.922
## Year_of_diagnosis2019               -0.101844  0.903171  0.064983 -1.567
## Year_of_diagnosis2020               -0.299654  0.741075  0.118830 -2.522
##                                     Pr(>|z|)    
## node_anyyes                         1.88e-05 ***
## age2                                 < 2e-16 ***
## SexMale                             3.71e-10 ***
## race2Black                           0.09924 .  
## race2Other/Unknown                   0.09519 .  
## race2White                           0.04174 *  
## married2single/seperated             0.01019 *  
## married2Unknown                      0.28257    
## married2Widowed                      0.04048 *  
## hist2Bronchioalveolar                0.06753 .  
## hist2Carcinoid                      3.74e-06 ***
## hist2Large cell                     1.11e-05 ***
## hist2Other                           0.24344    
## hist2Squamous cell                   0.04669 *  
## site2Middle lobe                     0.96430    
## site2Other,NOS                       0.46353    
## site2Upper lobe                      0.01129 *  
## grade2G2: Moderately differentiated 8.40e-07 ***
## grade2G3: Poorly differentiated     3.74e-16 ***
## grade2G4: Undifferentiated/Unknown  1.71e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                            0.00347 ** 
## Year_of_diagnosis2019                0.11706    
## Year_of_diagnosis2020                0.01168 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## node_anyyes                            0.6642     1.5055    0.5507    0.8011
## age2                                   1.0302     0.9707    1.0231    1.0374
## SexMale                                1.4513     0.6890    1.2917    1.6307
## race2Black                             1.2771     0.7830    0.9548    1.7082
## race2Other/Unknown                     1.6243     0.6156    0.9188    2.8716
## race2White                             1.2646     0.7908    1.0088    1.5853
## married2single/seperated               1.1904     0.8400    1.0422    1.3598
## married2Unknown                        1.1581     0.8635    0.8861    1.5136
## married2Widowed                        1.1963     0.8359    1.0078    1.4201
## hist2Bronchioalveolar                  0.8065     1.2400    0.6404    1.0156
## hist2Carcinoid                         0.3071     3.2564    0.1862    0.5065
## hist2Large cell                        2.0261     0.4936    1.4787    2.7762
## hist2Other                             0.9168     1.0908    0.7922    1.0609
## hist2Squamous cell                     1.1599     0.8621    1.0022    1.3425
## site2Middle lobe                       0.9941     1.0060    0.7661    1.2898
## site2Other,NOS                         0.8835     1.1319    0.6344    1.2304
## site2Upper lobe                        0.8596     1.1634    0.7646    0.9663
## grade2G2: Moderately differentiated    1.8608     0.5374    1.4534    2.3824
## grade2G3: Poorly differentiated        2.8438     0.3516    2.2116    3.6568
## grade2G4: Undifferentiated/Unknown     1.7894     0.5588    1.3725    2.3331
## T_StageT2                              1.8940     0.5280    1.6519    2.1716
## T_StageT3                              2.7409     0.3648    2.3128    3.2483
## T_StageT4                              4.7497     0.2105    3.8665    5.8346
## treat2yes                              1.2157     0.8226    1.0664    1.3859
## Year_of_diagnosis2019                  0.9032     1.1072    0.7952    1.0258
## Year_of_diagnosis2020                  0.7411     1.3494    0.5871    0.9354
## 
## Concordance= 0.731  (se = 0.007 )
## Likelihood ratio test= 925.9  on 26 df,   p=<2e-16
## Wald test            = 871.5  on 26 df,   p=<2e-16
## Score (logrank) test = 1030  on 26 df,   p=<2e-16
#any mediastinal examined
km_fit <- survfit(Surv(sur_time, dead_lung) ~ mediastinal, data = seer)
ggsurvplot(km_fit, data = seer,
           pval = TRUE,              # Adds the p-value
           conf.int = TRUE,          # Adds confidence intervals
           risk.table = TRUE,        # Adds a risk table below the plot
           risk.table.col = "strata",# Colors risk table by groups
           legend.title = "Nodes",   # Label for the legend
           legend.labs = c("No-mediastinal", "Yes-mediastinal"), # Custom legend labels
           xlab = "Time (months)",   # X-axis label
           ylab = "Survival Probability", # Y-axis label
           surv.median.line = "hv",  # Adds median survival lines
           ggtheme = theme_minimal()) # Applies a minimalistic theme
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

cox_model9 <- coxph(Surv(sur_time, dead_lung) ~ mediastinal + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model9)
## Call:
## coxph(formula = Surv(sur_time, dead_lung) ~ mediastinal + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 1279 
## 
##                                          coef exp(coef)  se(coef)      z
## mediastinalyes                      -0.224404  0.798993  0.061171 -3.668
## age2                                 0.029693  1.030138  0.003538  8.392
## SexMale                              0.375360  1.455515  0.059448  6.314
## race2Black                           0.261538  1.298926  0.148324  1.763
## race2Other/Unknown                   0.463709  1.589959  0.290799  1.595
## race2White                           0.235796  1.265916  0.115314  2.045
## married2single/seperated             0.168278  1.183266  0.067900  2.478
## married2Unknown                      0.143065  1.153804  0.136602  1.047
## married2Widowed                      0.182888  1.200679  0.087522  2.090
## hist2Bronchioalveolar               -0.223810  0.799467  0.117654 -1.902
## hist2Carcinoid                      -1.176556  0.308339  0.255198 -4.610
## hist2Large cell                      0.704033  2.021890  0.160659  4.382
## hist2Other                          -0.093623  0.910626  0.074443 -1.258
## hist2Squamous cell                   0.146055  1.157260  0.074572  1.959
## site2Middle lobe                     0.011442  1.011508  0.132900  0.086
## site2Other,NOS                      -0.124098  0.883293  0.168971 -0.734
## site2Upper lobe                     -0.148993  0.861575  0.059725 -2.495
## grade2G2: Moderately differentiated  0.624646  1.867585  0.126087  4.954
## grade2G3: Poorly differentiated      1.047255  2.849819  0.128271  8.164
## grade2G4: Undifferentiated/Unknown   0.596505  1.815761  0.135258  4.410
## T_StageT2                            0.638925  1.894444  0.069777  9.157
## T_StageT3                            1.015969  2.762037  0.086781 11.707
## T_StageT4                            1.569247  4.803029  0.105151 14.924
## treat2yes                            0.203206  1.225325  0.067012  3.032
## Year_of_diagnosis2019               -0.107725  0.897875  0.064942 -1.659
## Year_of_diagnosis2020               -0.305510  0.736747  0.118783 -2.572
##                                     Pr(>|z|)    
## mediastinalyes                      0.000244 ***
## age2                                 < 2e-16 ***
## SexMale                             2.72e-10 ***
## race2Black                          0.077853 .  
## race2Other/Unknown                  0.110802    
## race2White                          0.040872 *  
## married2single/seperated            0.013201 *  
## married2Unknown                     0.294956    
## married2Widowed                     0.036652 *  
## hist2Bronchioalveolar               0.057136 .  
## hist2Carcinoid                      4.02e-06 ***
## hist2Large cell                     1.18e-05 ***
## hist2Other                          0.208521    
## hist2Squamous cell                  0.050161 .  
## site2Middle lobe                    0.931391    
## site2Other,NOS                      0.462683    
## site2Upper lobe                     0.012607 *  
## grade2G2: Moderately differentiated 7.27e-07 ***
## grade2G3: Poorly differentiated     3.23e-16 ***
## grade2G4: Undifferentiated/Unknown  1.03e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           0.002426 ** 
## Year_of_diagnosis2019               0.097162 .  
## Year_of_diagnosis2020               0.010111 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## mediastinalyes                         0.7990     1.2516    0.7087    0.9008
## age2                                   1.0301     0.9707    1.0230    1.0373
## SexMale                                1.4555     0.6870    1.2954    1.6354
## race2Black                             1.2989     0.7699    0.9712    1.7372
## race2Other/Unknown                     1.5900     0.6289    0.8992    2.8114
## race2White                             1.2659     0.7899    1.0098    1.5869
## married2single/seperated               1.1833     0.8451    1.0358    1.3517
## married2Unknown                        1.1538     0.8667    0.8828    1.5080
## married2Widowed                        1.2007     0.8329    1.0114    1.4254
## hist2Bronchioalveolar                  0.7995     1.2508    0.6348    1.0068
## hist2Carcinoid                         0.3083     3.2432    0.1870    0.5085
## hist2Large cell                        2.0219     0.4946    1.4757    2.7702
## hist2Other                             0.9106     1.0981    0.7870    1.0537
## hist2Squamous cell                     1.1573     0.8641    0.9999    1.3394
## site2Middle lobe                       1.0115     0.9886    0.7796    1.3125
## site2Other,NOS                         0.8833     1.1321    0.6343    1.2301
## site2Upper lobe                        0.8616     1.1607    0.7664    0.9686
## grade2G2: Moderately differentiated    1.8676     0.5355    1.4587    2.3911
## grade2G3: Poorly differentiated        2.8498     0.3509    2.2163    3.6644
## grade2G4: Undifferentiated/Unknown     1.8158     0.5507    1.3929    2.3670
## T_StageT2                              1.8944     0.5279    1.6523    2.1721
## T_StageT3                              2.7620     0.3621    2.3300    3.2741
## T_StageT4                              4.8030     0.2082    3.9085    5.9023
## treat2yes                              1.2253     0.8161    1.0745    1.3973
## Year_of_diagnosis2019                  0.8979     1.1137    0.7906    1.0198
## Year_of_diagnosis2020                  0.7367     1.3573    0.5837    0.9299
## 
## Concordance= 0.73  (se = 0.007 )
## Likelihood ratio test= 922.5  on 26 df,   p=<2e-16
## Wald test            = 866.7  on 26 df,   p=<2e-16
## Score (logrank) test = 1026  on 26 df,   p=<2e-16
#nodes examined by group
km_fit <- survfit(Surv(sur_time, dead_lung) ~ node_group, data = seer)
ggsurvplot(km_fit, data = seer,
           pval = TRUE,              # Adds the p-value
           conf.int = TRUE,          # Adds confidence intervals
           risk.table = TRUE,        # Adds a risk table below the plot
           risk.table.col = "strata",# Colors risk table by groups
           legend.title = "Nodes",   # Label for the legend
           legend.labs = c("0 Nodes", "1-4 Nodes", "10+ nodes", "5-10 nodes"), # Custom legend labels
           xlab = "Time (months)",   # X-axis label
           ylab = "Survival Probability", # Y-axis label
           surv.median.line = "hv",  # Adds median survival lines
           ggtheme = theme_minimal()) # Applies a minimalistic theme
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

cox_model10 <- coxph(Surv(sur_time, dead_lung) ~ node_group + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model10)
## Call:
## coxph(formula = Surv(sur_time, dead_lung) ~ node_group + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 1279 
## 
##                                          coef exp(coef)  se(coef)      z
## node_group1-4 nodes                 -0.281566  0.754601  0.115059 -2.447
## node_group10+ nodes                 -0.428726  0.651338  0.100228 -4.278
## node_group5-10 nodes                -0.451441  0.636710  0.105620 -4.274
## age2                                 0.029523  1.029963  0.003533  8.357
## SexMale                              0.372533  1.451406  0.059451  6.266
## race2Black                           0.241634  1.273328  0.148410  1.628
## race2Other/Unknown                   0.485507  1.624999  0.290756  1.670
## race2White                           0.236920  1.267340  0.115314  2.055
## married2single/seperated             0.172847  1.188685  0.067872  2.547
## married2Unknown                      0.145834  1.157004  0.136616  1.067
## married2Widowed                      0.180805  1.198182  0.087505  2.066
## hist2Bronchioalveolar               -0.213182  0.808009  0.117674 -1.812
## hist2Carcinoid                      -1.179988  0.307282  0.255284 -4.622
## hist2Large cell                      0.706046  2.025965  0.160728  4.393
## hist2Other                          -0.083094  0.920265  0.074534 -1.115
## hist2Squamous cell                   0.150595  1.162526  0.074632  2.018
## site2Middle lobe                    -0.009955  0.990095  0.132919 -0.075
## site2Other,NOS                      -0.121822  0.885306  0.169060 -0.721
## site2Upper lobe                     -0.150709  0.860098  0.059737 -2.523
## grade2G2: Moderately differentiated  0.624860  1.867985  0.126106  4.955
## grade2G3: Poorly differentiated      1.049633  2.856602  0.128334  8.179
## grade2G4: Undifferentiated/Unknown   0.586484  1.797657  0.135385  4.332
## T_StageT2                            0.641393  1.899124  0.069798  9.189
## T_StageT3                            1.016514  2.763544  0.086810 11.710
## T_StageT4                            1.568440  4.799155  0.105540 14.861
## treat2yes                            0.203292  1.225430  0.067242  3.023
## Year_of_diagnosis2019               -0.100731  0.904176  0.064979 -1.550
## Year_of_diagnosis2020               -0.298247  0.742118  0.118831 -2.510
##                                     Pr(>|z|)    
## node_group1-4 nodes                   0.0144 *  
## node_group10+ nodes                 1.89e-05 ***
## node_group5-10 nodes                1.92e-05 ***
## age2                                 < 2e-16 ***
## SexMale                             3.70e-10 ***
## race2Black                            0.1035    
## race2Other/Unknown                    0.0950 .  
## race2White                            0.0399 *  
## married2single/seperated              0.0109 *  
## married2Unknown                       0.2858    
## married2Widowed                       0.0388 *  
## hist2Bronchioalveolar                 0.0700 .  
## hist2Carcinoid                      3.80e-06 ***
## hist2Large cell                     1.12e-05 ***
## hist2Other                            0.2649    
## hist2Squamous cell                    0.0436 *  
## site2Middle lobe                      0.9403    
## site2Other,NOS                        0.4712    
## site2Upper lobe                       0.0116 *  
## grade2G2: Moderately differentiated 7.23e-07 ***
## grade2G3: Poorly differentiated     2.86e-16 ***
## grade2G4: Undifferentiated/Unknown  1.48e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                             0.0025 ** 
## Year_of_diagnosis2019                 0.1211    
## Year_of_diagnosis2020                 0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## node_group1-4 nodes                    0.7546     1.3252    0.6023    0.9455
## node_group10+ nodes                    0.6513     1.5353    0.5352    0.7927
## node_group5-10 nodes                   0.6367     1.5706    0.5177    0.7832
## age2                                   1.0300     0.9709    1.0229    1.0371
## SexMale                                1.4514     0.6890    1.2918    1.6308
## race2Black                             1.2733     0.7853    0.9519    1.7032
## race2Other/Unknown                     1.6250     0.6154    0.9191    2.8731
## race2White                             1.2673     0.7891    1.0110    1.5887
## married2single/seperated               1.1887     0.8413    1.0406    1.3578
## married2Unknown                        1.1570     0.8643    0.8852    1.5122
## married2Widowed                        1.1982     0.8346    1.0093    1.4224
## hist2Bronchioalveolar                  0.8080     1.2376    0.6416    1.0176
## hist2Carcinoid                         0.3073     3.2543    0.1863    0.5068
## hist2Large cell                        2.0260     0.4936    1.4785    2.7762
## hist2Other                             0.9203     1.0866    0.7952    1.0650
## hist2Squamous cell                     1.1625     0.8602    1.0043    1.3456
## site2Middle lobe                       0.9901     1.0100    0.7630    1.2847
## site2Other,NOS                         0.8853     1.1296    0.6356    1.2331
## site2Upper lobe                        0.8601     1.1627    0.7651    0.9669
## grade2G2: Moderately differentiated    1.8680     0.5353    1.4589    2.3917
## grade2G3: Poorly differentiated        2.8566     0.3501    2.2213    3.6736
## grade2G4: Undifferentiated/Unknown     1.7977     0.5563    1.3787    2.3439
## T_StageT2                              1.8991     0.5266    1.6563    2.1775
## T_StageT3                              2.7635     0.3619    2.3312    3.2761
## T_StageT4                              4.7992     0.2084    3.9024    5.9020
## treat2yes                              1.2254     0.8160    1.0741    1.3981
## Year_of_diagnosis2019                  0.9042     1.1060    0.7961    1.0270
## Year_of_diagnosis2020                  0.7421     1.3475    0.5879    0.9367
## 
## Concordance= 0.731  (se = 0.007 )
## Likelihood ratio test= 929.7  on 28 df,   p=<2e-16
## Wald test            = 874.4  on 28 df,   p=<2e-16
## Score (logrank) test = 1033  on 28 df,   p=<2e-16
#nodes continuous 
cox_model11 <- coxph(Surv(sur_time, dead_lung) ~ Regional_nodes_examined__1988__ + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model11)
## Call:
## coxph(formula = Surv(sur_time, dead_lung) ~ Regional_nodes_examined__1988__ + 
##     age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + 
##     T_Stage + treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 1279 
## 
##                                          coef exp(coef)  se(coef)      z
## Regional_nodes_examined__1988__     -0.002564  0.997440  0.003022 -0.848
## age2                                 0.030192  1.030652  0.003539  8.530
## SexMale                              0.374253  1.453906  0.059506  6.289
## race2Black                           0.260672  1.297802  0.148340  1.757
## race2Other/Unknown                   0.483108  1.621105  0.290692  1.662
## race2White                           0.246209  1.279167  0.115291  2.136
## married2single/seperated             0.174573  1.190738  0.067864  2.572
## married2Unknown                      0.155495  1.168236  0.136609  1.138
## married2Widowed                      0.184388  1.202482  0.087542  2.106
## hist2Bronchioalveolar               -0.226184  0.797571  0.117630 -1.923
## hist2Carcinoid                      -1.175773  0.308580  0.255287 -4.606
## hist2Large cell                      0.696589  2.006896  0.160700  4.335
## hist2Other                          -0.096647  0.907876  0.074463 -1.298
## hist2Squamous cell                   0.147863  1.159354  0.074649  1.981
## site2Middle lobe                    -0.002819  0.997185  0.132888 -0.021
## site2Other,NOS                      -0.089902  0.914021  0.168618 -0.533
## site2Upper lobe                     -0.148619  0.861897  0.059714 -2.489
## grade2G2: Moderately differentiated  0.615003  1.849663  0.126064  4.879
## grade2G3: Poorly differentiated      1.040087  2.829464  0.128296  8.107
## grade2G4: Undifferentiated/Unknown   0.595130  1.813267  0.135259  4.400
## T_StageT2                            0.623527  1.865496  0.069658  8.951
## T_StageT3                            0.997481  2.711442  0.086700 11.505
## T_StageT4                            1.547338  4.698946  0.105552 14.659
## treat2yes                            0.187112  1.205762  0.067155  2.786
## Year_of_diagnosis2019               -0.107666  0.897928  0.064953 -1.658
## Year_of_diagnosis2020               -0.305647  0.736647  0.118799 -2.573
##                                     Pr(>|z|)    
## Regional_nodes_examined__1988__      0.39627    
## age2                                 < 2e-16 ***
## SexMale                             3.19e-10 ***
## race2Black                           0.07887 .  
## race2Other/Unknown                   0.09653 .  
## race2White                           0.03272 *  
## married2single/seperated             0.01010 *  
## married2Unknown                      0.25502    
## married2Widowed                      0.03518 *  
## hist2Bronchioalveolar                0.05450 .  
## hist2Carcinoid                      4.11e-06 ***
## hist2Large cell                     1.46e-05 ***
## hist2Other                           0.19431    
## hist2Squamous cell                   0.04762 *  
## site2Middle lobe                     0.98308    
## site2Other,NOS                       0.59392    
## site2Upper lobe                      0.01282 *  
## grade2G2: Moderately differentiated 1.07e-06 ***
## grade2G3: Poorly differentiated     5.19e-16 ***
## grade2G4: Undifferentiated/Unknown  1.08e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                            0.00533 ** 
## Year_of_diagnosis2019                0.09740 .  
## Year_of_diagnosis2020                0.01009 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## Regional_nodes_examined__1988__        0.9974     1.0026    0.9915    1.0034
## age2                                   1.0307     0.9703    1.0235    1.0378
## SexMale                                1.4539     0.6878    1.2939    1.6338
## race2Black                             1.2978     0.7705    0.9704    1.7357
## race2Other/Unknown                     1.6211     0.6169    0.9170    2.8658
## race2White                             1.2792     0.7818    1.0204    1.6035
## married2single/seperated               1.1907     0.8398    1.0424    1.3601
## married2Unknown                        1.1682     0.8560    0.8938    1.5269
## married2Widowed                        1.2025     0.8316    1.0129    1.4276
## hist2Bronchioalveolar                  0.7976     1.2538    0.6333    1.0044
## hist2Carcinoid                         0.3086     3.2406    0.1871    0.5089
## hist2Large cell                        2.0069     0.4983    1.4647    2.7499
## hist2Other                             0.9079     1.1015    0.7846    1.0505
## hist2Squamous cell                     1.1594     0.8625    1.0016    1.3420
## site2Middle lobe                       0.9972     1.0028    0.7685    1.2939
## site2Other,NOS                         0.9140     1.0941    0.6568    1.2720
## site2Upper lobe                        0.8619     1.1602    0.7667    0.9689
## grade2G2: Moderately differentiated    1.8497     0.5406    1.4447    2.3681
## grade2G3: Poorly differentiated        2.8295     0.3534    2.2004    3.6384
## grade2G4: Undifferentiated/Unknown     1.8133     0.5515    1.3910    2.3637
## T_StageT2                              1.8655     0.5361    1.6274    2.1384
## T_StageT3                              2.7114     0.3688    2.2877    3.2137
## T_StageT4                              4.6989     0.2128    3.8208    5.7789
## treat2yes                              1.2058     0.8294    1.0571    1.3754
## Year_of_diagnosis2019                  0.8979     1.1137    0.7906    1.0198
## Year_of_diagnosis2020                  0.7366     1.3575    0.5836    0.9298
## 
## Concordance= 0.729  (se = 0.007 )
## Likelihood ratio test= 910.1  on 26 df,   p=<2e-16
## Wald test            = 856.4  on 26 df,   p=<2e-16
## Score (logrank) test = 1015  on 26 df,   p=<2e-16
#nodes continuous (log transformed)
cox_model12 <- coxph(Surv(sur_time, dead_lung) ~ node_log + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model12)
## Call:
## coxph(formula = Surv(sur_time, dead_lung) ~ node_log + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 24041, number of events= 1279 
## 
##                                          coef exp(coef)  se(coef)      z
## node_log                            -0.106573  0.898909  0.030498 -3.494
## age2                                 0.029870  1.030321  0.003536  8.448
## SexMale                              0.375730  1.456054  0.059468  6.318
## race2Black                           0.249375  1.283224  0.148394  1.680
## race2Other/Unknown                   0.485234  1.624555  0.290748  1.669
## race2White                           0.245098  1.277746  0.115277  2.126
## married2single/seperated             0.171845  1.187494  0.067871  2.532
## married2Unknown                      0.155782  1.168571  0.136608  1.140
## married2Widowed                      0.184409  1.202507  0.087517  2.107
## hist2Bronchioalveolar               -0.219186  0.803172  0.117637 -1.863
## hist2Carcinoid                      -1.176567  0.308335  0.255282 -4.609
## hist2Large cell                      0.706233  2.026343  0.160713  4.394
## hist2Other                          -0.086302  0.917318  0.074516 -1.158
## hist2Squamous cell                   0.153959  1.166443  0.074623  2.063
## site2Middle lobe                    -0.009766  0.990281  0.132911 -0.073
## site2Other,NOS                      -0.103971  0.901251  0.168789 -0.616
## site2Upper lobe                     -0.148193  0.862265  0.059718 -2.482
## grade2G2: Moderately differentiated  0.622064  1.862769  0.126087  4.934
## grade2G3: Poorly differentiated      1.048970  2.854709  0.128296  8.176
## grade2G4: Undifferentiated/Unknown   0.591917  1.807450  0.135292  4.375
## T_StageT2                            0.635139  1.887285  0.069757  9.105
## T_StageT3                            1.015276  2.760126  0.086788 11.698
## T_StageT4                            1.579481  4.852437  0.105541 14.966
## treat2yes                            0.208199  1.231458  0.067213  3.098
## Year_of_diagnosis2019               -0.103626  0.901562  0.064964 -1.595
## Year_of_diagnosis2020               -0.300460  0.740477  0.118810 -2.529
##                                     Pr(>|z|)    
## node_log                            0.000475 ***
## age2                                 < 2e-16 ***
## SexMale                             2.65e-10 ***
## race2Black                          0.092862 .  
## race2Other/Unknown                  0.095135 .  
## race2White                          0.033490 *  
## married2single/seperated            0.011344 *  
## married2Unknown                     0.254138    
## married2Widowed                     0.035106 *  
## hist2Bronchioalveolar               0.062429 .  
## hist2Carcinoid                      4.05e-06 ***
## hist2Large cell                     1.11e-05 ***
## hist2Other                          0.246796    
## hist2Squamous cell                  0.039098 *  
## site2Middle lobe                    0.941423    
## site2Other,NOS                      0.537904    
## site2Upper lobe                     0.013081 *  
## grade2G2: Moderately differentiated 8.07e-07 ***
## grade2G3: Poorly differentiated     2.93e-16 ***
## grade2G4: Undifferentiated/Unknown  1.21e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           0.001951 ** 
## Year_of_diagnosis2019               0.110686    
## Year_of_diagnosis2020               0.011442 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## node_log                               0.8989     1.1125    0.8468    0.9543
## age2                                   1.0303     0.9706    1.0232    1.0375
## SexMale                                1.4561     0.6868    1.2959    1.6361
## race2Black                             1.2832     0.7793    0.9594    1.7164
## race2Other/Unknown                     1.6246     0.6156    0.9189    2.8722
## race2White                             1.2777     0.7826    1.0193    1.6017
## married2single/seperated               1.1875     0.8421    1.0396    1.3564
## married2Unknown                        1.1686     0.8557    0.8941    1.5273
## married2Widowed                        1.2025     0.8316    1.0130    1.4275
## hist2Bronchioalveolar                  0.8032     1.2451    0.6378    1.0114
## hist2Carcinoid                         0.3083     3.2432    0.1870    0.5085
## hist2Large cell                        2.0263     0.4935    1.4788    2.7766
## hist2Other                             0.9173     1.0901    0.7927    1.0616
## hist2Squamous cell                     1.1664     0.8573    1.0077    1.3502
## site2Middle lobe                       0.9903     1.0098    0.7632    1.2850
## site2Other,NOS                         0.9013     1.1096    0.6474    1.2546
## site2Upper lobe                        0.8623     1.1597    0.7670    0.9693
## grade2G2: Moderately differentiated    1.8628     0.5368    1.4549    2.3850
## grade2G3: Poorly differentiated        2.8547     0.3503    2.2200    3.6709
## grade2G4: Undifferentiated/Unknown     1.8074     0.5533    1.3865    2.3563
## T_StageT2                              1.8873     0.5299    1.6461    2.1638
## T_StageT3                              2.7601     0.3623    2.3284    3.2719
## T_StageT4                              4.8524     0.2061    3.9457    5.9676
## treat2yes                              1.2315     0.8120    1.0795    1.4049
## Year_of_diagnosis2019                  0.9016     1.1092    0.7938    1.0240
## Year_of_diagnosis2020                  0.7405     1.3505    0.5867    0.9346
## 
## Concordance= 0.73  (se = 0.007 )
## Likelihood ratio test= 921.2  on 26 df,   p=<2e-16
## Wald test            = 865.4  on 26 df,   p=<2e-16
## Score (logrank) test = 1025  on 26 df,   p=<2e-16
#nodes positive continuous 
cox_model13 <- coxph(Surv(sur_time, dead_lung) ~ Regional_nodes_positive__1988__ + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model13)
## Call:
## coxph(formula = Surv(sur_time, dead_lung) ~ Regional_nodes_positive__1988__ + 
##     age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + 
##     T_Stage + treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 22131, number of events= 1154 
##    (1910 observations deleted due to missingness)
## 
##                                          coef exp(coef)  se(coef)      z
## Regional_nodes_positive__1988__      0.112405  1.118966  0.008067 13.934
## age2                                 0.028426  1.028834  0.003699  7.685
## SexMale                              0.384595  1.469019  0.062686  6.135
## race2Black                           0.196410  1.217026  0.155724  1.261
## race2Other/Unknown                   0.435278  1.545393  0.301512  1.444
## race2White                           0.214454  1.239185  0.119331  1.797
## married2single/seperated             0.145444  1.156552  0.071461  2.035
## married2Unknown                      0.051254  1.052590  0.150550  0.340
## married2Widowed                      0.152097  1.164273  0.093167  1.633
## hist2Bronchioalveolar               -0.188499  0.828201  0.123395 -1.528
## hist2Carcinoid                      -1.392259  0.248513  0.300898 -4.627
## hist2Large cell                      0.764439  2.147790  0.168599  4.534
## hist2Other                          -0.104009  0.901217  0.078216 -1.330
## hist2Squamous cell                   0.185526  1.203851  0.078892  2.352
## site2Middle lobe                     0.040872  1.041718  0.139443  0.293
## site2Other,NOS                      -0.108759  0.896946  0.179820 -0.605
## site2Upper lobe                     -0.135947  0.872889  0.062958 -2.159
## grade2G2: Moderately differentiated  0.606966  1.834856  0.136194  4.457
## grade2G3: Poorly differentiated      1.017492  2.766249  0.138467  7.348
## grade2G4: Undifferentiated/Unknown   0.610009  1.840448  0.146140  4.174
## T_StageT2                            0.613260  1.846441  0.073797  8.310
## T_StageT3                            0.995425  2.705874  0.090628 10.984
## T_StageT4                            1.508400  4.519493  0.108716 13.875
## treat2yes                           -0.017691  0.982464  0.071911 -0.246
## Year_of_diagnosis2019               -0.142878  0.866860  0.068323 -2.091
## Year_of_diagnosis2020               -0.324559  0.722846  0.125234 -2.592
##                                     Pr(>|z|)    
## Regional_nodes_positive__1988__      < 2e-16 ***
## age2                                1.53e-14 ***
## SexMale                             8.50e-10 ***
## race2Black                           0.20721    
## race2Other/Unknown                   0.14884    
## race2White                           0.07231 .  
## married2single/seperated             0.04182 *  
## married2Unknown                      0.73352    
## married2Widowed                      0.10257    
## hist2Bronchioalveolar                0.12661    
## hist2Carcinoid                      3.71e-06 ***
## hist2Large cell                     5.79e-06 ***
## hist2Other                           0.18359    
## hist2Squamous cell                   0.01869 *  
## site2Middle lobe                     0.76944    
## site2Other,NOS                       0.54530    
## site2Upper lobe                      0.03082 *  
## grade2G2: Moderately differentiated 8.33e-06 ***
## grade2G3: Poorly differentiated     2.01e-13 ***
## grade2G4: Undifferentiated/Unknown  2.99e-05 ***
## T_StageT2                            < 2e-16 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                            0.80567    
## Year_of_diagnosis2019                0.03651 *  
## Year_of_diagnosis2020                0.00955 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## Regional_nodes_positive__1988__        1.1190     0.8937    1.1014    1.1368
## age2                                   1.0288     0.9720    1.0214    1.0363
## SexMale                                1.4690     0.6807    1.2992    1.6611
## race2Black                             1.2170     0.8217    0.8969    1.6514
## race2Other/Unknown                     1.5454     0.6471    0.8558    2.7905
## race2White                             1.2392     0.8070    0.9808    1.5657
## married2single/seperated               1.1566     0.8646    1.0054    1.3304
## married2Unknown                        1.0526     0.9500    0.7836    1.4139
## married2Widowed                        1.1643     0.8589    0.9700    1.3975
## hist2Bronchioalveolar                  0.8282     1.2074    0.6503    1.0548
## hist2Carcinoid                         0.2485     4.0239    0.1378    0.4482
## hist2Large cell                        2.1478     0.4656    1.5434    2.9888
## hist2Other                             0.9012     1.1096    0.7731    1.0505
## hist2Squamous cell                     1.2039     0.8307    1.0314    1.4052
## site2Middle lobe                       1.0417     0.9600    0.7926    1.3691
## site2Other,NOS                         0.8969     1.1149    0.6305    1.2759
## site2Upper lobe                        0.8729     1.1456    0.7716    0.9875
## grade2G2: Moderately differentiated    1.8349     0.5450    1.4050    2.3962
## grade2G3: Poorly differentiated        2.7662     0.3615    2.1088    3.6287
## grade2G4: Undifferentiated/Unknown     1.8404     0.5433    1.3821    2.4509
## T_StageT2                              1.8464     0.5416    1.5978    2.1338
## T_StageT3                              2.7059     0.3696    2.2655    3.2318
## T_StageT4                              4.5195     0.2213    3.6522    5.5928
## treat2yes                              0.9825     1.0178    0.8533    1.1312
## Year_of_diagnosis2019                  0.8669     1.1536    0.7582    0.9911
## Year_of_diagnosis2020                  0.7228     1.3834    0.5655    0.9239
## 
## Concordance= 0.749  (se = 0.008 )
## Likelihood ratio test= 963.7  on 26 df,   p=<2e-16
## Wald test            = 1034  on 26 df,   p=<2e-16
## Score (logrank) test = 1271  on 26 df,   p=<2e-16
#nodes positive yes vs no
km_fit <- survfit(Surv(sur_time, dead_lung) ~ pos_node2, data = seer)
ggsurvplot(km_fit, data = seer,
           pval = TRUE,              # Adds the p-value
           conf.int = TRUE,          # Adds confidence intervals
           risk.table = TRUE,        # Adds a risk table below the plot
           risk.table.col = "strata",# Colors risk table by groups
           legend.title = "Nodes",   # Label for the legend
           legend.labs = c("0 Nodes Positive", "1+ Nodes Positive"), # Custom legend labels
           xlab = "Time (months)",   # X-axis label
           ylab = "Survival Probability", # Y-axis label
           surv.median.line = "hv",  # Adds median survival lines
           ggtheme = theme_minimal()) # Applies a minimalistic theme
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

cox_model14 <- coxph(Surv(sur_time, dead_any) ~ pos_node2 + age2 + Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + treat2 + Year_of_diagnosis, data = seer)
summary(cox_model14)
## Call:
## coxph(formula = Surv(sur_time, dead_any) ~ pos_node2 + age2 + 
##     Sex + race2 + married2 + hist2 + site2 + grade2 + T_Stage + 
##     treat2 + Year_of_diagnosis, data = seer)
## 
##   n= 22131, number of events= 2008 
##    (1910 observations deleted due to missingness)
## 
##                                          coef exp(coef)  se(coef)      z
## pos_node2yes                         0.888487  2.431449  0.057343 15.494
## age2                                 0.029482  1.029921  0.002812 10.484
## SexMale                              0.393348  1.481934  0.047529  8.276
## race2Black                           0.467655  1.596246  0.119251  3.922
## race2Other/Unknown                   0.411430  1.508974  0.242492  1.697
## race2White                           0.345060  1.412074  0.095114  3.628
## married2single/seperated             0.149934  1.161758  0.054390  2.757
## married2Unknown                      0.120517  1.128080  0.111513  1.081
## married2Widowed                      0.225369  1.252785  0.068725  3.279
## hist2Bronchioalveolar               -0.180614  0.834758  0.090331 -1.999
## hist2Carcinoid                      -1.082031  0.338907  0.174361 -6.206
## hist2Large cell                      0.674410  1.962875  0.137096  4.919
## hist2Other                          -0.123817  0.883542  0.059606 -2.077
## hist2Squamous cell                   0.206493  1.229359  0.060099  3.436
## site2Middle lobe                     0.076568  1.079576  0.101807  0.752
## site2Other,NOS                      -0.061178  0.940656  0.138323 -0.442
## site2Upper lobe                     -0.136338  0.872548  0.047748 -2.855
## grade2G2: Moderately differentiated  0.358556  1.431261  0.090103  3.979
## grade2G3: Poorly differentiated      0.629316  1.876326  0.093364  6.740
## grade2G4: Undifferentiated/Unknown   0.369451  1.446940  0.098314  3.758
## T_StageT2                            0.394653  1.483869  0.053943  7.316
## T_StageT3                            0.758394  2.134845  0.068365 11.093
## T_StageT4                            1.141308  3.130860  0.088198 12.940
## treat2yes                           -0.452146  0.636261  0.061260 -7.381
## Year_of_diagnosis2019               -0.085892  0.917693  0.051662 -1.663
## Year_of_diagnosis2020               -0.173982  0.840312  0.088901 -1.957
##                                     Pr(>|z|)    
## pos_node2yes                         < 2e-16 ***
## age2                                 < 2e-16 ***
## SexMale                              < 2e-16 ***
## race2Black                          8.80e-05 ***
## race2Other/Unknown                  0.089757 .  
## race2White                          0.000286 ***
## married2single/seperated            0.005840 ** 
## married2Unknown                     0.279811    
## married2Widowed                     0.001041 ** 
## hist2Bronchioalveolar               0.045558 *  
## hist2Carcinoid                      5.45e-10 ***
## hist2Large cell                     8.69e-07 ***
## hist2Other                          0.037778 *  
## hist2Squamous cell                  0.000591 ***
## site2Middle lobe                    0.451995    
## site2Other,NOS                      0.658284    
## site2Upper lobe                     0.004299 ** 
## grade2G2: Moderately differentiated 6.91e-05 ***
## grade2G3: Poorly differentiated     1.58e-11 ***
## grade2G4: Undifferentiated/Unknown  0.000171 ***
## T_StageT2                           2.55e-13 ***
## T_StageT3                            < 2e-16 ***
## T_StageT4                            < 2e-16 ***
## treat2yes                           1.57e-13 ***
## Year_of_diagnosis2019               0.096398 .  
## Year_of_diagnosis2020               0.050345 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## pos_node2yes                           2.4314     0.4113    2.1730    2.7207
## age2                                   1.0299     0.9709    1.0243    1.0356
## SexMale                                1.4819     0.6748    1.3501    1.6266
## race2Black                             1.5962     0.6265    1.2636    2.0165
## race2Other/Unknown                     1.5090     0.6627    0.9382    2.4271
## race2White                             1.4121     0.7082    1.1719    1.7014
## married2single/seperated               1.1618     0.8608    1.0443    1.2924
## married2Unknown                        1.1281     0.8865    0.9066    1.4037
## married2Widowed                        1.2528     0.7982    1.0949    1.4334
## hist2Bronchioalveolar                  0.8348     1.1980    0.6993    0.9964
## hist2Carcinoid                         0.3389     2.9507    0.2408    0.4770
## hist2Large cell                        1.9629     0.5095    1.5004    2.5680
## hist2Other                             0.8835     1.1318    0.7861    0.9930
## hist2Squamous cell                     1.2294     0.8134    1.0928    1.3830
## site2Middle lobe                       1.0796     0.9263    0.8843    1.3180
## site2Other,NOS                         0.9407     1.0631    0.7173    1.2336
## site2Upper lobe                        0.8725     1.1461    0.7946    0.9581
## grade2G2: Moderately differentiated    1.4313     0.6987    1.1996    1.7077
## grade2G3: Poorly differentiated        1.8763     0.5330    1.5626    2.2531
## grade2G4: Undifferentiated/Unknown     1.4469     0.6911    1.1933    1.7544
## T_StageT2                              1.4839     0.6739    1.3350    1.6493
## T_StageT3                              2.1348     0.4684    1.8671    2.4410
## T_StageT4                              3.1309     0.3194    2.6338    3.7217
## treat2yes                              0.6363     1.5717    0.5643    0.7174
## Year_of_diagnosis2019                  0.9177     1.0897    0.8293    1.0155
## Year_of_diagnosis2020                  0.8403     1.1900    0.7059    1.0003
## 
## Concordance= 0.714  (se = 0.006 )
## Likelihood ratio test= 1211  on 26 df,   p=<2e-16
## Wald test            = 1241  on 26 df,   p=<2e-16
## Score (logrank) test = 1360  on 26 df,   p=<2e-16