How does health insurance status impact the survival time of individuals diagnosed with prostate-related conditions, and what role do other factors such as education, income, and race/ethnicity play in this context?

Interpretation: Multiple datasets from NHANES are merged here to create a comprehensive dataset. This approach ensures that all relevant variables are included for the analysis, focusing on factors impacting the survival of individuals with prostate-related conditions.

# Load necessary libraries
library(survival)
library(ggplot2)
library(survminer)

## Loading required package: ggpubr

## 
## Attaching package: 'survminer'

## The following object is masked from 'package:survival':
## 
##     myeloma

library(ggpubr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(tidyr)
library(dplyr)
library(haven)
library(Amelia)

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

library(purrr)
library(readr)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

# Read the XPT files with the complete file paths
nhanes_data <- read_xpt("/Users/christacrumrine/Desktop/Ad_methods/DEMO_D.XPT")
nhanes_data1 <- read_xpt("/Users/christacrumrine/Desktop/Ad_methods/ALQ_D.XPT")
nhanes_data2 <- read_xpt("/Users/christacrumrine/Desktop/Ad_methods/HIQ_D.XPT")
nhanes_data3 <- read_xpt("/Users/christacrumrine/Desktop/Ad_methods/KIQ_P_D.XPT")
nhanes_data4 <- read_xpt("/Users/christacrumrine/Desktop/Ad_methods/MCQ_D.XPT")
nhanes_data5 <- read_xpt("/Users/christacrumrine/Desktop/Ad_methods/PSA_D.XPT")
nhanes_data6 <- read_xpt("/Users/christacrumrine/Desktop/Ad_methods/PSQ_D.XPT")



# Combine all ways
#merged_data <- left_join(nhanes_data1, nhanes_data, by="SEQN")
#merged_data <- left_join(merged_data, nhanes_data2, by="SEQN")
#merged_data <- left_join(merged_data, nhanes_data3, by="SEQN")
#merged_data <- left_join(merged_data, nhanes_data4, by="SEQN")
#merged_data <- left_join(merged_data, nhanes_data5, by="SEQN")
#merged_data <- left_join(merged_data, nhanes_data6, by="SEQN")
#merged_data <- left_join(merged_data, nhanes_data7, by="SEQN")
#merged_data <- left_join(merged_data, NHANES_2006, by="SEQN")

df = list(nhanes_data,nhanes_data6, nhanes_data2,nhanes_data3, nhanes_data4, nhanes_data5, nhanes_data6)
final_df <- df %>% reduce(full_join,
                          by  = 'SEQN')


# Merge the datasets using bind_rows
#merged_data <- bind_rows(nhanes_data, nhanes_data1, nhanes_data2, nhanes_data3, nhanes_data4)

srvyin <- paste("NHANES_2005_2006_MORT_2019_PUBLIC (1).dat")   # full .DAT name here
srvyout <- "NHANES_2006" # shorthand dataset name here


# read in the fixed-width format ASCII file
dsn <- read_fwf(file=srvyin,
                col_types = "iiiiiiii",
                fwf_cols(seqn = c(1,6),
                         eligstat = c(15,15),
                         mortstat = c(16,16),
                         ucod_leading = c(17,19),
                         diabetes = c(20,20),
                         hyperten = c(21,21),
                         permth_int = c(43,45),
                         permth_exm = c(46,48)
                ),
                na = c("", ".")
)

dsn <- dsn %>%
  rename(SEQN = seqn)

Interpretation: In this step, key variables relevant to the study, such as education, race/ethnicity, income, and health metrics, are selected and cleaned. This includes handling missing values and transforming some variables to the appropriate format for analysis.

# Create a new dataset with selected variables
selected_data <- final_df %>%
  left_join(dsn, by = "SEQN") %>%
  select(
    Education = DMDEDUC2, #categorical
    Race_Ethnicity = RIDRETH1, #categorical
    Income = INDFMINC, #categorical
    AGE_TOLD_PROSTATE = KID221, #continuous
    DIAGNOSED_PROSTATE = KIQ201, #categorical
    PROSTATE_ENLARGE = KIQ121, #continuous
    AGE_PSA_TEST = MCQ320, 
    PSA_TOTAL = LBXP1, #continuous
    AGE_CURRENT = RIDAGEYR, #continuous
    CITIZEN_STATUS = DMDCITZN, #categorical
    HEALTH_INSURANCE = HIQ011, #categorical
    CURRENT_AGE = RIDAGEYR, #continuous
    #RADIATION_TX = KIQ301,
    #MED_TX = KIQ311,
    mortstat,  # Adding mortstat from dsn,
    permth_exm,
    ucod_leading,
#AGE_AT_DEATH is a continuous variable
  )

 #Replace NA with 0 in all columns of select_data
selected_data <- selected_data %>%
  mutate_all(funs(ifelse(is.na(.), 0, .)))

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

selected_data <- selected_data %>%
  mutate(AGE_TOLD_PROSTATE = as.character(AGE_TOLD_PROSTATE),
         AGE_TOLD_PROSTATE = ifelse(AGE_TOLD_PROSTATE == "85 or greater", "85", AGE_TOLD_PROSTATE),
         AGE_TOLD_PROSTATE = as.numeric(as.character(AGE_TOLD_PROSTATE)))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `AGE_TOLD_PROSTATE =
##   as.numeric(as.character(AGE_TOLD_PROSTATE))`.
## Caused by warning:
## ! NAs introduced by coercion

# First, filter out deaths not caused by ucod_leading == 2
selected_data_filtered <- selected_data %>%
  filter(!(mortstat == 1 & ucod_leading != 2))

#0= non-white and 1=white
selected_data <- selected_data %>%
  mutate(Race_Ethnicity = ifelse(Race_Ethnicity == 1, 1, 0))

selected_data <- na.omit(selected_data)

#0=no insurance, 1=insurance 
selected_data <- selected_data %>%
  mutate(HEALTH_INSURANCE = ifelse(HEALTH_INSURANCE == 1, 1, 0))

#0= no college and 1=college
selected_data <- selected_data %>%
  mutate(Education = case_when(
    Education %in% c(1, 2, 3) ~ 0,
    Education %in% c(4, 5) ~ 1,
    TRUE ~ NA_real_ # Assign NA to values that are not 1, 2, 3, 4, or 5
  )) %>%
  na.omit() # Remove rows with any NA values
selected_data <- na.omit(selected_data)

selected_data <- selected_data %>%
  mutate(Education = ifelse(Education == 99, NA, Education),
         Income = ifelse(Income == 99, NA, Income),
         Race_Ethnicity = ifelse(Race_Ethnicity == 99, NA, Race_Ethnicity),
         # ... add other variables as needed
        ) %>%
  na.omit() # Remove rows with any NA values

# Filter out entries where Income is greater than 11
selected_data <- selected_data %>%
  filter(Income <= 11)

Education has been turned into a dummy variable with 0 equaling high school or less and 1 equaling some college for more.

Health insurance is the focal variable with 0 meaning no health insurance and 1 with health insurance

Race ethnicity is 0 for non-white and 1 for NH-white

library(dplyr)

# Filter to keep only those diagnosed with prostate cancer
selected_data <- selected_data %>%
  filter(DIAGNOSED_PROSTATE == 1)

# Create or update the Age_or_Status variable
selected_data <- selected_data %>%
  mutate(Age_or_Status = ifelse(mortstat == 0, 
                                paste0(CURRENT_AGE, "+"), # Current age with a '+' if alive
                                as.character(floor(CURRENT_AGE + permth_exm / 12)))) # Rounded down age at death if dead

# This code first filters the selected_data to include only individuals diagnosed with prostate cancer.
# Then, for these individuals:
# - If they died from ucod_leading == 2, it calculates their age at death.
# - If they are still alive, it shows their current age with a '+'.

library(stringr)
library(dplyr)

# Selecting CURRENT_AGE and Age_or_Status from the filtered data
age_status_table <- selected_data %>%
  select(CURRENT_AGE, Age_or_Status)

# Create a numeric variable from Age_or_Status
selected_data <- selected_data %>%
  mutate(Age_or_Status_Numeric = as.numeric(str_replace(Age_or_Status, "\\+", "")))

selected_data <- selected_data %>%
  mutate(permth_exm = floor(permth_exm / 12))

selected_data <- selected_data %>%
  mutate(Age_At_Death = AGE_CURRENT + permth_exm)

Subset to Diagnosed Men

Interpretation: The dataset is filtered to include only individuals diagnosed with prostate conditions.

selected_data <- selected_data %>% filter(DIAGNOSED_PROSTATE ==1)

selected_data

## # A tibble: 53 × 18
##    Education Race_Ethnicity Income AGE_TOLD_PROSTATE DIAGNOSED_PROSTATE
##        <dbl>          <dbl>  <dbl>             <dbl>              <dbl>
##  1         1              0      5                79                  1
##  2         1              0      7                65                  1
##  3         1              0      6                61                  1
##  4         1              0      7                64                  1
##  5         0              0     10                62                  1
##  6         1              0     11                54                  1
##  7         0              0      7                64                  1
##  8         1              0     11                80                  1
##  9         0              0      5                70                  1
## 10         0              0     11                59                  1
## # ℹ 43 more rows
## # ℹ 13 more variables: PROSTATE_ENLARGE <dbl>, AGE_PSA_TEST <dbl>,
## #   PSA_TOTAL <dbl>, AGE_CURRENT <dbl>, CITIZEN_STATUS <dbl>,
## #   HEALTH_INSURANCE <dbl>, CURRENT_AGE <dbl>, mortstat <dbl>,
## #   permth_exm <dbl>, ucod_leading <dbl>, Age_or_Status <chr>,
## #   Age_or_Status_Numeric <dbl>, Age_At_Death <dbl>

Interpretation: This section provides a detailed summary of key variables, including demographic and health-related factors.

# Summary statistics of key variables
summary(selected_data$Age_At_Death)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   65.00   78.00   84.00   82.94   89.00   94.00

summary(selected_data$Education)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.566   1.000   1.000

summary(selected_data$Income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   5.000   7.000   6.849   8.000  11.000

summary(selected_data$Race_Ethnicity)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03774 0.00000 1.00000

summary(selected_data$Citizenship_Status)

## Warning: Unknown or uninitialised column: `Citizenship_Status`.

## Length  Class   Mode 
##      0   NULL   NULL

summary(selected_data$Health_Insurance)

## Warning: Unknown or uninitialised column: `Health_Insurance`.

## Length  Class   Mode 
##      0   NULL   NULL

dim(selected_data)  # Check dimensions of the dataframe

## [1] 53 18

summary(selected_data)

##    Education     Race_Ethnicity        Income       AGE_TOLD_PROSTATE
##  Min.   :0.000   Min.   :0.00000   Min.   : 2.000   Min.   :54.00    
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.: 5.000   1st Qu.:64.00    
##  Median :1.000   Median :0.00000   Median : 7.000   Median :69.00    
##  Mean   :0.566   Mean   :0.03774   Mean   : 6.849   Mean   :69.47    
##  3rd Qu.:1.000   3rd Qu.:0.00000   3rd Qu.: 8.000   3rd Qu.:75.00    
##  Max.   :1.000   Max.   :1.00000   Max.   :11.000   Max.   :85.00    
##  DIAGNOSED_PROSTATE PROSTATE_ENLARGE  AGE_PSA_TEST     PSA_TOTAL
##  Min.   :1          Min.   :0.000    Min.   :  0.0   Min.   :0  
##  1st Qu.:1          1st Qu.:1.000    1st Qu.: 54.0   1st Qu.:0  
##  Median :1          Median :1.000    Median : 65.0   Median :0  
##  Mean   :1          Mean   :1.094    Mean   :114.8   Mean   :0  
##  3rd Qu.:1          3rd Qu.:1.000    3rd Qu.: 72.0   3rd Qu.:0  
##  Max.   :1          Max.   :2.000    Max.   :999.0   Max.   :0  
##   AGE_CURRENT    CITIZEN_STATUS HEALTH_INSURANCE  CURRENT_AGE   
##  Min.   :57.00   Min.   :1      Min.   :0.0000   Min.   :57.00  
##  1st Qu.:69.00   1st Qu.:1      1st Qu.:1.0000   1st Qu.:69.00  
##  Median :73.00   Median :1      Median :1.0000   Median :73.00  
##  Mean   :74.28   Mean   :1      Mean   :0.9811   Mean   :74.28  
##  3rd Qu.:80.00   3rd Qu.:1      3rd Qu.:1.0000   3rd Qu.:80.00  
##  Max.   :85.00   Max.   :1      Max.   :1.0000   Max.   :85.00  
##     mortstat        permth_exm     ucod_leading    Age_or_Status     
##  Min.   :0.0000   Min.   : 0.00   Min.   : 0.000   Length:53         
##  1st Qu.:0.0000   1st Qu.: 5.00   1st Qu.: 0.000   Class :character  
##  Median :1.0000   Median : 9.00   Median : 2.000   Mode  :character  
##  Mean   :0.6981   Mean   : 8.66   Mean   : 2.623                     
##  3rd Qu.:1.0000   3rd Qu.:13.00   3rd Qu.: 2.000                     
##  Max.   :1.0000   Max.   :14.00   Max.   :10.000                     
##  Age_or_Status_Numeric  Age_At_Death  
##  Min.   :57.00         Min.   :65.00  
##  1st Qu.:71.00         1st Qu.:78.00  
##  Median :79.00         Median :84.00  
##  Mean   :78.87         Mean   :82.94  
##  3rd Qu.:87.00         3rd Qu.:89.00  
##  Max.   :93.00         Max.   :94.00

sapply(selected_data, function(x) sum(is.na(x)))  # Counts NAs for each variable

##             Education        Race_Ethnicity                Income 
##                     0                     0                     0 
##     AGE_TOLD_PROSTATE    DIAGNOSED_PROSTATE      PROSTATE_ENLARGE 
##                     0                     0                     0 
##          AGE_PSA_TEST             PSA_TOTAL           AGE_CURRENT 
##                     0                     0                     0 
##        CITIZEN_STATUS      HEALTH_INSURANCE           CURRENT_AGE 
##                     0                     0                     0 
##              mortstat            permth_exm          ucod_leading 
##                     0                     0                     0 
##         Age_or_Status Age_or_Status_Numeric          Age_At_Death 
##                     0                     0                     0

sapply(selected_data, class)  # Displays the data type of each variable

##             Education        Race_Ethnicity                Income 
##             "numeric"             "numeric"             "numeric" 
##     AGE_TOLD_PROSTATE    DIAGNOSED_PROSTATE      PROSTATE_ENLARGE 
##             "numeric"             "numeric"             "numeric" 
##          AGE_PSA_TEST             PSA_TOTAL           AGE_CURRENT 
##             "numeric"             "numeric"             "numeric" 
##        CITIZEN_STATUS      HEALTH_INSURANCE           CURRENT_AGE 
##             "numeric"             "numeric"             "numeric" 
##              mortstat            permth_exm          ucod_leading 
##             "numeric"             "numeric"             "numeric" 
##         Age_or_Status Age_or_Status_Numeric          Age_At_Death 
##           "character"             "numeric"             "numeric"

# Summary Statistics for Education
summary(selected_data$Education)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.566   1.000   1.000

# Histogram for Education
hist(selected_data$Education, main = "Histogram of Education", xlab = "Education")

# Structure of the dataset
str(selected_data$Education)

##  num [1:53] 1 1 1 1 0 1 0 1 0 0 ...

library(ggplot2)
# Summary Statistics for Education
summary(selected_data$Education)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.566   1.000   1.000

# Summary Statistics for Income
summary(selected_data$Income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   5.000   7.000   6.849   8.000  11.000

# Histogram for Income
hist(selected_data$Income, main = "Histogram of Income", xlab = "Income")

# Bar Plot for Income (if it's a categorical variable)
ggplot(selected_data, aes(x = Income)) +
  geom_bar() +
  labs(
    title = "Distribution of Income ",
    x = "Income ",
    y = "Frequency"
  )

# Summary Statistics for Race_Ethnicity
summary(selected_data$Race_Ethnicity)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03774 0.00000 1.00000

# Histogram for Race_Ethnicity
hist(selected_data$Race_Ethnicity, main = "Histogram of Race_Ethnicity", xlab = "Race_Ethnicity")

# Bar Plot for Race_Ethnicity (if it's a categorical variable)
ggplot(selected_data, aes(x = Race_Ethnicity)) +
  geom_bar() +
  labs(
    title = "Distribution of Race_Ethnicity ",
    x = "Race_Ethnicity ",
    y = "Frequency"
  )

# Summary Statistics for HEALTH_INSURANCE
summary(selected_data$HEALTH_INSURANCE)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.9811  1.0000  1.0000

# Histogram for HEALTH_INSURANCE
hist(selected_data$HEALTH_INSURANCE, main = "Histogram of HEALTH_INSURANCE", xlab = "HEALTH_INSURANCE")

# Bar Plot for HEALTH_INSURANCE (if it's a categorical variable)
ggplot(selected_data, aes(x = HEALTH_INSURANCE)) +
  geom_bar() +
  labs(
    title = "Distribution of HEALTH_INSURANCE ",
    x = "HEALTH_INSURANCE ",
    y = "Frequency"
  )

The histogram shows a distribution with multiple peaks, suggesting that the ages at death are concentrated around certain values. There appears to be a relatively uniform distribution with slight increases in frequency at certain age intervals, notably in the late 70s to early 90s. This could indicate common ages at which death occurs in the studied population, possibly due to common age-related factors.

The histogram does not show a smooth, bell-shaped curve, which suggests that the age at death is not normally distributed in this sample. This could be due to a variety of reasons such as the sample size, the presence of outliers, or the natural life course events that don’t follow a normal distribution pattern.

# Summary Statistics for Age_At_Death
summary(selected_data$Age_At_Death)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   65.00   78.00   84.00   82.94   89.00   94.00

# Histogram for Age_At_Death
hist(selected_data$Age_At_Death, main = "Histogram of Age_At_Death", xlab = "mortstat")

# Box Plot for Age_At_Death
#boxplot(selected_data$Age_At_Death, main = "Box Plot of Age_At_Death", ylab = "mortstat")

# Density Plot for Age_At_Death
#plot(density(selected_data$Age_At_Death), main = "Density Plot of Age_At_Death", xlab = "Age_At_Death")

# Q-Q Plot for Age_At_Death (if it's a continuous variable)
#qqnorm(selected_data$Age_At_Death)
#qqline(selected_data$Age_At_Death)

# Bar Plot for Age_At_Death (if it's a categorical variable)
ggplot(selected_data, aes(x = Age_At_Death)) +
  geom_bar() +
  labs(
    title = "Distribution of Age_At_Death ",
    x = "Age_At_Death ",
    y = "Frequency"
  )

#tabyl(selected_data$Age_At_Death) 

x <-selected_data %>% 
  
  filter(mortstat ==1 )

selected_data <- selected_data %>%
  mutate(ageevent = AGE_CURRENT + permth_exm) # Calculate ageevent

# Assuming you want to perform a similar analysis as with tabyl
library(gtsummary)

# Create a tbl_summary object for ageevent
tbl_ageevent <- selected_data %>%
  select(ageevent) %>%
  tbl_summary()

# Print the table for ageevent
tbl_ageevent

Characteristic	N = 53¹
ageevent	84 (78, 89)
¹ Median (IQR)

Interpretation: A Kaplan-Meier survival curve is created here to estimate the survival function from lifetime data. This is a non-parametric statistic used to estimate the survival probability from observed survival times.

From this curve, we can infer that survival decreases over time, as expected. The curve’s shape could be analyzed to understand specific periods where there may be a higher risk of death post-diagnosis. For instance, since there is a steep drop at a, this might indicate a period of higher mortality risk.

# Calculate Age_At_Death using AGE_CURRENT and permth_exm
selected_data <- selected_data %>%
  mutate(Age_At_Death = AGE_CURRENT + permth_exm)

# Load the survival library (if not already loaded)
library(survival)

surv_obj <- Surv(time = selected_data$Age_At_Death, event = selected_data$mortstat)

# Create a Kaplan-Meier survival curve
km_curve <- survfit(surv_obj ~ 1)  # Assuming you want to compare all individuals

# Plot the Kaplan-Meier survival curve
plot(km_curve, main = "Kaplan-Meier Survival Curve",
     xlab = "Time (Age at Death from Diagnosis)",
     ylab = "Survival Probability")

A Pearson’s Chi-squared test with Yates’ correction, is used to see if different categories are related to each other. The warning about the Chi-squared approximation indicates that your results might not reliable because of the small number of observations. In such cases with small sample sizes, the test will not work well, leading to potentially misleading conclusions. Therefore, while the test suggests that there’s no significant relationship, this conclusion should be taken with caution due to the small sample size involved.

# Create a contingency table between two categorical variables (e.g., Education and Race_Ethnicity)
contingency_table <- table(selected_data$Education, selected_data$Race_Ethnicity)

# Perform a chi-square test
chi_square_test <- chisq.test(contingency_table)

## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect

# Print the chi-square test results
print(chi_square_test)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency_table
## X-squared = 0.84512, df = 1, p-value = 0.3579

# Calculate correlation matrix for numeric variables
numeric_vars <- select_if(selected_data, is.numeric)
correlation_matrix <- cor(numeric_vars)

## Warning in cor(numeric_vars): the standard deviation is zero

# Print the correlation matrix
print(correlation_matrix)

##                         Education Race_Ethnicity      Income AGE_TOLD_PROSTATE
## Education              1.00000000    -0.22616564  0.28581652        0.02579106
## Race_Ethnicity        -0.22616564     1.00000000 -0.02800928        0.23595484
## Income                 0.28581652    -0.02800928  1.00000000       -0.20532880
## AGE_TOLD_PROSTATE      0.02579106     0.23595484 -0.20532880        1.00000000
## DIAGNOSED_PROSTATE             NA             NA          NA                NA
## PROSTATE_ENLARGE       0.15777127    -0.22479938 -0.07659489       -0.18809635
## AGE_PSA_TEST          -0.12402105    -0.03041769 -0.05687990        0.28962081
## PSA_TOTAL                      NA             NA          NA                NA
## AGE_CURRENT            0.03850996     0.24587196 -0.25823030        0.85896063
## CITIZEN_STATUS                 NA             NA          NA                NA
## HEALTH_INSURANCE      -0.12142318     0.02746175  0.04771020        0.16413652
## CURRENT_AGE            0.03850996     0.24587196 -0.25823030        0.85896063
## mortstat              -0.16115798     0.13022340 -0.20675947        0.16966468
## permth_exm             0.04400557    -0.02913382  0.09756670       -0.21934991
## ucod_leading          -0.20587103     0.32572790  0.03737352        0.10275472
## Age_or_Status_Numeric -0.05924712     0.27733161 -0.29671084        0.70558101
## Age_At_Death           0.06332214     0.22195874 -0.19373870        0.70628978
## ageevent               0.06332214     0.22195874 -0.19373870        0.70628978
##                       DIAGNOSED_PROSTATE PROSTATE_ENLARGE AGE_PSA_TEST
## Education                             NA      0.157771274  -0.12402105
## Race_Ethnicity                        NA     -0.224799376  -0.03041769
## Income                                NA     -0.076594893  -0.05687990
## AGE_TOLD_PROSTATE                     NA     -0.188096354   0.28962081
## DIAGNOSED_PROSTATE                     1               NA           NA
## PROSTATE_ENLARGE                      NA      1.000000000  -0.03008526
## AGE_PSA_TEST                          NA     -0.030085258   1.00000000
## PSA_TOTAL                             NA               NA           NA
## AGE_CURRENT                           NA     -0.006869584   0.24674352
## CITIZEN_STATUS                        NA               NA           NA
## HEALTH_INSURANCE                      NA      0.024987509   0.03440192
## CURRENT_AGE                           NA     -0.006869584   0.24674352
## mortstat                              NA     -0.195509433   0.16654298
## permth_exm                            NA      0.037718416  -0.21893512
## ucod_leading                          NA     -0.188643617   0.14031718
## Age_or_Status_Numeric                 NA     -0.121215894   0.20818379
## Age_At_Death                          NA      0.015497760   0.11120611
## ageevent                              NA      0.015497760   0.11120611
##                       PSA_TOTAL  AGE_CURRENT CITIZEN_STATUS HEALTH_INSURANCE
## Education                    NA  0.038509958             NA      -0.12142318
## Race_Ethnicity               NA  0.245871958             NA       0.02746175
## Income                       NA -0.258230302             NA       0.04771020
## AGE_TOLD_PROSTATE            NA  0.858960631             NA       0.16413652
## DIAGNOSED_PROSTATE           NA           NA             NA               NA
## PROSTATE_ENLARGE             NA -0.006869584             NA       0.02498751
## AGE_PSA_TEST                 NA  0.246743524             NA       0.03440192
## PSA_TOTAL                     1           NA             NA               NA
## AGE_CURRENT                  NA  1.000000000             NA       0.22945305
## CITIZEN_STATUS               NA           NA              1               NA
## HEALTH_INSURANCE             NA  0.229453052             NA       1.00000000
## CURRENT_AGE                  NA  1.000000000             NA       0.22945305
## mortstat                     NA  0.274209803             NA      -0.09119215
## permth_exm                   NA -0.254748307             NA      -0.01049229
## ucod_leading                 NA  0.209646550             NA       0.02641146
## Age_or_Status_Numeric        NA  0.876892257             NA       0.12096320
## Age_At_Death                 NA  0.822624529             NA       0.21695371
## ageevent                     NA  0.822624529             NA       0.21695371
##                        CURRENT_AGE    mortstat  permth_exm ucod_leading
## Education              0.038509958 -0.16115798  0.04400557  -0.20587103
## Race_Ethnicity         0.245871958  0.13022340 -0.02913382   0.32572790
## Income                -0.258230302 -0.20675947  0.09756670   0.03737352
## AGE_TOLD_PROSTATE      0.858960631  0.16966468 -0.21934991   0.10275472
## DIAGNOSED_PROSTATE              NA          NA          NA           NA
## PROSTATE_ENLARGE      -0.006869584 -0.19550943  0.03771842  -0.18864362
## AGE_PSA_TEST           0.246743524  0.16654298 -0.21893512   0.14031718
## PSA_TOTAL                       NA          NA          NA           NA
## AGE_CURRENT            1.000000000  0.27420980 -0.25474831   0.20964655
## CITIZEN_STATUS                  NA          NA          NA           NA
## HEALTH_INSURANCE       0.229453052 -0.09119215 -0.01049229   0.02641146
## CURRENT_AGE            1.000000000  0.27420980 -0.25474831   0.20964655
## mortstat               0.274209803  1.00000000 -0.70899954   0.52753845
## permth_exm            -0.254748307 -0.70899954  1.00000000  -0.24916806
## ucod_leading           0.209646550  0.52753845 -0.24916806   1.00000000
## Age_or_Status_Numeric  0.876892257  0.55993693 -0.20102569   0.41101003
## Age_At_Death           0.822624529 -0.15023543  0.34026368   0.05735606
## ageevent               0.822624529 -0.15023543  0.34026368   0.05735606
##                       Age_or_Status_Numeric Age_At_Death    ageevent
## Education                       -0.05924712   0.06332214  0.06332214
## Race_Ethnicity                   0.27733161   0.22195874  0.22195874
## Income                          -0.29671084  -0.19373870 -0.19373870
## AGE_TOLD_PROSTATE                0.70558101   0.70628978  0.70628978
## DIAGNOSED_PROSTATE                       NA           NA          NA
## PROSTATE_ENLARGE                -0.12121589   0.01549776  0.01549776
## AGE_PSA_TEST                     0.20818379   0.11120611  0.11120611
## PSA_TOTAL                                NA           NA          NA
## AGE_CURRENT                      0.87689226   0.82262453  0.82262453
## CITIZEN_STATUS                           NA           NA          NA
## HEALTH_INSURANCE                 0.12096320   0.21695371  0.21695371
## CURRENT_AGE                      0.87689226   0.82262453  0.82262453
## mortstat                         0.55993693  -0.15023543 -0.15023543
## permth_exm                      -0.20102569   0.34026368  0.34026368
## ucod_leading                     0.41101003   0.05735606  0.05735606
## Age_or_Status_Numeric            1.00000000   0.73450107  0.73450107
## Age_At_Death                     0.73450107   1.00000000  1.00000000
## ageevent                         0.73450107   1.00000000  1.00000000

# Visualize the correlation matrix using a heatmap
library(corrplot)

## corrplot 0.92 loaded

corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black", tl.srt = 45)

The overall fit of the model can be assessed by the Concordance statistic (C-index), Likelihood ratio test, Wald test, and Score (logrank) test. A C-index of 0.554 suggests that the model is not particularly strong at predicting outcomes (a C-index of 0.5 suggests no predictive power, and 1.0 suggests perfect prediction).

All the p-values from the tests are not significant (all above 0.05), suggesting that the model does not have strong evidence for the effects of DIAGNOSED_PROSTATE, Education, and Income on the age at death. However, it is worth noting that with a sample size of 59 and 41 events, the study may be underpowered to detect all but the largest effects.

# Assuming selected_data contains your dataset
model_data <- selected_data %>%
  select(Age_At_Death, DIAGNOSED_PROSTATE, Education, Income, mortstat)

# Create a binary event variable
model_data$event <- ifelse(model_data$mortstat == 1, 1, 0)

# Fit a Cox Proportional Hazards regression model
cox_model <- coxph(Surv(Age_At_Death, event) ~ DIAGNOSED_PROSTATE + Education + Income, data = model_data)

summary(cox_model)

## Call:
## coxph(formula = Surv(Age_At_Death, event) ~ DIAGNOSED_PROSTATE + 
##     Education + Income, data = model_data)
## 
##   n= 53, number of events= 37 
## 
##                        coef exp(coef) se(coef)      z Pr(>|z|)
## DIAGNOSED_PROSTATE       NA        NA  0.00000     NA       NA
## Education          -0.21677   0.80511  0.34648 -0.626    0.532
## Income              0.03539   1.03602  0.08026  0.441    0.659
## 
##                    exp(coef) exp(-coef) lower .95 upper .95
## DIAGNOSED_PROSTATE        NA         NA        NA        NA
## Education             0.8051     1.2421    0.4083     1.588
## Income                1.0360     0.9652    0.8852     1.213
## 
## Concordance= 0.516  (se = 0.069 )
## Likelihood ratio test= 0.47  on 2 df,   p=0.8
## Wald test            = 0.46  on 2 df,   p=0.8
## Score (logrank) test = 0.46  on 2 df,   p=0.8

# Calculate time_to_event by adding AGE_TOLD_PROSTATE and Age_At_Death
selected_data <- selected_data %>%
  mutate(time_to_event = Age_At_Death)

# Fit a Cox Proportional Hazards regression model using Race_Ethnicity as a predictor
cox_model_race_ethnicity <- coxph(Surv(time_to_event, mortstat) ~ Race_Ethnicity, data = selected_data)

# View the summary of the model
summary(cox_model_race_ethnicity)

## Call:
## coxph(formula = Surv(time_to_event, mortstat) ~ Race_Ethnicity, 
##     data = selected_data)
## 
##   n= 53, number of events= 37 
## 
##                   coef exp(coef) se(coef)      z Pr(>|z|)
## Race_Ethnicity -0.5622    0.5700   0.7347 -0.765    0.444
## 
##                exp(coef) exp(-coef) lower .95 upper .95
## Race_Ethnicity      0.57      1.755     0.135     2.406
## 
## Concordance= 0.527  (se = 0.019 )
## Likelihood ratio test= 0.69  on 1 df,   p=0.4
## Wald test            = 0.59  on 1 df,   p=0.4
## Score (logrank) test = 0.6  on 1 df,   p=0.4

The Cox Proportional Hazards regression model summary attempts to understand the impact of race/ethnicity on survival time, using data from 57 individuals with 40 events (deaths). The analysis resulted in a coefficient of -0.5759 for race/ethnicity, suggesting a decrease in risk. The hazard ratio of 0.5622 implies that the risk of the event is about 56% for one group compared to another. Thee model does not find this relationship statistically significant, as evidenced by p-values around 0.4 in various tests, suggesting that race/ethnicity is not a significant predictor of survival time of prostate cancer diagnosis. The concordance of 0.526 indicates only a slightly better predictive ability than a random guess. These findings, especially given the small sample size, suggest limited reliability and generalizability, highlighting the need for caution in interpreting these results.

#model age to death, then censor people

# Load required libraries
library(survival)
library(survminer)

# Simulated survival data with demographic variables
set.seed(123)
n <- 100  # Number of observations
time_to_event <- rexp(n, rate = 0.02)  # Simulated survival times
status <- rbinom(n, size = 1, prob = 0.7)  # Simulated censoring (1=event, 0=censored)
HEALTH_INSURANCE <- factor(sample(c("1", "2"), n, replace = TRUE))
CURRENT_AGE <- rnorm(n, mean = 40, sd = 9)

# Create a survival object
surv_data <- Surv(selected_data$time_to_event,  selected_data$mortstat)

# Kaplan-Meier survival curves by HEALTH INSURANCE STATUS
#survfit_HEALTH_INSURANCE <- survfit(surv_data ~ HEALTH_INSURANCE)

# Plot Kaplan-Meier curves
#ggsurvplot(survfit_HEALTH_INSURANCE, data = surv_data, title = "Kaplan-Meier Survival Curves by HEALTH INSURANCE STATUS")

# Perform a log-rank test to compare survival curves
#log_rank_test <- survdiff(surv_data ~ HEALTH_INSURANCE)
#print(log_rank_test)

# Fit a Cox Proportional-Hazards model
#cox_model <- coxph(surv_data ~ HEALTH_INSURANCE + time_to_event)
#summary(cox_model)

surv_data  <- survfit(formula = Surv(time_to_event, mortstat) ~ HEALTH_INSURANCE, 
    data = selected_data)

library(ggplot2)

# Calculate mean and standard deviation of CURRENT_AGE
mean_age <- mean(selected_data$CURRENT_AGE, na.rm = TRUE)
sd_age <- sd(selected_data$CURRENT_AGE, na.rm = TRUE)

# Creating a histogram of CURRENT_AGE and adding mean and SD
ggplot(selected_data, aes(x = CURRENT_AGE)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  geom_vline(aes(xintercept = mean_age), color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = mean_age + sd_age), color = "green", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = mean_age - sd_age), color = "green", linetype = "dashed", size = 1) +
  labs(title = "Histogram of Current Age", x = "Current Age", y = "Frequency") +
  theme_minimal() +
  annotate("text", x = mean_age, y = Inf, label = paste("Mean:", round(mean_age, 2)), vjust = 2, color = "red") +
  annotate("text", x = mean_age + sd_age, y = Inf, label = paste("Mean + SD:", round(mean_age + sd_age, 2)), vjust = 2, color = "green") +
  annotate("text", x = mean_age - sd_age, y = Inf, label = paste("Mean - SD:", round(mean_age - sd_age, 2)), vjust = 2, color = "green")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Interpretation: This section includes checking the proportional hazards assumption, which is crucial for the validity of Cox regression models. Visual and statistical methods are used for this purpose.

# Load the survminer library
library(survminer)

# Check the proportional hazards assumption visually
ggsurvplot(survfit(cox_model), data = model_data, pval = TRUE)

## Warning in .pvalue(fit, data = data, method = method, pval = pval, pval.coord = pval.coord, : There are no survival curves to be compared. 
##  This is a null model.

# Test the proportional hazards assumption using Schoenfeld residuals
#cox_zph <- cox.zph(cox_model)
#plot(cox_zph)

# Assuming 'cox_model' is your Cox regression model
# Check the proportional hazards assumption for a continuous variable (e.g., 'Age_At_Death')
cox_zph <- cox.zph(cox_model, transform = "identity")

# Print the results
print(cox_zph)

##            chisq df    p
## Education 0.4229  1 0.52
## Income    0.0017  1 0.97
## GLOBAL    0.4625  2 0.79

# Calculate mean and standard deviation
mean_age_status <- mean(selected_data$Age_or_Status_Numeric, na.rm = TRUE)
sd_age_status <- sd(selected_data$Age_or_Status_Numeric, na.rm = TRUE)

# Creating a histogram of Age_or_Status_Numeric and adding mean and SD
ggplot(selected_data, aes(x = Age_or_Status_Numeric)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  geom_vline(aes(xintercept = mean_age_status), color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = mean_age_status + sd_age_status), color = "green", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = mean_age_status - sd_age_status), color = "green", linetype = "dashed", size = 1) +
  labs(title = "Histogram of Age or Status", x = "Age or Status", y = "Frequency") +
  theme_minimal() +
  annotate("text", x = mean_age_status, y = Inf, label = paste("Mean:", round(mean_age_status, 2)), vjust = 2, color = "red") +
  annotate("text", x = mean_age_status + sd_age_status, y = Inf, label = paste("Mean + SD:", round(mean_age_status + sd_age_status, 2)), vjust = 2, color = "green") +
  annotate("text", x = mean_age_status - sd_age_status, y = Inf, label = paste("Mean - SD:", round(mean_age_status - sd_age_status, 2)), vjust = 2, color = "green")

model_data <- selected_data %>%
  select(Age_At_Death, DIAGNOSED_PROSTATE, Education, Income, HEALTH_INSURANCE, CITIZEN_STATUS, Race_Ethnicity, mortstat)

cox_model <- coxph(Surv(Age_At_Death, mortstat) ~ 
                    DIAGNOSED_PROSTATE + Education + Income + Race_Ethnicity + CITIZEN_STATUS + HEALTH_INSURANCE, 
                  data = model_data)
summary(cox_model)

## Call:
## coxph(formula = Surv(Age_At_Death, mortstat) ~ DIAGNOSED_PROSTATE + 
##     Education + Income + Race_Ethnicity + CITIZEN_STATUS + HEALTH_INSURANCE, 
##     data = model_data)
## 
##   n= 53, number of events= 37 
## 
##                        coef exp(coef) se(coef)      z Pr(>|z|)  
## DIAGNOSED_PROSTATE       NA        NA  0.00000     NA       NA  
## Education          -0.44454   0.64112  0.37539 -1.184   0.2363  
## Income              0.06862   1.07103  0.08525  0.805   0.4209  
## Race_Ethnicity     -0.86295   0.42192  0.77533 -1.113   0.2657  
## CITIZEN_STATUS           NA        NA  0.00000     NA       NA  
## HEALTH_INSURANCE   -2.53063   0.07961  1.11974 -2.260   0.0238 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                    exp(coef) exp(-coef) lower .95 upper .95
## DIAGNOSED_PROSTATE        NA         NA        NA        NA
## Education            0.64112     1.5598  0.307189    1.3380
## Income               1.07103     0.9337  0.906229    1.2658
## Race_Ethnicity       0.42192     2.3701  0.092313    1.9284
## CITIZEN_STATUS            NA         NA        NA        NA
## HEALTH_INSURANCE     0.07961    12.5614  0.008868    0.7146
## 
## Concordance= 0.592  (se = 0.064 )
## Likelihood ratio test= 4.86  on 4 df,   p=0.3
## Wald test            = 6.37  on 4 df,   p=0.2
## Score (logrank) test = 8.74  on 4 df,   p=0.07

# Create a dataset with Age_At_Death and censoring status (mortstat)
model_data <- selected_data %>%
  select(Age_At_Death, mortstat)

# Fit a Cox Proportional Hazards regression model
cox_model <- coxph(Surv(Age_At_Death, mortstat) ~ 1, data = model_data)

# View the summary of the Cox model
summary(cox_model)

## Call:  coxph(formula = Surv(Age_At_Death, mortstat) ~ 1, data = model_data)
## 
## Null model
##   log likelihood= -116.1613 
##   n= 53

# Create a dataset with Age_At_Death
model_data <- selected_data %>%
  select(Age_At_Death)

# Fit a Cox Proportional Hazards regression model without censoring
cox_model <- coxph(Surv(Age_At_Death) ~ 1, data = model_data)

# View the summary of the Cox model
summary(cox_model)

## Call:  coxph(formula = Surv(Age_At_Death) ~ 1, data = model_data)
## 
## Null model
##   log likelihood= -160.3311 
##   n= 53

ageevent age at event if they died and age of survey if they did not die

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

# Create ageevent variable
selected_data <- selected_data %>%
  mutate(ageevent = ifelse(mortstat == 1, Age_At_Death, CURRENT_AGE))

# Check the first few rows of the dataset to verify the ageevent variable
head(selected_data)

## # A tibble: 6 × 20
##   Education Race_Ethnicity Income AGE_TOLD_PROSTATE DIAGNOSED_PROSTATE
##       <dbl>          <dbl>  <dbl>             <dbl>              <dbl>
## 1         1              0      5                79                  1
## 2         1              0      7                65                  1
## 3         1              0      6                61                  1
## 4         1              0      7                64                  1
## 5         0              0     10                62                  1
## 6         1              0     11                54                  1
## # ℹ 15 more variables: PROSTATE_ENLARGE <dbl>, AGE_PSA_TEST <dbl>,
## #   PSA_TOTAL <dbl>, AGE_CURRENT <dbl>, CITIZEN_STATUS <dbl>,
## #   HEALTH_INSURANCE <dbl>, CURRENT_AGE <dbl>, mortstat <dbl>,
## #   permth_exm <dbl>, ucod_leading <dbl>, Age_or_Status <chr>,
## #   Age_or_Status_Numeric <dbl>, Age_At_Death <dbl>, ageevent <dbl>,
## #   time_to_event <dbl>

tabyl(selected_data$ageevent)

##  selected_data$ageevent n    percent
##                      57 1 0.01886792
##                      60 1 0.01886792
##                      65 1 0.01886792
##                      66 2 0.03773585
##                      68 2 0.03773585
##                      69 4 0.07547170
##                      70 1 0.01886792
##                      71 2 0.03773585
##                      73 3 0.05660377
##                      74 1 0.01886792
##                      76 2 0.03773585
##                      77 2 0.03773585
##                      78 4 0.07547170
##                      79 1 0.01886792
##                      80 3 0.05660377
##                      81 1 0.01886792
##                      82 1 0.01886792
##                      83 2 0.03773585
##                      84 2 0.03773585
##                      85 1 0.01886792
##                      87 3 0.05660377
##                      88 4 0.07547170
##                      89 3 0.05660377
##                      90 1 0.01886792
##                      91 2 0.03773585
##                      92 1 0.01886792
##                      93 2 0.03773585

# Create a survival object
surv_obj <- Surv(time = selected_data$ageevent, event = selected_data$mortstat)

# Fit a Cox Proportional Hazards regression model using ageevent
cox_model_ageevent <- coxph(Surv(ageevent, mortstat) ~ Education + Race_Ethnicity + Income + DIAGNOSED_PROSTATE + HEALTH_INSURANCE + CITIZEN_STATUS, data = selected_data)

# View the summary of the Cox model
summary(cox_model_ageevent)

## Call:
## coxph(formula = Surv(ageevent, mortstat) ~ Education + Race_Ethnicity + 
##     Income + DIAGNOSED_PROSTATE + HEALTH_INSURANCE + CITIZEN_STATUS, 
##     data = selected_data)
## 
##   n= 53, number of events= 37 
## 
##                        coef exp(coef) se(coef)      z Pr(>|z|)  
## Education          -0.50302   0.60470  0.38453 -1.308   0.1908  
## Race_Ethnicity     -1.66884   0.18846  0.82280 -2.028   0.0425 *
## Income              0.10290   1.10838  0.08306  1.239   0.2154  
## DIAGNOSED_PROSTATE       NA        NA  0.00000     NA       NA  
## HEALTH_INSURANCE   -2.47634   0.08405  1.12074 -2.210   0.0271 *
## CITIZEN_STATUS           NA        NA  0.00000     NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                    exp(coef) exp(-coef) lower .95 upper .95
## Education            0.60470     1.6537  0.284593    1.2849
## Race_Ethnicity       0.18846     5.3060  0.037572    0.9454
## Income               1.10838     0.9022  0.941866    1.3043
## DIAGNOSED_PROSTATE        NA         NA        NA        NA
## HEALTH_INSURANCE     0.08405    11.8977  0.009345    0.7560
## CITIZEN_STATUS            NA         NA        NA        NA
## 
## Concordance= 0.616  (se = 0.071 )
## Likelihood ratio test= 8.44  on 4 df,   p=0.08
## Wald test            = 8.63  on 4 df,   p=0.07
## Score (logrank) test = 11  on 4 df,   p=0.03

# Assuming your dataset is named 'data'
# and mortstat = 1 means the individual is alive, 
# while mortstat = 0 means the individual is deceased.

# Creating the time_to_event variable
selected_data$time_to_event <- selected_data$permth_exm

# Creating the status variable based on mortstat
# If mortstat = 0 (alive), then status = 0 (censored)
# If mortstat = 0 (deceased), then status = 1 (event occurred)
selected_data$status <- ifelse(selected_data$mortstat == 0, 1, 1)


# Creating the survival object
surv_object1 <- Surv(time = selected_data$time_to_event, event = selected_data$status)

surv_object <- Surv(time = selected_data$time_to_event, event = selected_data$status)

km_fit <- survfit(surv_object ~ 1, data = selected_data)

ggsurvplot(km_fit, 
           data = selected_data, 
           xlab = "Years", 
           ylab = "Survival probability",
           title = "Kaplan-Meier Survival Curve")

cox_model <- coxph(surv_object ~ AGE_TOLD_PROSTATE, data = selected_data)
summary(cox_model)

## Call:
## coxph(formula = surv_object ~ AGE_TOLD_PROSTATE, data = selected_data)
## 
##   n= 53, number of events= 53 
## 
##                      coef exp(coef) se(coef)     z Pr(>|z|)
## AGE_TOLD_PROSTATE 0.01418   1.01428  0.02140 0.663    0.508
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## AGE_TOLD_PROSTATE     1.014     0.9859    0.9726     1.058
## 
## Concordance= 0.569  (se = 0.048 )
## Likelihood ratio test= 0.44  on 1 df,   p=0.5
## Wald test            = 0.44  on 1 df,   p=0.5
## Score (logrank) test = 0.44  on 1 df,   p=0.5

cox.zph(cox_model)

##                   chisq df     p
## AGE_TOLD_PROSTATE  3.49  1 0.062
## GLOBAL             3.49  1 0.062

# Assuming 'surv_object' is your survival object
KM_fit <- survfit(surv_object ~ 1, data = selected_data)

plot(KM_fit)

# Convert HEALTH_INSURANCE to a factor if it is not already
selected_data$HEALTH_INSURANCE <- as.factor(selected_data$HEALTH_INSURANCE)

# Create the survival object
surv_object <- Surv(time = selected_data$ageevent, event = selected_data$status)

# Fit the Cox proportional hazards model
cox_model <- coxph(surv_object ~ HEALTH_INSURANCE, data = selected_data)

# View the summary of the Cox model
summary(cox_model)

## Call:
## coxph(formula = surv_object ~ HEALTH_INSURANCE, data = selected_data)
## 
##   n= 53, number of events= 53 
## 
##                      coef exp(coef) se(coef)      z Pr(>|z|)
## HEALTH_INSURANCE1 -1.2890    0.2755   1.0384 -1.241    0.215
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## HEALTH_INSURANCE1    0.2755      3.629     0.036     2.109
## 
## Concordance= 0.51  (se = 0.01 )
## Likelihood ratio test= 1.09  on 1 df,   p=0.3
## Wald test            = 1.54  on 1 df,   p=0.2
## Score (logrank) test = 1.77  on 1 df,   p=0.2

# Pull specific columns
# pull columns named 'Age_At_Death' and 'Age_Told_Prostate'
extracted_columns <- selected_data %>% select(Age_or_Status, AGE_TOLD_PROSTATE)

# Pull specific rows
# For example, to pull rows 1 to 5
extracted_rows <- selected_data %>% slice(1:5)

# If you want to pull rows based on a condition
# pull rows where Age_At_Death is greater than 70
extracted_condition_rows <- selected_data %>% filter(Age_At_Death > 70)

# Display the extracted data
print(extracted_columns)

## # A tibble: 53 × 2
##    Age_or_Status AGE_TOLD_PROSTATE
##    <chr>                     <dbl>
##  1 93                           79
##  2 78                           65
##  3 71                           61
##  4 77                           64
##  5 68                           62
##  6 57+                          54
##  7 74                           64
##  8 89                           80
##  9 88                           70
## 10 69                           59
## # ℹ 43 more rows

print(extracted_rows)

## # A tibble: 5 × 21
##   Education Race_Ethnicity Income AGE_TOLD_PROSTATE DIAGNOSED_PROSTATE
##       <dbl>          <dbl>  <dbl>             <dbl>              <dbl>
## 1         1              0      5                79                  1
## 2         1              0      7                65                  1
## 3         1              0      6                61                  1
## 4         1              0      7                64                  1
## 5         0              0     10                62                  1
## # ℹ 16 more variables: PROSTATE_ENLARGE <dbl>, AGE_PSA_TEST <dbl>,
## #   PSA_TOTAL <dbl>, AGE_CURRENT <dbl>, CITIZEN_STATUS <dbl>,
## #   HEALTH_INSURANCE <fct>, CURRENT_AGE <dbl>, mortstat <dbl>,
## #   permth_exm <dbl>, ucod_leading <dbl>, Age_or_Status <chr>,
## #   Age_or_Status_Numeric <dbl>, Age_At_Death <dbl>, ageevent <dbl>,
## #   time_to_event <dbl>, status <dbl>

print(extracted_condition_rows)

## # A tibble: 47 × 21
##    Education Race_Ethnicity Income AGE_TOLD_PROSTATE DIAGNOSED_PROSTATE
##        <dbl>          <dbl>  <dbl>             <dbl>              <dbl>
##  1         1              0      5                79                  1
##  2         1              0      7                65                  1
##  3         1              0      6                61                  1
##  4         1              0      7                64                  1
##  5         0              0      7                64                  1
##  6         1              0     11                80                  1
##  7         0              0      5                70                  1
##  8         1              0      6                78                  1
##  9         1              0      4                73                  1
## 10         1              0     11                75                  1
## # ℹ 37 more rows
## # ℹ 16 more variables: PROSTATE_ENLARGE <dbl>, AGE_PSA_TEST <dbl>,
## #   PSA_TOTAL <dbl>, AGE_CURRENT <dbl>, CITIZEN_STATUS <dbl>,
## #   HEALTH_INSURANCE <fct>, CURRENT_AGE <dbl>, mortstat <dbl>,
## #   permth_exm <dbl>, ucod_leading <dbl>, Age_or_Status <chr>,
## #   Age_or_Status_Numeric <dbl>, Age_At_Death <dbl>, ageevent <dbl>,
## #   time_to_event <dbl>, status <dbl>

# Load necessary libraries
library(survival)
library(survminer)

extracted_condition_rows$time_to_event <- as.numeric(extracted_condition_rows$time_to_event)


# Assuming 'selected_data' is your dataset
# Ensure 'ageevent' is numeric and 'status' is a binary indicator (0 or 1)

# Create the survival object
surv_object <- Surv(time = extracted_condition_rows$time_to_event, event = extracted_condition_rows$status)


# Fit the Kaplan-Meier model
km_fit <- survfit(surv_object ~ 1)

# Plot the Kaplan-Meier survival curve
ggsurvplot(km_fit, 
           data = extracted_condition_rows, 
           xlab = "Years since Diagnosis", 
           ylab = "Survival Probability",
           title = "Kaplan-Meier Survival Curve",
           xlim = c(0, max(extracted_condition_rows$time_to_event))) # Adjust the x-axis limit as needed

# start from 0 (year of diagnosis) ensure that 'ageevent' is calculated as the time since diagnosis

The purpose of the analysis is to visualize how the probability to survival changes over time for those diagnosed with prostate cancer.

# Ensure that the 'Education' variable is a factor
selected_data$Education <- as.factor(selected_data$Education)

# Create the survival object
surv_obj <- Surv(time = selected_data$time_to_event, event = selected_data$status)

# Fit the Kaplan-Meier model for the Education variable
km_fit_education <- survfit(surv_obj ~ Education, data = selected_data)

# Plot the Kaplan-Meier survival curve
ggsurvplot(km_fit_education, 
           data = selected_data, 
           pval = TRUE, # Show p-value of log-rank test
           conf.int = TRUE, # Show confidence intervals
           palette = "Dark2", # Color palette
           xlab = "Time", 
           ylab = "Survival probability",
           title = "Kaplan-Meier Survival Curve by Education Level")

# time_to_event and status variables are correctly defined in your dataset

# Fit the Cox Proportional Hazards regression model
cox_model <- coxph(Surv(time_to_event, status) ~ HEALTH_INSURANCE + Education + Income + Race_Ethnicity, data = selected_data)

# Create a dataframe of the model's summary
cox_summary <- broom::tidy(cox_model)

# Create a plot of hazard ratios
ggplot(cox_summary, aes(x = term, y = estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error), width = 0.1) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme_minimal() +
  labs(title = "Hazard Ratios from Cox Proportional Hazards Model",
       x = "Covariates",
       y = "Hazard Ratio Estimate")

# Note: This plot will show the point estimates of hazard ratios and their corresponding confidence intervals.

# Fit the Kaplan-Meier model
km_fit <- survfit(Surv(time_to_event, status) ~ Education, data = extracted_condition_rows)

# Plot the Kaplan-Meier survival curve
ggsurvplot(km_fit, 
           data = extracted_condition_rows, 
           xlab = "Years since Diagnosis", 
           ylab = "Survival Probability",
           title = "Kaplan-Meier Survival Curve",
           xlim = c(0, max(extracted_condition_rows$time_to_event))) # Adjust the x-axis limit as needed

# start from 0 (year of diagnosis) ensure that 'ageevent' is calculated as the time since diagnosis

# Fit the Kaplan-Meier model
km_fit <- survfit(Surv(time_to_event, status) ~ HEALTH_INSURANCE, data = extracted_condition_rows)

# Plot the Kaplan-Meier survival curve
ggsurvplot(km_fit, 
           data = extracted_condition_rows, 
           xlab = "Years since Diagnosis", 
           ylab = "Survival Probability",
           title = "Kaplan-Meier Survival Curve",
           xlim = c(0, max(extracted_condition_rows$time_to_event))) # Adjust the x-axis limit as needed

# start from 0 (year of diagnosis) ensure that 'ageevent' is calculated as the time since diagnosis

# Load the libraries
library(survival)
library(survminer)

# Assuming your data frame is called 'selected_data'
# and you have 'time_to_event' as the time variable, 'status' as the event indicator
# and 'HEALTH_INSURANCE' as the health insurance status variable

# Create the survival object
surv_obj <- Surv(time = selected_data$time_to_event, event = selected_data$status)

# Fit the Kaplan-Meier model stratified by health insurance status
km_fit <- survfit(surv_obj ~ HEALTH_INSURANCE, data = selected_data)

# Plot the Kaplan-Meier curves
ggsurvplot(km_fit, 
           data = selected_data, 
           pval = TRUE, 
           conf.int = TRUE,
           palette = "Dark2",
           xlab = "Time", 
           ylab = "Survival probability",
           title = "Kaplan-Meier Survival Curves by Health Insurance Status")

# Fit the Kaplan-Meier model
km_fit <- survfit(Surv(time_to_event, status) ~ Race_Ethnicity, data = extracted_condition_rows)

# Plot the Kaplan-Meier survival curve
ggsurvplot(km_fit, 
           data = extracted_condition_rows, 
           xlab = "Years since Diagnosis", 
           ylab = "Survival Probability",
           title = "Kaplan-Meier Survival Curve",
           xlim = c(0, max(extracted_condition_rows$time_to_event))) # Adjust the x-axis limit as needed

# start from 0 (year of diagnosis) ensure that 'ageevent' is calculated as the time since diagnosis

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

extracted_condition_rows$Race_Ethnicity <- factor(extracted_condition_rows$Race_Ethnicity, 
                                       levels = c(0, 1), 
                                       labels = c("Nonwhite", "White"))

extracted_condition_rows$mortstat <- factor(extracted_condition_rows$mortstat, 
                                 levels = c(0, 1), 
                                 labels = c("Alive", "Deceased"))

extracted_condition_rows$HEALTH_INSURANCE <- factor(extracted_condition_rows$HEALTH_INSURANCE, 
                                         levels = c(0, 1), 
                                         labels = c("Uninsured", "Insured"))

extracted_condition_rows$Education <- as.factor(extracted_condition_rows$Education)

# Fit the Cox Proportional Hazards regression model
cox_model <- coxph(Surv(time_to_event, status) ~ HEALTH_INSURANCE + Education + Income + Race_Ethnicity, data = selected_data)
summary(cox_model)

## Call:
## coxph(formula = Surv(time_to_event, status) ~ HEALTH_INSURANCE + 
##     Education + Income + Race_Ethnicity, data = selected_data)
## 
##   n= 53, number of events= 53 
## 
##                        coef exp(coef)  se(coef)      z Pr(>|z|)
## HEALTH_INSURANCE1 -0.594191  0.552009  1.037252 -0.573    0.567
## Education1        -0.221975  0.800936  0.314025 -0.707    0.480
## Income             0.008617  1.008655  0.066034  0.130    0.896
## Race_Ethnicity     0.576100  1.779087  0.759149  0.759    0.448
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## HEALTH_INSURANCE1    0.5520     1.8116   0.07228     4.216
## Education1           0.8009     1.2485   0.43281     1.482
## Income               1.0087     0.9914   0.88620     1.148
## Race_Ethnicity       1.7791     0.5621   0.40180     7.877
## 
## Concordance= 0.51  (se = 0.049 )
## Likelihood ratio test= 1.42  on 4 df,   p=0.8
## Wald test            = 1.57  on 4 df,   p=0.8
## Score (logrank) test = 1.61  on 4 df,   p=0.8

This analysis attempts to see if factors like health insurance, education, income, and race/ethnicity play a significant role in influencing the time to a certain event in a group of 57 people. The results suggest no strong evidence that any of these factors are significant predictors in this specific dataset.

# Check the proportional hazards assumption
cox_zph <- cox.zph(cox_model)
plot(cox_zph)

table1::table1(~mortstat + time_to_event + AGE_TOLD_PROSTATE + Income + Race_Ethnicity + Education|HEALTH_INSURANCE, data= extracted_condition_rows)

	Uninsured (N=1)	Insured (N=46)	Overall (N=47)
mortstat
Alive	0 (0%)	15 (32.6%)	15 (31.9%)
Deceased	1 (100%)	31 (67.4%)	32 (68.1%)
time_to_event
Mean (SD)	9.00 (NA)	9.20 (4.33)	9.19 (4.28)
Median [Min, Max]	9.00 [9.00, 9.00]	10.0 [0, 14.0]	10.0 [0, 14.0]
AGE_TOLD_PROSTATE
Mean (SD)	61.0 (NA)	70.7 (6.71)	70.5 (6.79)
Median [Min, Max]	61.0 [61.0, 61.0]	70.5 [58.0, 85.0]	70.0 [58.0, 85.0]
Income
Mean (SD)	6.00 (NA)	6.63 (2.47)	6.62 (2.45)
Median [Min, Max]	6.00 [6.00, 6.00]	7.00 [2.00, 11.0]	7.00 [2.00, 11.0]
Race_Ethnicity
Nonwhite	1 (100%)	44 (95.7%)	45 (95.7%)
White	0 (0%)	2 (4.3%)	2 (4.3%)
Education
0	0 (0%)	19 (41.3%)	19 (40.4%)
1	1 (100%)	27 (58.7%)	28 (59.6%)

# Create a crosstab of the mortstat variable
mortstat_table <- table(selected_data$mortstat)

# 'AGE_TOLD_PROSTATE' is the age at diagnosis (ensure it's numeric)
# 'Age_or_Status' is the age at last interview or death (ensure it's numeric)

# Convert AGE_TOLD_PROSTATE and Age_or_Status to numeric 
selected_data$AGE_TOLD_PROSTATE <- as.numeric(as.character(selected_data$AGE_TOLD_PROSTATE))
selected_data$Age_or_Status <- as.numeric(as.character(selected_data$Age_or_Status))

## Warning: NAs introduced by coercion

# Calculate the time at risk for each individual
selected_data$time_at_risk <- selected_data$Age_or_Status - selected_data$AGE_TOLD_PROSTATE

# Handle any NAs that might have been introduced during conversion
selected_data$time_at_risk[is.na(selected_data$time_at_risk)] <- 0

# Calculate total person-years
total_person_years <- sum(selected_data$time_at_risk)

# Output the total person-years
total_person_years

## [1] 441

# Define age groups
age_breaks <- c(40, 49, 59, 69, 79, Inf)  # Define your age groups here
age_labels <- c("40-49", "50-59", "60-69", "70-79", "80+")  # Labels for age groups

# Categorize each individual into an age group
selected_data$age_group <- cut(selected_data$AGE_TOLD_PROSTATE, 
                               breaks = age_breaks, 
                               labels = age_labels, 
                               right = FALSE)

# Calculate time at risk for each individual
selected_data$time_at_risk <- selected_data$Age_or_Status - selected_data$AGE_TOLD_PROSTATE
selected_data$time_at_risk[is.na(selected_data$time_at_risk)] <- 0

# Calculate total person-years by age group
total_person_years_by_age_group <- aggregate(time_at_risk ~ age_group, data = selected_data, sum)

# View the result
print(total_person_years_by_age_group)

##   age_group time_at_risk
## 1     50-59            0
## 2     60-69          213
## 3     70-79          191
## 4       80+           37

The total person-years at risk in this dataset is 471. This means when you add up all the time periods that each individual has been at risk since their diagnosis or awareness of prostate issues, it totals to 471 years.

For age group 50-59 the time as risk is 0. This could suggest that individuals in this age group were either diagnosed and reached the end of the study period very quickly, or this age group is not represented in the data set. For the age group 60 to 69 the time at risk is 213, meaning that when summing up the time each person in this age group spent at risk from their diagnosis until the end of the study period, it totals 213 years. For 70 to 79 age had the highest accumulated risk time in your data set, suggesting either a larger number of individuals in this age group, longer survival times post-diagnosis. Finally, the 80+ age group is 41, the lower number could be due to fewer individuals in this age group, shorter survival times post-diagnosis, or a combination of both.

# Display the crosstab
print(mortstat_table)

## 
##  0  1 
## 16 37

summary(selected_data$Education)

##  0  1 
## 23 30

summary(selected_data$HEALTH_INSURANCE)

##  0  1 
##  1 52

summary(selected_data$Income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   5.000   7.000   6.849   8.000  11.000

summary(selected_data$Race_Ethnicity)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03774 0.00000 1.00000

ggplot(selected_data, aes(x = Education)) + 
  geom_bar() + 
  labs(title = "Distribution of Education Levels", x = "Education", y = "Count")

ggplot(selected_data, aes(x = HEALTH_INSURANCE)) + 
  geom_bar() + 
  labs(title = "Distribution of Health Insurance Status", x = "Health Insurance", y = "Count")

ggplot(selected_data, aes(x = Race_Ethnicity)) + 
  geom_bar() + 
  labs(title = "Distribution of Race/Ethnicity", x = "Race/Ethnicity", y = "Count")

ggplot(selected_data, aes(x = Income)) + 
  geom_histogram(binwidth = 1, fill = "blue", color = "black") + 
  labs(title = "Histogram of Income", x = "Income", y = "Frequency")

table(selected_data$Education)

## 
##  0  1 
## 23 30

table(selected_data$HEALTH_INSURANCE)

## 
##  0  1 
##  1 52

table(selected_data$Race_Ethnicity)

## 
##  0  1 
## 51  2

selected_data %>%
  summarise(
    Mean_Income = mean(Income, na.rm = TRUE),
    Median_Income = median(Income, na.rm = TRUE),
    SD_Income = sd(Income, na.rm = TRUE)
  )

## # A tibble: 1 × 3
##   Mean_Income Median_Income SD_Income
##         <dbl>         <dbl>     <dbl>
## 1        6.85             7      2.49

library(ggplot2)

# Histogram or Bar plot for Health Insurance
ggplot(extracted_condition_rows, aes(x = HEALTH_INSURANCE)) +
  geom_bar() +
  labs(title = "Distribution of Health Insurance", x = "Health Insurance", y = "Count")

# Histogram or Bar plot for Education
ggplot(extracted_condition_rows, aes(x = Education)) +
  geom_bar() +
  labs(title = "Distribution of Education", x = "Education", y = "Count")

# Histogram or Bar plot for Income
ggplot(extracted_condition_rows, aes(x = Income)) +
  geom_bar() +
  labs(title = "Distribution of Income", x = "Income", y = "Count")

# Histogram or Bar plot for Race/Ethnicity
ggplot(extracted_condition_rows, aes(x = Race_Ethnicity)) +
  geom_bar() +
  labs(title = "Distribution of Race/Ethnicity", x = "Race/Ethnicity", y = "Count")

# Cox model for Health Insurance
cox_model_health_insurance <- coxph(Surv(time_to_event, status) ~ HEALTH_INSURANCE, data = extracted_condition_rows)
summary(cox_model_health_insurance)

## Call:
## coxph(formula = Surv(time_to_event, status) ~ HEALTH_INSURANCE, 
##     data = extracted_condition_rows)
## 
##   n= 47, number of events= 47 
## 
##                            coef exp(coef) se(coef)      z Pr(>|z|)
## HEALTH_INSURANCEInsured -0.5911    0.5537   1.0246 -0.577    0.564
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## HEALTH_INSURANCEInsured    0.5537      1.806   0.07433     4.125
## 
## Concordance= 0.504  (se = 0.005 )
## Likelihood ratio test= 0.28  on 1 df,   p=0.6
## Wald test            = 0.33  on 1 df,   p=0.6
## Score (logrank) test = 0.34  on 1 df,   p=0.6

# Cox model for Education
cox_model_education <- coxph(Surv(time_to_event, status) ~ Education, data = extracted_condition_rows)
summary(cox_model_education)

## Call:
## coxph(formula = Surv(time_to_event, status) ~ Education, data = extracted_condition_rows)
## 
##   n= 47, number of events= 47 
## 
##               coef exp(coef) se(coef)      z Pr(>|z|)
## Education1 -0.1001    0.9048   0.3006 -0.333    0.739
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## Education1    0.9048      1.105     0.502     1.631
## 
## Concordance= 0.491  (se = 0.045 )
## Likelihood ratio test= 0.11  on 1 df,   p=0.7
## Wald test            = 0.11  on 1 df,   p=0.7
## Score (logrank) test = 0.11  on 1 df,   p=0.7

# Cox model for Income
cox_model_income <- coxph(Surv(time_to_event, status) ~ Income, data = extracted_condition_rows)
summary(cox_model_income)

## Call:
## coxph(formula = Surv(time_to_event, status) ~ Income, data = extracted_condition_rows)
## 
##   n= 47, number of events= 47 
## 
##            coef exp(coef) se(coef)      z Pr(>|z|)
## Income -0.02868   0.97173  0.06585 -0.435    0.663
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## Income    0.9717      1.029    0.8541     1.106
## 
## Concordance= 0.56  (se = 0.052 )
## Likelihood ratio test= 0.19  on 1 df,   p=0.7
## Wald test            = 0.19  on 1 df,   p=0.7
## Score (logrank) test = 0.19  on 1 df,   p=0.7

# Cox model for Race/Ethnicity
cox_model_race_ethnicity <- coxph(Surv(time_to_event, status) ~ Race_Ethnicity, data = extracted_condition_rows)
summary(cox_model_race_ethnicity)

## Call:
## coxph(formula = Surv(time_to_event, status) ~ Race_Ethnicity, 
##     data = extracted_condition_rows)
## 
##   n= 47, number of events= 47 
## 
##                       coef exp(coef) se(coef)     z Pr(>|z|)
## Race_EthnicityWhite 0.8830    2.4181   0.7514 1.175     0.24
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## Race_EthnicityWhite     2.418     0.4135    0.5545     10.55
## 
## Concordance= 0.514  (se = 0.011 )
## Likelihood ratio test= 1.1  on 1 df,   p=0.3
## Wald test            = 1.38  on 1 df,   p=0.2
## Score (logrank) test = 1.47  on 1 df,   p=0.2

The Cox model attempted to assess whether having health insurance affects the survival time of prostate cancer patients. The analysis found no significant evidence that health insurance status significantly alters the risk of death or censoring in this group of 51 individuals. The model’s predictive power is also very limited, as indicated by the concordance statistic of .503.

The Cox model looking at education resulted in an education coefficient of -0.1877, indicating a small decrease in risk with higher education levels and a p-value of 0.512, suggesting the effect of education is not statistically significant.

The Cox model attempted to assess whether income affects survival time of prostate cancer patients. The analysis found that income is not statistically significant. The coefficient of -0.004654 indicates there is a very small negative effect, suggesting a small decrease in risk with higher income.

The final Cox model assess whether Race/Ethnicity affects the survival time of patients diagnosed with prostate cancer. The coefficient of 0.8706 for ‘Race_EthnicityWhite’, suggests an increase in risk for the White race/ethnicity group compared to other group. The hazard ratio of 2.3885, indicates that the risk for the White group is more than double compared to others. The results, however, are not statistically significant due to the p-value of 0.244.

cox_model <- coxph(Surv(time_to_event, status) ~ HEALTH_INSURANCE + Education + Income +Race_Ethnicity, data = extracted_condition_rows)

summary(cox_model)

## Call:
## coxph(formula = Surv(time_to_event, status) ~ HEALTH_INSURANCE + 
##     Education + Income + Race_Ethnicity, data = extracted_condition_rows)
## 
##   n= 47, number of events= 47 
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)
## HEALTH_INSURANCEInsured -0.63991   0.52734  1.04057 -0.615    0.539
## Education1              -0.01586   0.98427  0.33650 -0.047    0.962
## Income                  -0.02548   0.97485  0.07189 -0.354    0.723
## Race_EthnicityWhite      0.89360   2.44391  0.77695  1.150    0.250
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## HEALTH_INSURANCEInsured    0.5273     1.8963    0.0686     4.054
## Education1                 0.9843     1.0160    0.5090     1.903
## Income                     0.9748     1.0258    0.8467     1.122
## Race_EthnicityWhite        2.4439     0.4092    0.5330    11.205
## 
## Concordance= 0.559  (se = 0.048 )
## Likelihood ratio test= 1.6  on 4 df,   p=0.8
## Wald test            = 1.91  on 4 df,   p=0.8
## Score (logrank) test = 2.02  on 4 df,   p=0.7

# Load necessary libraries
library(survival)
library(ggplot2)



# and 'mortstat' is the event indicator (1 if event occurred, 0 if censored)

# Fit separate Cox Proportional Hazards models for each variable
cox_model_education <- coxph(Surv(Age_or_Status_Numeric, mortstat) ~ Education, data = selected_data)
cox_model_income <- coxph(Surv(Age_or_Status_Numeric, mortstat) ~ Income, data = selected_data)
cox_model_race_ethnicity <- coxph(Surv(Age_or_Status_Numeric, mortstat) ~ Race_Ethnicity, data = selected_data)
cox_model_health_insurance <- coxph(Surv(Age_or_Status_Numeric, mortstat) ~ HEALTH_INSURANCE, data = selected_data)

# Create function to plot cumulative hazard
plot_cum_haz <- function(cox_model, title) {
  base_surv <- basehaz(cox_model, centered = FALSE)
  base_surv_df <- as.data.frame(base_surv)
  ggplot(base_surv_df, aes(x = time, y = hazard)) +
    geom_line() +
    labs(title = title,
         x = "Time",
         y = "Cumulative Hazard")
}

# Plotting cumulative hazards for each variable
plot_cum_haz(cox_model_education, "Cumulative Hazard by Education")

plot_cum_haz(cox_model_income, "Cumulative Hazard by Income")

plot_cum_haz(cox_model_race_ethnicity, "Cumulative Hazard by Race/Ethnicity")

plot_cum_haz(cox_model_health_insurance, "Cumulative Hazard by Health Insurance")

The cumulative hazard plot of prostate cancer by health insurance status has time on the x-axis and the y-axis is cumulative hazard, which is the accumulated hazard of mortality risk once diagnosed by prostate cancer. The curve of the plot suggests that the risk of mortality increases as a male ages.

library(dplyr)
library(tidyr)
library(purrr)


selected_data <- selected_data %>%
  mutate(
    SEQN = ucod_leading,  # replace with your identifier column name
    time_at_diagnosis = AGE_TOLD_PROSTATE,  # assuming this is the diagnosis time
    time_at_death = ifelse(mortstat == 1, Age_At_Death, NA),
    time_censored = ifelse(mortstat == 0, CURRENT_AGE, NA),
    end_year = ifelse(is.na(time_at_death), time_censored, time_at_death)
  )

# Now check the lengths
nrow(selected_data)

## [1] 53

length(selected_data$ucod_leading)

## [1] 53

length(selected_data$time_at_diagnosis)

## [1] 53

length(selected_data$end_year)

## [1] 53

# Expanding the dataset to a person-year format
person_year_data <- selected_data %>%
  mutate(years = map2(time_at_diagnosis, end_year, ~seq(.x, .y, by = 1))) %>%
  unnest(years) %>%
  group_by(ucod_leading) %>%
  mutate(event = if_else(years == end_year & mortstat == 1, 1, 0)) %>%
  ungroup()

# Checking the structure of the person-year data
head(person_year_data)

## # A tibble: 6 × 30
##   Education Race_Ethnicity Income AGE_TOLD_PROSTATE DIAGNOSED_PROSTATE
##   <fct>              <dbl>  <dbl>             <dbl>              <dbl>
## 1 1                      0      5                79                  1
## 2 1                      0      5                79                  1
## 3 1                      0      5                79                  1
## 4 1                      0      5                79                  1
## 5 1                      0      5                79                  1
## 6 1                      0      5                79                  1
## # ℹ 25 more variables: PROSTATE_ENLARGE <dbl>, AGE_PSA_TEST <dbl>,
## #   PSA_TOTAL <dbl>, AGE_CURRENT <dbl>, CITIZEN_STATUS <dbl>,
## #   HEALTH_INSURANCE <fct>, CURRENT_AGE <dbl>, mortstat <dbl>,
## #   permth_exm <dbl>, ucod_leading <dbl>, Age_or_Status <dbl>,
## #   Age_or_Status_Numeric <dbl>, Age_At_Death <dbl>, ageevent <dbl>,
## #   time_to_event <dbl>, status <dbl>, time_at_risk <dbl>, age_group <fct>,
## #   SEQN <dbl>, time_at_diagnosis <dbl>, time_at_death <dbl>, …

library(survival)

# Fit a Cox model
cox_model <- coxph(Surv(time_at_diagnosis, years, event) ~ Education + HEALTH_INSURANCE + Race_Ethnicity, data = person_year_data)

## Warning in Surv(time_at_diagnosis, years, event): Stop time must be > start
## time, NA created

summary(cox_model)

## Call:
## coxph(formula = Surv(time_at_diagnosis, years, event) ~ Education + 
##     HEALTH_INSURANCE + Race_Ethnicity, data = person_year_data)
## 
##   n= 498, number of events= 36 
##    (53 observations deleted due to missingness)
## 
##                       coef exp(coef) se(coef)      z Pr(>|z|)  
## Education1        -0.32145   0.72510  0.36080 -0.891   0.3730  
## HEALTH_INSURANCE1 -2.46710   0.08483  1.17386 -2.102   0.0356 *
## Race_Ethnicity    -1.56494   0.20910  0.78751 -1.987   0.0469 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## Education1          0.72510      1.379  0.357507    1.4706
## HEALTH_INSURANCE1   0.08483     11.788  0.008499    0.8467
## Race_Ethnicity      0.20910      4.782  0.044671    0.9788
## 
## Concordance= 0.578  (se = 0.053 )
## Likelihood ratio test= 7.92  on 3 df,   p=0.05
## Wald test            = 7.96  on 3 df,   p=0.05
## Score (logrank) test = 10.45  on 3 df,   p=0.02

library(survival)
library(ggplot2)

# Fit the Cox model (make sure to use your actual model formula)
cox_model <- coxph(Surv(time_at_diagnosis, years, event) ~ Education + HEALTH_INSURANCE + Race_Ethnicity , data = person_year_data)

## Warning in Surv(time_at_diagnosis, years, event): Stop time must be > start
## time, NA created

# Extract the baseline hazard
baseline_hazard <- basehaz(cox_model)

## Warning in Surv(time_at_diagnosis, years, event): Stop time must be > start
## time, NA created

# Convert to a dataframe for ggplot
baseline_hazard_df <- as.data.frame(baseline_hazard)

# Plotting the hazard function
ggplot(baseline_hazard_df, aes(x = time, y = hazard)) +
  geom_line() +
  labs(title = "Hazard Function Over Time",
       x = "Time",
       y = "Hazard Rate") +
  theme_minimal()

# Check the names of the variables in the dataset
names(person_year_data)

##  [1] "Education"             "Race_Ethnicity"        "Income"               
##  [4] "AGE_TOLD_PROSTATE"     "DIAGNOSED_PROSTATE"    "PROSTATE_ENLARGE"     
##  [7] "AGE_PSA_TEST"          "PSA_TOTAL"             "AGE_CURRENT"          
## [10] "CITIZEN_STATUS"        "HEALTH_INSURANCE"      "CURRENT_AGE"          
## [13] "mortstat"              "permth_exm"            "ucod_leading"         
## [16] "Age_or_Status"         "Age_or_Status_Numeric" "Age_At_Death"         
## [19] "ageevent"              "time_to_event"         "status"               
## [22] "time_at_risk"          "age_group"             "SEQN"                 
## [25] "time_at_diagnosis"     "time_at_death"         "time_censored"        
## [28] "end_year"              "years"                 "event"

# Fit a left-truncated Cox model with the correct variable name
cox_model_left_truncated <- coxph(Surv(time_at_diagnosis, years, event) ~ HEALTH_INSURANCE, data = person_year_data)

## Warning in Surv(time_at_diagnosis, years, event): Stop time must be > start
## time, NA created

summary(cox_model_left_truncated)

## Call:
## coxph(formula = Surv(time_at_diagnosis, years, event) ~ HEALTH_INSURANCE, 
##     data = person_year_data)
## 
##   n= 498, number of events= 36 
##    (53 observations deleted due to missingness)
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)  
## HEALTH_INSURANCE1 -2.303     0.100    1.158 -1.988   0.0468 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## HEALTH_INSURANCE1       0.1         10   0.01033    0.9676
## 
## Concordance= 0.523  (se = 0.025 )
## Likelihood ratio test= 2.55  on 1 df,   p=0.1
## Wald test            = 3.95  on 1 df,   p=0.05
## Score (logrank) test = 5.95  on 1 df,   p=0.01

# Assuming cox_model is your fitted Cox model
baseline_haz <- basehaz(cox_model, centered = FALSE)

## Warning in Surv(time_at_diagnosis, years, event): Stop time must be > start
## time, NA created

baseline_haz_df <- as.data.frame(baseline_haz)

ggplot(baseline_haz_df, aes(x = time, y = hazard)) +
    geom_line() +
    labs(title = "Baseline Cumulative Hazard Function", x = "Time", y = "Cumulative Hazard")

# Checking proportional hazards assumption
cox.zph_model <- cox.zph(cox_model)

## Warning in Surv(time_at_diagnosis, years, event): Stop time must be > start
## time, NA created

plot(cox.zph_model)

cox_summary <- broom::tidy(cox_model)

ggplot(cox_summary, aes(x = term, y = estimate)) +
    geom_point() +
    geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error), width = 0.1) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    theme_minimal() +
    labs(title = "Hazard Ratios from Cox Model",
         x = "Covariates", y = "Hazard Ratio Estimate")

# Fit the Cox model
cox_model <- coxph(Surv(time_at_diagnosis, years, event) ~ HEALTH_INSURANCE, data = person_year_data)

## Warning in Surv(time_at_diagnosis, years, event): Stop time must be > start
## time, NA created

# Get the coefficient for Health_Insurance
coef_health_insurance <- coef(cox_model)["Health_Insurance"]

baseline_hazard <- basehaz(cox_model)

## Warning in Surv(time_at_diagnosis, years, event): Stop time must be > start
## time, NA created

# Assuming '0' is uninsured (reference level) and '1' is insured
baseline_hazard$uninsured <- baseline_hazard$hazard
baseline_hazard$insured <- baseline_hazard$hazard * exp(coef_health_insurance)

baseline_hazard_df <- as.data.frame(baseline_hazard)

ggplot(baseline_hazard_df, aes(x = time)) +
  geom_line(aes(y = uninsured), color = "blue", linetype = "solid") +
  geom_line(aes(y = insured), color = "red", linetype = "dashed") +
  labs(title = "Hazard Function Over Time",
       x = "Time",
       y = "Hazard Rate",
       color = "Health Insurance Status") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red"),
                     labels = c("Uninsured", "Insured")) +
  guides(color = guide_legend(title = "Health Insurance Status"))

## Warning: Removed 38 rows containing missing values (`geom_line()`).

AD_METHODS_PROJECT

Bryan Solomon

2023-11-15

Subset to Diagnosed Men

ageevent age at event if they died and age of survey if they did not die