# Load necessary libraries
library(survival)
library(ggplot2)
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggpubr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(tidyr)
library(dplyr)

# Load the `haven` package if you haven't already
library(haven)

# Read the XPT files with the complete file paths
nhanes_data <- read_xpt("C:/Users/bryan/OneDrive/Desktop/DEMO_C.XPT")
nhanes_data1 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/L11PSA_c.XPT")
nhanes_data2 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/KIQ_P_C.XPT")
nhanes_data3 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/MCQ_D.XPT")
nhanes_data4 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/PSQ_C.XPT")
nhanes_data5 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/ALQ_C.XPT")
nhanes_data6 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/HIQ_C.XPT")
nhanes_data7 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/MCQ_C.XPT")



# Combine all ways
merged_data <- left_join(nhanes_data1, nhanes_data, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data2, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data3, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data4, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data5, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data6, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data7, by="SEQN")




# Merge the datasets using bind_rows
#merged_data <- bind_rows(nhanes_data, nhanes_data1, nhanes_data2, nhanes_data3, nhanes_data4)
# Assuming you have already imported your dataset into the 'data' variable
# You can use the 'rename' function to rename the variables

# Create a new dataset with selected variables
selected_data <- merged_data %>% 
  select(
    Education = DMDEDUC2,
    Race_Ethnicity = RIDRETH1,
    Income = INDHHINC,
    Ever_diagnosed = KIQ201,
    AGE_TOLD_PROSTATE = KID221,
    PROSTATE_ENLARGE = KIQ121,
    AGE_PSA_TEST = MCQ320,
    ROUTINE_SCREENING = PSQ100A,
    PSA_TOTAL = LBXP1,
    AGE_CURRENT = RIDAGEYR,
    GENDER = RIAGENDR,
    CITIZEN_STATUS = DMDCITZN,
    FIVE_MORE_DRINK = ALQ150,
    HEALTH_INSURANCE = HID010,
    INSURANCE_LAPSE = HIQ220,

  )

# Replace NA with 0 in all columns of select_data
#seleced_data <- selected_data %>%
#  mutate_all(funs(ifelse(is.na(.), 0, .)))



selected_data$AGE_TOLD_PROSTATE <- ifelse(selected_data$AGE_TOLD_PROSTATE == "85 or greater years", 85 , selected_data$AGE_TOLD_PROSTATE)
selected_data$AGE_TOLD_PROSTATE <- as.numeric(selected_data$AGE_TOLD_PROSTATE)

selected_data$Ever_diagnosed <- ifelse(selected_data$Ever_diagnosed == 1, 1 , 0)

#if diagnosis is 1, pick the age event is the age they had prostate, if 0 make current age
selected_data$ageevent <- ifelse(selected_data$Ever_diagnosed == 1, selected_data$AGE_TOLD_PROSTATE, selected_data$AGE_CURRENT)

selected_data <- selected_data %>% 
  filter( !is.na(Ever_diagnosed))

selected_data <- selected_data %>% 
  filter( !is.na(ageevent))

# Assuming you have a dataset named 'selected_data'
selected_data <- selected_data %>%
  mutate(
    Education = case_when(
      Education %in% c(1, 2) ~ "1",
      Education == 3 ~ "2",
      Education == 4 ~ "3",
      Education == 5 ~ "4",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!Education %in% c("9", "7", "."))


selected_data <- selected_data %>% 
  mutate(INSURANCE_LAPSE = ifelse(INSURANCE_LAPSE %in% c(".", "9", "7", "NA"), 0, INSURANCE_LAPSE))
# Create a survival object with complete cases
surv_obj <- with(selected_data, Surv(time = ageevent, event = Ever_diagnosed))
summary(surv_obj)
##       time           status       
##  Min.   :17.00   Min.   :0.00000  
##  1st Qu.:49.00   1st Qu.:0.00000  
##  Median :61.00   Median :0.00000  
##  Mean   :61.19   Mean   :0.04064  
##  3rd Qu.:72.00   3rd Qu.:0.00000  
##  Max.   :85.00   Max.   :1.00000
library(ggsurvfit)
survfit2(Surv(ageevent, Ever_diagnosed) ~ 1, data = selected_data) %>% 
  ggsurvfit() +
  labs(
    x = "Age",
    y = "Cumulative survival probability"
    ) + 
  add_confidence_interval() +
  add_risktable()

survfit2(Surv(ageevent, Ever_diagnosed) ~ Education, data = selected_data) %>% 
  ggsurvfit() +
  labs(
    x = "Age",
    y = "Cumulative survival probability"
    ) + 
  add_confidence_interval() +
  add_risktable()

# Fit the Cox proportional hazards model
cox_model <- coxph(Surv(time = ageevent, event = Ever_diagnosed) ~ AGE_TOLD_PROSTATE, data = selected_data)
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## one or more coefficients may be infinite
# View the summary of the model
summary(cox_model)
## Call:
## coxph(formula = Surv(time = ageevent, event = Ever_diagnosed) ~ 
##     AGE_TOLD_PROSTATE, data = selected_data)
## 
##   n= 56, number of events= 56 
##    (1322 observations deleted due to missingness)
## 
##                         coef  exp(coef)   se(coef)      z Pr(>|z|)
## AGE_TOLD_PROSTATE -1.357e+01  1.272e-06  1.092e+02 -0.124    0.901
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## AGE_TOLD_PROSTATE 1.272e-06     786005 1.322e-99 1.224e+87
## 
## Concordance= 1  (se = 0 )
## Likelihood ratio test= 278.1  on 1 df,   p=<2e-16
## Wald test            = 0.02  on 1 df,   p=0.9
## Score (logrank) test = 136.3  on 1 df,   p=<2e-16
# Fit a Cox proportional hazards model with interaction terms
cox_model_interaction <- coxph(Surv(time = ageevent, event = Ever_diagnosed) ~ Race_Ethnicity, AGE_CURRENT, data = selected_data)

# View the summary of the model
summary(cox_model_interaction)
## Call:
## coxph(formula = Surv(time = ageevent, event = Ever_diagnosed) ~ 
##     Race_Ethnicity, data = selected_data, weights = AGE_CURRENT)
## 
##   n= 1378, number of events= 56 
## 
##                   coef exp(coef) se(coef)     z Pr(>|z|)    
## Race_Ethnicity 0.24784   1.28125  0.01803 13.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                exp(coef) exp(-coef) lower .95 upper .95
## Race_Ethnicity     1.281     0.7805     1.237     1.327
## 
## Concordance= 0.585  (se = 0.039 )
## Likelihood ratio test= 197.6  on 1 df,   p=<2e-16
## Wald test            = 188.9  on 1 df,   p=<2e-16
## Score (logrank) test = 188.3  on 1 df,   p=<2e-16
# Fit a logistic regression model
logistic_model <- glm(Ever_diagnosed ~ Income + Education, data = selected_data, family = binomial)

# View the summary of the model
summary(logistic_model)
## 
## Call:
## glm(formula = Ever_diagnosed ~ Income + Education, family = binomial, 
##     data = selected_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.15782    0.25220 -12.521   <2e-16 ***
## Income       0.01806    0.01464   1.234   0.2174    
## Education2  -0.09063    0.36935  -0.245   0.8062    
## Education3  -0.80479    0.45108  -1.784   0.0744 .  
## Education4   0.03711    0.37232   0.100   0.9206    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 437.55  on 1310  degrees of freedom
## Residual deviance: 432.15  on 1306  degrees of freedom
##   (67 observations deleted due to missingness)
## AIC: 442.15
## 
## Number of Fisher Scoring iterations: 6
# Fit a logistic regression model
logistic_model <- glm(Ever_diagnosed ~  AGE_CURRENT, data = selected_data, family = binomial)

# View the summary of the model
summary(logistic_model)
## 
## Call:
## glm(formula = Ever_diagnosed ~ AGE_CURRENT, family = binomial, 
##     data = selected_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.43293    1.12957  -9.236  < 2e-16 ***
## AGE_CURRENT   0.10491    0.01487   7.055 1.73e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 468.43  on 1377  degrees of freedom
## Residual deviance: 394.58  on 1376  degrees of freedom
## AIC: 398.58
## 
## Number of Fisher Scoring iterations: 7
# Define the breaks for the 10-year intervals
breaks <- seq(0, 100, by = 10)  # Adjust the range as needed

# Create a new variable "AGE_INTERVAL" with the 10-year intervals
selected_data$AGE_INTERVAL <- cut(selected_data$AGE_CURRENT, breaks, right = FALSE, labels = FALSE)

# You can also create labels for the intervals if desired
selected_data$AGE_INTERVAL_LABEL <- cut(selected_data$AGE_CURRENT, breaks, right = FALSE, labels = c("0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89", "90-99"))

# Now, "AGE_INTERVAL" contains the numeric interval codes (0-9, 10-19, etc.)
# "AGE_INTERVAL_LABEL" contains the corresponding labels

# View the first few rows of the modified dataset
head(selected_data)
## # A tibble: 6 × 18
##   Education Race_Ethnicity Income Ever_diagnosed AGE_TOLD_PROSTATE
##   <chr>              <dbl>  <dbl>          <dbl>             <dbl>
## 1 2                      3      8              0                NA
## 2 2                      4      2              0                NA
## 3 3                      3      4              0                NA
## 4 4                      3     11              0                NA
## 5 2                      3      5              0                NA
## 6 2                      3     11              0                NA
## # ℹ 13 more variables: PROSTATE_ENLARGE <dbl>, AGE_PSA_TEST <dbl>,
## #   ROUTINE_SCREENING <dbl>, PSA_TOTAL <dbl>, AGE_CURRENT <dbl>, GENDER <dbl>,
## #   CITIZEN_STATUS <dbl>, FIVE_MORE_DRINK <dbl>, HEALTH_INSURANCE <dbl>,
## #   INSURANCE_LAPSE <dbl>, ageevent <dbl>, AGE_INTERVAL <int>,
## #   AGE_INTERVAL_LABEL <fct>
survfit2(Surv(ageevent, Ever_diagnosed) ~ AGE_INTERVAL_LABEL, data = selected_data) %>% 
  ggsurvfit() +
  labs(
    x = "Age",
    y = "Cumulative survival probability"
    ) + 
  add_confidence_interval() +
  add_risktable()

#Does race and ethnicity influence the time to diagnosis of prostate cancer among NHANES respondents, with a particular emphasis on the influence of age, while accounting for the differences in age at the time of diagnosis?

# Fit the Cox proportional hazards model with AGE_TOLD_PROSTATE
cox_model_race_ethnicity <- coxph(Surv(time = ageevent, event = Ever_diagnosed) ~ Race_Ethnicity, AGE_CURRENT, data = selected_data)

# View the summary of the model
summary(cox_model_race_ethnicity)
## Call:
## coxph(formula = Surv(time = ageevent, event = Ever_diagnosed) ~ 
##     Race_Ethnicity, data = selected_data, weights = AGE_CURRENT)
## 
##   n= 1378, number of events= 56 
## 
##                   coef exp(coef) se(coef)     z Pr(>|z|)    
## Race_Ethnicity 0.24784   1.28125  0.01803 13.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                exp(coef) exp(-coef) lower .95 upper .95
## Race_Ethnicity     1.281     0.7805     1.237     1.327
## 
## Concordance= 0.585  (se = 0.039 )
## Likelihood ratio test= 197.6  on 1 df,   p=<2e-16
## Wald test            = 188.9  on 1 df,   p=<2e-16
## Score (logrank) test = 188.3  on 1 df,   p=<2e-16
# Summary Statistics
summary(selected_data)
##   Education         Race_Ethnicity      Income       Ever_diagnosed   
##  Length:1378        Min.   :1.000   Min.   : 1.000   Min.   :0.00000  
##  Class :character   1st Qu.:3.000   1st Qu.: 4.250   1st Qu.:0.00000  
##  Mode  :character   Median :3.000   Median : 7.000   Median :0.00000  
##                     Mean   :2.811   Mean   : 7.185   Mean   :0.04064  
##                     3rd Qu.:3.000   3rd Qu.:10.000   3rd Qu.:0.00000  
##                     Max.   :5.000   Max.   :99.000   Max.   :1.00000  
##                                     NA's   :64                        
##  AGE_TOLD_PROSTATE PROSTATE_ENLARGE  AGE_PSA_TEST  ROUTINE_SCREENING
##  Min.   :17.00     Min.   :1.000    Min.   : NA    Min.   :1.000    
##  1st Qu.:64.75     1st Qu.:2.000    1st Qu.: NA    1st Qu.:1.000    
##  Median :69.50     Median :2.000    Median : NA    Median :1.000    
##  Mean   :68.52     Mean   :1.808    Mean   :NaN    Mean   :1.129    
##  3rd Qu.:75.00     3rd Qu.:2.000    3rd Qu.: NA    3rd Qu.:1.000    
##  Max.   :85.00     Max.   :9.000    Max.   : NA    Max.   :2.000    
##  NA's   :1322      NA's   :100      NA's   :1378   NA's   :1347     
##    PSA_TOTAL      AGE_CURRENT        GENDER  CITIZEN_STATUS  FIVE_MORE_DRINK
##  Min.   : 0.10   Min.   :40.00   Min.   :1   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 0.60   1st Qu.:49.00   1st Qu.:1   1st Qu.:1.000   1st Qu.:1.000  
##  Median : 1.00   Median :61.00   Median :1   Median :1.000   Median :2.000  
##  Mean   : 1.83   Mean   :61.49   Mean   :1   Mean   :1.089   Mean   :1.738  
##  3rd Qu.: 1.90   3rd Qu.:72.00   3rd Qu.:1   3rd Qu.:1.000   3rd Qu.:2.000  
##  Max.   :72.70   Max.   :85.00   Max.   :1   Max.   :7.000   Max.   :2.000  
##  NA's   :125                                                 NA's   :158    
##  HEALTH_INSURANCE INSURANCE_LAPSE    ageevent      AGE_INTERVAL  
##  Min.   :1.000    Min.   :0.000   Min.   :17.00   Min.   :5.000  
##  1st Qu.:1.000    1st Qu.:3.000   1st Qu.:49.00   1st Qu.:5.000  
##  Median :1.000    Median :4.000   Median :61.00   Median :7.000  
##  Mean   :1.122    Mean   :3.473   Mean   :61.19   Mean   :6.743  
##  3rd Qu.:1.000    3rd Qu.:4.500   3rd Qu.:72.00   3rd Qu.:8.000  
##  Max.   :2.000    Max.   :5.000   Max.   :85.00   Max.   :9.000  
##  NA's   :11       NA's   :1211                                   
##  AGE_INTERVAL_LABEL
##  40-49  :351       
##  60-69  :327       
##  70-79  :278       
##  50-59  :258       
##  80-89  :164       
##  0-9    :  0       
##  (Other):  0
summary(selected_data$Age)
## Warning: Unknown or uninitialised column: `Age`.
## Length  Class   Mode 
##      0   NULL   NULL
# Data Visualization
# Create a bar plot for the distribution of Education levels
ggplot(selected_data, aes(x = Education)) +
  geom_bar() +
  labs(
    title = "Distribution of Education Levels",
    x = "Education Level",
    y = "Frequency"
  )

# Sample Size Reduction
# Calculate the initial sample size
initial_sample_size <- nrow(selected_data)

# Apply restrictions or data cleaning
# For example, you can remove rows with missing values in specific columns
selected_data_cleaned <- selected_data %>%
  filter(!is.na(Education), !is.na(Race_Ethnicity))

# Calculate the sample size after data cleaning
cleaned_sample_size <- nrow(selected_data_cleaned)

# Create a table to show the sample size reduction
sample_size_table <- data.frame(
  Step = c("Initial", "After Data Cleaning"),
  Sample_Size = c(initial_sample_size, cleaned_sample_size)
)

# Print the sample size table
print(sample_size_table)
##                  Step Sample_Size
## 1             Initial        1378
## 2 After Data Cleaning        1374