# Load necessary libraries
library(survival)
library(ggplot2)
library(survminer)
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
library(ggpubr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tidyr)
library(dplyr)
# Load the `haven` package if you haven't already
library(haven)
# Read the XPT files with the complete file paths
nhanes_data <- read_xpt("C:/Users/bryan/OneDrive/Desktop/DEMO_C.XPT")
nhanes_data1 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/L11PSA_c.XPT")
nhanes_data2 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/KIQ_P_C.XPT")
nhanes_data3 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/MCQ_D.XPT")
nhanes_data4 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/PSQ_C.XPT")
nhanes_data5 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/ALQ_C.XPT")
nhanes_data6 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/HIQ_C.XPT")
nhanes_data7 <- read_xpt("C:/Users/bryan/OneDrive/Desktop/MCQ_C.XPT")
# Combine all ways
merged_data <- left_join(nhanes_data1, nhanes_data, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data2, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data3, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data4, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data5, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data6, by="SEQN")
merged_data <- left_join(merged_data, nhanes_data7, by="SEQN")
# Merge the datasets using bind_rows
#merged_data <- bind_rows(nhanes_data, nhanes_data1, nhanes_data2, nhanes_data3, nhanes_data4)
# Assuming you have already imported your dataset into the 'data' variable
# You can use the 'rename' function to rename the variables
# Create a new dataset with selected variables
selected_data <- merged_data %>%
select(
Education = DMDEDUC2,
Race_Ethnicity = RIDRETH1,
Income = INDHHINC,
Ever_diagnosed = KIQ201,
AGE_TOLD_PROSTATE = KID221,
PROSTATE_ENLARGE = KIQ121,
AGE_PSA_TEST = MCQ320,
ROUTINE_SCREENING = PSQ100A,
PSA_TOTAL = LBXP1,
AGE_CURRENT = RIDAGEYR,
GENDER = RIAGENDR,
CITIZEN_STATUS = DMDCITZN,
FIVE_MORE_DRINK = ALQ150,
HEALTH_INSURANCE = HID010,
INSURANCE_LAPSE = HIQ220,
)
# Replace NA with 0 in all columns of select_data
#seleced_data <- selected_data %>%
# mutate_all(funs(ifelse(is.na(.), 0, .)))
selected_data$AGE_TOLD_PROSTATE <- ifelse(selected_data$AGE_TOLD_PROSTATE == "85 or greater years", 85 , selected_data$AGE_TOLD_PROSTATE)
selected_data$AGE_TOLD_PROSTATE <- as.numeric(selected_data$AGE_TOLD_PROSTATE)
selected_data$Ever_diagnosed <- ifelse(selected_data$Ever_diagnosed == 1, 1 , 0)
#if diagnosis is 1, pick the age event is the age they had prostate, if 0 make current age
selected_data$ageevent <- ifelse(selected_data$Ever_diagnosed == 1, selected_data$AGE_TOLD_PROSTATE, selected_data$AGE_CURRENT)
selected_data <- selected_data %>%
filter( !is.na(Ever_diagnosed))
selected_data <- selected_data %>%
filter( !is.na(ageevent))
# Assuming you have a dataset named 'selected_data'
selected_data <- selected_data %>%
mutate(
Education = case_when(
Education %in% c(1, 2) ~ "1",
Education == 3 ~ "2",
Education == 4 ~ "3",
Education == 5 ~ "4",
TRUE ~ NA_character_
)
) %>%
filter(!Education %in% c("9", "7", "."))
selected_data <- selected_data %>%
mutate(INSURANCE_LAPSE = ifelse(INSURANCE_LAPSE %in% c(".", "9", "7", "NA"), 0, INSURANCE_LAPSE))
# Create a survival object with complete cases
surv_obj <- with(selected_data, Surv(time = ageevent, event = Ever_diagnosed))
summary(surv_obj)
## time status
## Min. :17.00 Min. :0.00000
## 1st Qu.:49.00 1st Qu.:0.00000
## Median :61.00 Median :0.00000
## Mean :61.19 Mean :0.04064
## 3rd Qu.:72.00 3rd Qu.:0.00000
## Max. :85.00 Max. :1.00000
library(ggsurvfit)
survfit2(Surv(ageevent, Ever_diagnosed) ~ 1, data = selected_data) %>%
ggsurvfit() +
labs(
x = "Age",
y = "Cumulative survival probability"
) +
add_confidence_interval() +
add_risktable()
survfit2(Surv(ageevent, Ever_diagnosed) ~ Education, data = selected_data) %>%
ggsurvfit() +
labs(
x = "Age",
y = "Cumulative survival probability"
) +
add_confidence_interval() +
add_risktable()
# Fit the Cox proportional hazards model
cox_model <- coxph(Surv(time = ageevent, event = Ever_diagnosed) ~ AGE_TOLD_PROSTATE, data = selected_data)
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## one or more coefficients may be infinite
# View the summary of the model
summary(cox_model)
## Call:
## coxph(formula = Surv(time = ageevent, event = Ever_diagnosed) ~
## AGE_TOLD_PROSTATE, data = selected_data)
##
## n= 56, number of events= 56
## (1322 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## AGE_TOLD_PROSTATE -1.357e+01 1.272e-06 1.092e+02 -0.124 0.901
##
## exp(coef) exp(-coef) lower .95 upper .95
## AGE_TOLD_PROSTATE 1.272e-06 786005 1.322e-99 1.224e+87
##
## Concordance= 1 (se = 0 )
## Likelihood ratio test= 278.1 on 1 df, p=<2e-16
## Wald test = 0.02 on 1 df, p=0.9
## Score (logrank) test = 136.3 on 1 df, p=<2e-16
# Fit a Cox proportional hazards model with interaction terms
cox_model_interaction <- coxph(Surv(time = ageevent, event = Ever_diagnosed) ~ Race_Ethnicity, AGE_CURRENT, data = selected_data)
# View the summary of the model
summary(cox_model_interaction)
## Call:
## coxph(formula = Surv(time = ageevent, event = Ever_diagnosed) ~
## Race_Ethnicity, data = selected_data, weights = AGE_CURRENT)
##
## n= 1378, number of events= 56
##
## coef exp(coef) se(coef) z Pr(>|z|)
## Race_Ethnicity 0.24784 1.28125 0.01803 13.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## Race_Ethnicity 1.281 0.7805 1.237 1.327
##
## Concordance= 0.585 (se = 0.039 )
## Likelihood ratio test= 197.6 on 1 df, p=<2e-16
## Wald test = 188.9 on 1 df, p=<2e-16
## Score (logrank) test = 188.3 on 1 df, p=<2e-16
# Fit a logistic regression model
logistic_model <- glm(Ever_diagnosed ~ Income + Education, data = selected_data, family = binomial)
# View the summary of the model
summary(logistic_model)
##
## Call:
## glm(formula = Ever_diagnosed ~ Income + Education, family = binomial,
## data = selected_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.15782 0.25220 -12.521 <2e-16 ***
## Income 0.01806 0.01464 1.234 0.2174
## Education2 -0.09063 0.36935 -0.245 0.8062
## Education3 -0.80479 0.45108 -1.784 0.0744 .
## Education4 0.03711 0.37232 0.100 0.9206
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 437.55 on 1310 degrees of freedom
## Residual deviance: 432.15 on 1306 degrees of freedom
## (67 observations deleted due to missingness)
## AIC: 442.15
##
## Number of Fisher Scoring iterations: 6
# Fit a logistic regression model
logistic_model <- glm(Ever_diagnosed ~ AGE_CURRENT, data = selected_data, family = binomial)
# View the summary of the model
summary(logistic_model)
##
## Call:
## glm(formula = Ever_diagnosed ~ AGE_CURRENT, family = binomial,
## data = selected_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.43293 1.12957 -9.236 < 2e-16 ***
## AGE_CURRENT 0.10491 0.01487 7.055 1.73e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 468.43 on 1377 degrees of freedom
## Residual deviance: 394.58 on 1376 degrees of freedom
## AIC: 398.58
##
## Number of Fisher Scoring iterations: 7
# Define the breaks for the 10-year intervals
breaks <- seq(0, 100, by = 10) # Adjust the range as needed
# Create a new variable "AGE_INTERVAL" with the 10-year intervals
selected_data$AGE_INTERVAL <- cut(selected_data$AGE_CURRENT, breaks, right = FALSE, labels = FALSE)
# You can also create labels for the intervals if desired
selected_data$AGE_INTERVAL_LABEL <- cut(selected_data$AGE_CURRENT, breaks, right = FALSE, labels = c("0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89", "90-99"))
# Now, "AGE_INTERVAL" contains the numeric interval codes (0-9, 10-19, etc.)
# "AGE_INTERVAL_LABEL" contains the corresponding labels
# View the first few rows of the modified dataset
head(selected_data)
## # A tibble: 6 × 18
## Education Race_Ethnicity Income Ever_diagnosed AGE_TOLD_PROSTATE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2 3 8 0 NA
## 2 2 4 2 0 NA
## 3 3 3 4 0 NA
## 4 4 3 11 0 NA
## 5 2 3 5 0 NA
## 6 2 3 11 0 NA
## # ℹ 13 more variables: PROSTATE_ENLARGE <dbl>, AGE_PSA_TEST <dbl>,
## # ROUTINE_SCREENING <dbl>, PSA_TOTAL <dbl>, AGE_CURRENT <dbl>, GENDER <dbl>,
## # CITIZEN_STATUS <dbl>, FIVE_MORE_DRINK <dbl>, HEALTH_INSURANCE <dbl>,
## # INSURANCE_LAPSE <dbl>, ageevent <dbl>, AGE_INTERVAL <int>,
## # AGE_INTERVAL_LABEL <fct>
survfit2(Surv(ageevent, Ever_diagnosed) ~ AGE_INTERVAL_LABEL, data = selected_data) %>%
ggsurvfit() +
labs(
x = "Age",
y = "Cumulative survival probability"
) +
add_confidence_interval() +
add_risktable()
#Does race and ethnicity influence the time to diagnosis of prostate cancer among NHANES respondents, with a particular emphasis on the influence of age, while accounting for the differences in age at the time of diagnosis?
# Fit the Cox proportional hazards model with AGE_TOLD_PROSTATE
cox_model_race_ethnicity <- coxph(Surv(time = ageevent, event = Ever_diagnosed) ~ Race_Ethnicity, AGE_CURRENT, data = selected_data)
# View the summary of the model
summary(cox_model_race_ethnicity)
## Call:
## coxph(formula = Surv(time = ageevent, event = Ever_diagnosed) ~
## Race_Ethnicity, data = selected_data, weights = AGE_CURRENT)
##
## n= 1378, number of events= 56
##
## coef exp(coef) se(coef) z Pr(>|z|)
## Race_Ethnicity 0.24784 1.28125 0.01803 13.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## Race_Ethnicity 1.281 0.7805 1.237 1.327
##
## Concordance= 0.585 (se = 0.039 )
## Likelihood ratio test= 197.6 on 1 df, p=<2e-16
## Wald test = 188.9 on 1 df, p=<2e-16
## Score (logrank) test = 188.3 on 1 df, p=<2e-16
# Summary Statistics
summary(selected_data)
## Education Race_Ethnicity Income Ever_diagnosed
## Length:1378 Min. :1.000 Min. : 1.000 Min. :0.00000
## Class :character 1st Qu.:3.000 1st Qu.: 4.250 1st Qu.:0.00000
## Mode :character Median :3.000 Median : 7.000 Median :0.00000
## Mean :2.811 Mean : 7.185 Mean :0.04064
## 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.:0.00000
## Max. :5.000 Max. :99.000 Max. :1.00000
## NA's :64
## AGE_TOLD_PROSTATE PROSTATE_ENLARGE AGE_PSA_TEST ROUTINE_SCREENING
## Min. :17.00 Min. :1.000 Min. : NA Min. :1.000
## 1st Qu.:64.75 1st Qu.:2.000 1st Qu.: NA 1st Qu.:1.000
## Median :69.50 Median :2.000 Median : NA Median :1.000
## Mean :68.52 Mean :1.808 Mean :NaN Mean :1.129
## 3rd Qu.:75.00 3rd Qu.:2.000 3rd Qu.: NA 3rd Qu.:1.000
## Max. :85.00 Max. :9.000 Max. : NA Max. :2.000
## NA's :1322 NA's :100 NA's :1378 NA's :1347
## PSA_TOTAL AGE_CURRENT GENDER CITIZEN_STATUS FIVE_MORE_DRINK
## Min. : 0.10 Min. :40.00 Min. :1 Min. :1.000 Min. :1.000
## 1st Qu.: 0.60 1st Qu.:49.00 1st Qu.:1 1st Qu.:1.000 1st Qu.:1.000
## Median : 1.00 Median :61.00 Median :1 Median :1.000 Median :2.000
## Mean : 1.83 Mean :61.49 Mean :1 Mean :1.089 Mean :1.738
## 3rd Qu.: 1.90 3rd Qu.:72.00 3rd Qu.:1 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :72.70 Max. :85.00 Max. :1 Max. :7.000 Max. :2.000
## NA's :125 NA's :158
## HEALTH_INSURANCE INSURANCE_LAPSE ageevent AGE_INTERVAL
## Min. :1.000 Min. :0.000 Min. :17.00 Min. :5.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:49.00 1st Qu.:5.000
## Median :1.000 Median :4.000 Median :61.00 Median :7.000
## Mean :1.122 Mean :3.473 Mean :61.19 Mean :6.743
## 3rd Qu.:1.000 3rd Qu.:4.500 3rd Qu.:72.00 3rd Qu.:8.000
## Max. :2.000 Max. :5.000 Max. :85.00 Max. :9.000
## NA's :11 NA's :1211
## AGE_INTERVAL_LABEL
## 40-49 :351
## 60-69 :327
## 70-79 :278
## 50-59 :258
## 80-89 :164
## 0-9 : 0
## (Other): 0
summary(selected_data$Age)
## Warning: Unknown or uninitialised column: `Age`.
## Length Class Mode
## 0 NULL NULL
# Data Visualization
# Create a bar plot for the distribution of Education levels
ggplot(selected_data, aes(x = Education)) +
geom_bar() +
labs(
title = "Distribution of Education Levels",
x = "Education Level",
y = "Frequency"
)
# Sample Size Reduction
# Calculate the initial sample size
initial_sample_size <- nrow(selected_data)
# Apply restrictions or data cleaning
# For example, you can remove rows with missing values in specific columns
selected_data_cleaned <- selected_data %>%
filter(!is.na(Education), !is.na(Race_Ethnicity))
# Calculate the sample size after data cleaning
cleaned_sample_size <- nrow(selected_data_cleaned)
# Create a table to show the sample size reduction
sample_size_table <- data.frame(
Step = c("Initial", "After Data Cleaning"),
Sample_Size = c(initial_sample_size, cleaned_sample_size)
)
# Print the sample size table
print(sample_size_table)
## Step Sample_Size
## 1 Initial 1378
## 2 After Data Cleaning 1374