{
library(readxl)
library(writexl)
library(gtsummary)
library(dplyr)
library(plyr)
library(parameters)
df = read_xlsx("CRC+Health+Survey_May+28,+2025_09.42/CRC Health Survey_May 28, 2025_09.42.xlsx", sheet = "Updated")
dictionary = names(df)
names(df) = unlist(df[1,])
df = df[-1,]
#age
df$age = 2025 - as.numeric(df$Q8)
# Let's work on CRC ever and up-to-date status
# table(df$Q20, df$Q21, useNA = "ifany")
# table(df$Q22, df$Q23, useNA = "ifany")
# Assuming Don't know/missing are "No"
df$ever = "No"
df$ever[df$Q20 %in% "Yes" | df$Q22 %in% "Yes"] = "Yes"
df$uptodate = "No"
df$uptodate[(df$Q20 %in% "Yes" & df$Q21 %in% "Less than 10 years age") |
df$Q22 %in% "Yes" & df$Q23 %in% "Less than 1 year ago"] = "Yes"
df$Q14 = factor(df$Q14, levels = c("Less than $10,000", "$15,000 to under $20,000",
"$50,000 to under $75,000", "$75,000 to under $100,000",
"$100,000 to under $200,000"))
df$Q11 = factor(df$Q11, levels = c("8 through 11 years", "12 years or completed high school", "College graduate"))
df$Q17 = factor(df$Q17, levels = c(c("Employed, full time", "Employed, part time", "Self employed", "Homemaker", "Student", "Unemployed", "Disabled")))
df$checkup = df$Q34
df$checkup[df$checkup %in% c("More than 2 up to 5 years ago", "More than 5 years ago")] = "More than 2 years ago"
df$checkup = factor(df$checkup, levels = c("One year ago or less", "More than 1 up to 2 years ago", "More than 2 years ago"))
df$generalhealth = df$Q35
df$generalhealth[df$generalhealth %in% c("Excellent", "Very good")] = c("Very good/Excellent")
df$generalhealth[df$generalhealth %in% c("Poor", "Fair")] = c("Poor/Fair")
df$generalhealth = factor(df$generalhealth, levels = c("Poor/Fair", "Good", "Very good/Excellent"))
}
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:gtsummary':
##
## mutate
## New names:
## • `` -> `...12`
## • `Have you ever been diagnosed with one of the following? - Selected Choice`
## -> `Have you ever been diagnosed with one of the following? - Selected
## Choice...71`
## • `Have you ever been diagnosed with one of the following? - Selected Choice`
## -> `Have you ever been diagnosed with one of the following? - Selected
## Choice...75`
df$Q16[df$Q15 %in% "No"] = "None"
df$healthinsurance = df$Q16
df$healthinsurance = mapvalues(df$healthinsurance, from = c("A plan purchased through an employer or union (includes plans purchased through another person's employer)",
"Alaska Native, Indian Health Service, Tribal Health Services",
"Medicaid or other state program", "Medicare", "None"),
to = c("Private", "Public","Public", "Public", "None"))
df$healthinsurance = factor(df$healthinsurance, levels = c("None", "Private", "Public"))
df$Q42 = factor(df$Q42, levels = c("No", "Yes", "Don't know"))
#county
table(df$Q4, useNA = "ifany")
##
## Amador Butte Fresno Merced Sacramento San Joaquin
## 3 11 30 21 16 14
## Yolo
## 5
tab1_columns = c("age", "Q10","Q9", "Q12",
"Q11","Q14", "Q17", "ever")
dftab1 = df[tab1_columns]
tab1_names = c("Age", "Sex", "Country", "Language spoken at home", "Education", "Household income", "Occupation", "ever")
names(dftab1) = tab1_names
dftab1|>
gtsummary::tbl_summary(
by = "ever",
missing_text = "Missing"
) |>
gtsummary::add_p() |>
gtsummary::add_overall() |>
gtsummary::bold_labels() |>
gtsummary::separate_p_footnotes()
Characteristic | Overall N = 1001 |
No N = 571 |
Yes N = 431 |
p-value |
---|---|---|---|---|
Age | 54 (50, 63) | 54 (50, 60) | 54 (50, 65) | 0.72 |
Sex | 0.23 | |||
Female | 63 (63%) | 33 (58%) | 30 (70%) | |
Male | 37 (37%) | 24 (42%) | 13 (30%) | |
Country | <0.0014 | |||
Laos | 53 (53%) | 39 (68%) | 14 (33%) | |
Thailand | 36 (36%) | 15 (26%) | 21 (49%) | |
USA | 11 (11%) | 3 (5.3%) | 8 (19%) | |
Language spoken at home | <0.0013 | |||
English | 17 (17%) | 15 (26%) | 2 (4.7%) | |
English,Hmong | 44 (44%) | 15 (26%) | 29 (67%) | |
Hmong | 39 (39%) | 27 (47%) | 12 (28%) | |
Education | 0.64 | |||
8 through 11 years | 29 (29%) | 15 (26%) | 14 (33%) | |
12 years or completed high school | 63 (63%) | 36 (63%) | 27 (63%) | |
College graduate | 8 (8.0%) | 6 (11%) | 2 (4.7%) | |
Household income | <0.0014 | |||
Less than $10,000 | 26 (32%) | 15 (38%) | 11 (26%) | |
$15,000 to under $20,000 | 18 (22%) | 8 (21%) | 10 (23%) | |
$50,000 to under $75,000 | 15 (18%) | 4 (10%) | 11 (26%) | |
$75,000 to under $100,000 | 14 (17%) | 3 (7.7%) | 11 (26%) | |
$100,000 to under $200,000 | 9 (11%) | 9 (23%) | 0 (0%) | |
Missing | 18 | 18 | 0 | |
Occupation | <0.0014 | |||
Employed, full time | 29 (29%) | 13 (23%) | 16 (37%) | |
Employed, part time | 15 (15%) | 5 (8.8%) | 10 (23%) | |
Self employed | 6 (6.0%) | 0 (0%) | 6 (14%) | |
Homemaker | 10 (10%) | 5 (8.8%) | 5 (12%) | |
Student | 6 (6.0%) | 6 (11%) | 0 (0%) | |
Unemployed | 30 (30%) | 24 (42%) | 6 (14%) | |
Disabled | 4 (4.0%) | 4 (7.0%) | 0 (0%) | |
1 Median (Q1, Q3); n (%) | ||||
2 Wilcoxon rank sum test | ||||
3 Pearson’s Chi-squared test | ||||
4 Fisher’s exact test |
tab2_columns = c("healthinsurance","generalhealth","Q18", "Q36",
"checkup","Q26","Q42","ever")
dftab2 = df[tab2_columns]
tab2_names = c("Health insurance","General health","Getting medical advice", "Medical delay",
"Routine check-up", "Smoked >100 cigarettes", "Family history of cancer", "ever")
names(dftab2) = tab2_names
dftab2|>
gtsummary::tbl_summary(
by = "ever",
missing_text = "Missing"
) |>
gtsummary::add_p() |>
gtsummary::add_overall() |>
gtsummary::bold_labels() |>
gtsummary::separate_p_footnotes()
Characteristic | Overall N = 1001 |
No N = 571 |
Yes N = 431 |
p-value |
---|---|---|---|---|
Health insurance | 0.62 | |||
None | 9 (10%) | 6 (13%) | 3 (7.0%) | |
Private | 25 (28%) | 14 (30%) | 11 (26%) | |
Public | 56 (62%) | 27 (57%) | 29 (67%) | |
Missing | 10 | 10 | 0 | |
General health | 0.0163 | |||
Poor/Fair | 45 (45%) | 23 (40%) | 22 (51%) | |
Good | 38 (38%) | 28 (49%) | 10 (23%) | |
Very good/Excellent | 17 (17%) | 6 (11%) | 11 (26%) | |
Getting medical advice | 0.93 | |||
A little confident | 37 (37%) | 20 (35%) | 17 (40%) | |
Somewhat confident | 43 (43%) | 25 (44%) | 18 (42%) | |
Very confident | 20 (20%) | 12 (21%) | 8 (19%) | |
Medical delay | 0.0662 | |||
Don't know | 3 (3.0%) | 3 (5.3%) | 0 (0%) | |
No | 38 (38%) | 17 (30%) | 21 (49%) | |
Yes | 59 (59%) | 37 (65%) | 22 (51%) | |
Routine check-up | 0.23 | |||
One year ago or less | 31 (31%) | 18 (32%) | 13 (30%) | |
More than 1 up to 2 years ago | 47 (47%) | 23 (40%) | 24 (56%) | |
More than 2 years ago | 22 (22%) | 16 (28%) | 6 (14%) | |
Smoked >100 cigarettes | 36 (36%) | 12 (21%) | 24 (56%) | <0.0013 |
Family history of cancer | 0.43 | |||
No | 47 (47%) | 28 (49%) | 19 (44%) | |
Yes | 29 (29%) | 18 (32%) | 11 (26%) | |
Don't know | 24 (24%) | 11 (19%) | 13 (30%) | |
1 n (%) | ||||
2 Fisher’s exact test | ||||
3 Pearson’s Chi-squared test |
dfglm = df[c(tab1_columns[-length(tab1_columns)], tab2_columns)]
names(dfglm) = c(tab1_names[-length(tab1_names)], tab2_names)
dfglm$`Family history of cancer` = factor(dfglm$`Family history of cancer`)
dfglm$ever = ifelse(dftab1$ever %in% "Yes", 1, 0)
mod1 = glm(ever~Age+Sex+Education+Country+`Health insurance` +`Routine check-up`+`Family history of cancer`,family = binomial(link = "logit"), data = dfglm)
mod1 |>
parameters(exponentiate = TRUE) |>
print_md()
Parameter | Odds Ratio | SE | 95% CI | z | p |
---|---|---|---|---|---|
(Intercept) | 0.08 | 0.30 | (4.73e-05, 140.47) | -0.67 | 0.500 |
Age | 1.01 | 0.05 | (0.92, 1.11) | 0.19 | 0.846 |
Sex (Male) | 0.81 | 0.64 | (0.16, 3.74) | -0.27 | 0.790 |
Education (12 years or completed high school) | 0.46 | 0.37 | (0.09, 2.15) | -0.98 | 0.327 |
Education (College graduate) | 0.27 | 0.35 | (0.02, 3.22) | -1.01 | 0.311 |
Country (Thailand) | 39.15 | 42.18 | (6.16, 465.88) | 3.40 | < .001 |
Country (USA) | 11.41 | 14.95 | (0.94, 174.09) | 1.86 | 0.063 |
Health insurance (Private) | 4.54 | 7.27 | (0.20, 134.20) | 0.95 | 0.344 |
Health insurance (Public) | 0.63 | 0.77 | (0.05, 7.54) | -0.38 | 0.703 |
Routine check-up (More than 1 up to 2 years ago) | 10.79 | 12.78 | (1.23, 142.52) | 2.01 | 0.045 |
Routine check-up (More than 2 years ago) | 3.91 | 5.11 | (0.32, 60.18) | 1.04 | 0.296 |
Family history of cancer (Yes) | 0.11 | 0.13 | (8.78e-03, 1.02) | -1.84 | 0.066 |
Family history of cancer (Don’t know) | 1.05 | 1.03 | (0.15, 7.56) | 0.05 | 0.960 |