Data source: https://mavenanalytics.io/data-playground/hcahps-patient-survey
The HCAHPS Patient Survey dataset contains approximately 4,000 to 5,000 records (each representing an individual hospital or healthcare facility) and around 20–25 columns describing patient experience and hospital performance. Among these, roughly 12–15 columns are numeric variables, including:
Categorical variables include:
About 5–10% of cells contain missing values, mostly in hospitals with low response counts or suppressed results.
#
#Install and Load Required Packages
#install.packages("tidyverse")
library(tidyverse)
#Import dataset
df_responses <- read_csv("C:/Users/DELL XPS 13/Downloads/HCAHPS+Patient+Survey/data_tables/responses.csv")
df_questions <- read_csv("C:/Users/DELL XPS 13/Downloads/HCAHPS+Patient+Survey/data_tables/questions.csv")
df_state_results <- read_csv("C:/Users/DELL XPS 13/Downloads/HCAHPS+Patient+Survey/data_tables/state_results.csv")
df_states <- read_csv("C:/Users/DELL XPS 13/Downloads/HCAHPS+Patient+Survey/data_tables/states.csv")
df_measures <- read_csv("C:/Users/DELL XPS 13/Downloads/HCAHPS+Patient+Survey/data_tables/measures.csv")
df_reports <- read_csv("C:/Users/DELL XPS 13/Downloads/HCAHPS+Patient+Survey/data_tables/reports.csv")
#Loading confirmation
cat("--- Data Loading Complete ---\n")
--- Data Loading Complete ---
#
#
dims_matrix <- sapply(list(state_results = df_state_results,
states = df_states,
measures = df_measures,
reports = df_reports,
response = df_responses,
questions = df_questions),
dim)
# Convert to a data frame and transpose for better readability
dims_table <- as.data.frame(t(dims_matrix))
colnames(dims_table) <- c("Rows", "Columns") # Rename columns for clarity
cat("Dimensions of main tables:
\n")
Dimensions of main tables:
print(dims_table)
#
#
df_analysis <- df_state_results %>%
left_join(df_states, by = c("State" = "State")) %>%
left_join(df_measures, by = c("Measure ID" = "Measure ID")) %>%
left_join(df_reports, by = c("Release Period" = "Release Period"))
df_analysis <- df_analysis %>%
rename(
State_Abbreviation = State,
Measure_ID = `Measure ID`,
Bottom_Box_Pct = `Bottom-box Percentage`,
Middle_Box_Pct = `Middle-box Percentage`,
Top_Box_Pct = `Top-box Percentage`,
State_Name = `State Name`
) %>%
mutate(
Start_Date = as.Date(`Start Date`),
End_Date = as.Date(`End Date`),
Year = as.numeric(substring(`Release Period`, 4)),
Numeric_Time = as.numeric(as.Date(paste0(str_sub(`Release Period`, 4, 7), "-", str_sub(`Release Period`, 1, 2), "-01"), format = "%Y-%m-%d")) / 365.25
)
cat("\n--- Structure of the Merged Data ---
\n")
--- Structure of the Merged Data ---
print(glimpse(df_analysis))
Rows: 4,580
Columns: 16
$ `Release Period` <chr> "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "07_2…
$ State_Abbreviation <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A…
$ Measure_ID <chr> "H_CLEAN_HSP", "H_COMP_1", "H_COMP_2", "H_COMP_3", "H_COMP_5", "H_COMP_6", "H_COMP_7", "H_HSP_RATING", "…
$ Bottom_Box_Pct <dbl> 8, 9, 10, 11, 19, 15, 8, 13, 8, 7, 10, 5, 3, 10, 17, 15, 6, 7, 6, 5, 8, 5, 4, 9, 18, 17, 6, 8, 7, 5, 10,…
$ Middle_Box_Pct <dbl> 22, 17, 15, 21, 17, 0, 43, 22, 34, 23, 18, 14, 11, 21, 16, 0, 41, 20, 23, 24, 18, 15, 12, 22, 17, 0, 44,…
$ Top_Box_Pct <dbl> 70, 74, 75, 68, 64, 85, 49, 65, 58, 70, 72, 81, 86, 69, 67, 85, 53, 73, 71, 71, 74, 80, 84, 69, 65, 83, …
$ State_Name <chr> "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Ala…
$ Region <chr> "Pacific", "Pacific", "Pacific", "Pacific", "Pacific", "Pacific", "Pacific", "Pacific", "Pacific", "Paci…
$ Measure <chr> "Cleanliness of Hospital Environment", "Communication with Nurses", "Communication with Doctors", "Respo…
$ Type <chr> "Individual Item", "Composite Measure", "Composite Measure", "Composite Measure", "Composite Measure", "…
$ `Start Date` <date> 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10…
$ `End Date` <date> 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09…
$ Start_Date <date> 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10…
$ End_Date <date> 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09…
$ Year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 20…
$ Numeric_Time <dbl> 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.4…
HCAHPS_ANALYSIS_DATA <- df_analysis
#
df_responses_clean <- df_responses %>%
mutate(
`Completed Surveys` = na_if(`Completed Surveys`, "Not Available"),
`Response Rate (%)` = na_if(`Response Rate (%)`, "Not Available"))
#
df_responses_clean <- df_responses_clean %>%
mutate(Response_Rate_Num = as.numeric(`Response Rate (%)`),
Completed_Surveys_Proxy = case_when(
`Completed Surveys` == "Fewer than 100" ~ 50, # Midpoint of 0-99
`Completed Surveys` == "Between 100 and 299" ~ 200, # Midpoint of 100-299
`Completed Surveys` == "300 or more" ~ 400,
TRUE ~ NA_real_ # Treat any remaining non-match as NA
)
)
#
df_responses_agg <- df_responses_clean %>%
filter(!is.na(Response_Rate_Num)) %>% # This is to filter out NA values before aggregation
group_by(`Release Period`, State) %>%
summarise(
Average_Response_Rate = mean(Response_Rate_Num, na.rm = TRUE),
Median_Surveys_Proxy = median(Completed_Surveys_Proxy, na.rm = TRUE),
.groups = 'drop'
)
#
HCAHPS_ANALYSIS_DATA <- df_analysis %>%
left_join(df_responses_agg, by = c("Release Period", "State_Abbreviation" = "State"))
# Data Cleaning on the main data
HCAHPS_ANALYSIS_DATA <- HCAHPS_ANALYSIS_DATA %>%
mutate(
across(c(Bottom_Box_Pct, Middle_Box_Pct, Top_Box_Pct), as.numeric)
) %>% mutate(
Region = as.factor(Region),
Type = as.factor(Type),
Measure = as.factor(Measure)
)
print(colSums(is.na(HCAHPS_ANALYSIS_DATA)))
Release Period State_Abbreviation Measure_ID Bottom_Box_Pct Middle_Box_Pct Top_Box_Pct
0 0 0 0 0 0
State_Name Region Measure Type Start Date End Date
0 0 0 0 0 0
Start_Date End_Date Year Numeric_Time Average_Response_Rate Median_Surveys_Proxy
0 0 0 0 0 4070
print(glimpse(HCAHPS_ANALYSIS_DATA))
Rows: 4,580
Columns: 18
$ `Release Period` <chr> "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "07_2015", "0…
$ State_Abbreviation <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AL", "AL", "AL", "AL", "AL", "AL", "AL",…
$ Measure_ID <chr> "H_CLEAN_HSP", "H_COMP_1", "H_COMP_2", "H_COMP_3", "H_COMP_5", "H_COMP_6", "H_COMP_7", "H_HSP_RATING"…
$ Bottom_Box_Pct <dbl> 8, 9, 10, 11, 19, 15, 8, 13, 8, 7, 10, 5, 3, 10, 17, 15, 6, 7, 6, 5, 8, 5, 4, 9, 18, 17, 6, 8, 7, 5, …
$ Middle_Box_Pct <dbl> 22, 17, 15, 21, 17, 0, 43, 22, 34, 23, 18, 14, 11, 21, 16, 0, 41, 20, 23, 24, 18, 15, 12, 22, 17, 0, …
$ Top_Box_Pct <dbl> 70, 74, 75, 68, 64, 85, 49, 65, 58, 70, 72, 81, 86, 69, 67, 85, 53, 73, 71, 71, 74, 80, 84, 69, 65, 8…
$ State_Name <chr> "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "Alaska", "…
$ Region <fct> Pacific, Pacific, Pacific, Pacific, Pacific, Pacific, Pacific, Pacific, Pacific, Pacific, East South …
$ Measure <fct> Cleanliness of Hospital Environment, Communication with Nurses, Communication with Doctors, Responsiv…
$ Type <fct> Individual Item, Composite Measure, Composite Measure, Composite Measure, Composite Measure, Composit…
$ `Start Date` <date> 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013…
$ `End Date` <date> 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014…
$ Start_Date <date> 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013-10-01, 2013…
$ End_Date <date> 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014-09-30, 2014…
$ Year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,…
$ Numeric_Time <dbl> 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 45.49487, 4…
$ Average_Response_Rate <dbl> 25.12500, 25.12500, 25.12500, 25.12500, 25.12500, 25.12500, 25.12500, 25.12500, 25.12500, 25.12500, 3…
$ Median_Surveys_Proxy <dbl> 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 2…
cat("\n
... Data Cleaning and Pre-processing complete! The HCAHPS_ANALYSIS_DATA is ready for EDA and modeling ...\n")
... Data Cleaning and Pre-processing complete! The HCAHPS_ANALYSIS_DATA is ready for EDA and modeling ...
#
#install.packages("scales")
library(scales)
overall_avg <- HCAHPS_ANALYSIS_DATA %>%
summarise(Avg_Top_Box_Pct = mean(Top_Box_Pct, na.rm = TRUE))
cat("Overall Average Top-Box Percentage (across all states, measures, and periods):
\n")
Overall Average Top-Box Percentage (across all states, measures, and periods):
print(paste0(round(overall_avg$Avg_Top_Box_Pct, 2), "%"))
[1] "71.33%"
##Average Top-Box Percentage by Measure Type
avg_by_type <- HCAHPS_ANALYSIS_DATA %>%
group_by(Type) %>%
summarise(
Avg_Top_Box_Pct = mean(Top_Box_Pct, na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(desc(Avg_Top_Box_Pct))
cat("\nAverage Top-Box Percentage by Measure Type:
\n")
Average Top-Box Percentage by Measure Type:
print(avg_by_type)
##Average Top-Box Percentage by US Census Region
avg_by_region <- HCAHPS_ANALYSIS_DATA %>%
group_by(Region) %>%
summarise(
Avg_Top_Box_Pct = mean(Top_Box_Pct, na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(desc(Avg_Top_Box_Pct))
cat("\nAverage Top-Box Percentage by US Census Region (Initial Insight into Geographic Differences):
\n")
Average Top-Box Percentage by US Census Region (Initial Insight into Geographic Differences):
print(avg_by_region)
#
#
##Top 5 and Bottom 5 Measures (Across All Time/States)
measure_performance <- HCAHPS_ANALYSIS_DATA %>%
group_by(Measure, Type) %>%
summarise(
Avg_Top_Box_Pct = mean(Top_Box_Pct, na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(desc(Avg_Top_Box_Pct))
top_measures <- head(measure_performance, 5)
bottom_measures <- tail(measure_performance, 5)
combined_measures <- bind_rows(top_measures, bottom_measures)
plot_measure_performance <- ggplot(combined_measures,
aes(x = reorder(Measure, Avg_Top_Box_Pct),
y = Avg_Top_Box_Pct, fill = Type)) +
geom_bar(stat = "identity", color = "black", size = 0.5) +
geom_text(aes(label = paste0(round(Avg_Top_Box_Pct, 1), "%")),
hjust = -0.1, size = 3) +
coord_flip() +
scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 20),
name = "Average Top-Box Percentage") +
scale_fill_manual(values = c("Composite Measure" = "#4e79a7",
"Global Item" = "#f28e2b",
"Individual Item" = "#e15759")) +
labs(title = "Best and Worst Performing HCAHPS Measures",
subtitle = "Average Top-Box Percentage (Highest Patient Satisfaction)",
x = "Measure") +
theme_minimal(base_size = 10) +
theme(plot.title = element_text(face = "bold"),
axis.text.y = element_text(size = 10))
print(plot_measure_performance)
##Top 5 and Bottom 5 States (Across All Time/Measures)
state_performance <- HCAHPS_ANALYSIS_DATA %>%
group_by(State_Name, Region) %>%
summarise(
Avg_Top_Box_Pct = mean(Top_Box_Pct, na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(desc(Avg_Top_Box_Pct))
top_states <- head(state_performance, 5)
bottom_states <- tail(state_performance, 5)
combined_states <- bind_rows(top_states, bottom_states)
plot_state_performance <- ggplot(combined_states,
aes(x = reorder(State_Name, Avg_Top_Box_Pct),
y = Avg_Top_Box_Pct, fill = Region)) +
geom_bar(stat = "identity", color = "white") +
geom_text(aes(label = paste0(round(Avg_Top_Box_Pct, 1), "%")),
hjust = -0.1, size = 3) +
coord_flip() +
scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 20),
name = "Average Top-Box Percentage") +
labs(title = "Top and Bottom 5 States for Patient Satisfaction",
subtitle = "Average Top-Box Percentage (All Measures Combined)",
x = "State") +
theme_bw(base_size = 10) +
scale_fill_brewer(palette = "Set2") +
theme(plot.title = element_text(face = "bold"),
legend.position = "bottom")
print(plot_state_performance)
#
#
## Calculating the national average Top-Box Percentage for each unique Measure (Measure_Name) in each reporting period (Numeric_Time)
df_trend <- HCAHPS_ANALYSIS_DATA %>%
group_by(Numeric_Time, Measure) %>%
summarise(
National_Avg_Pct = mean(Top_Box_Pct, na.rm = TRUE),
.groups = 'drop'
)
## Identify the consistently highest and lowest performing measures to highlight in the plot (optional subsetting for cleaner visual)
top_measures <- df_trend %>%
group_by(Measure) %>%
summarise(Avg_Pct = mean(National_Avg_Pct)) %>%
slice_max(Avg_Pct, n = 3) %>%
pull(Measure)
bottom_measures <- df_trend %>%
group_by(Measure) %>%
summarise(Avg_Pct = mean(National_Avg_Pct)) %>%
slice_min(Avg_Pct, n = 3) %>%
pull(Measure)
## Combining the measures to focus on
highlight_measures <- unique(c(top_measures, bottom_measures))
df_trend_highlight <- df_trend %>%
filter(Measure %in% highlight_measures)
time_breaks <- unique(df_trend$Numeric_Time)
custom_breaks <- time_breaks[c(1, 5, 9)]
trend_plot <- ggplot(df_trend_highlight,
aes(x = Numeric_Time, y = National_Avg_Pct,
color = Measure, group = Measure)) +
geom_line(linewidth = 1.2) +
geom_point(size = 3) +
# Custom y-axis to show percentages
scale_y_continuous(labels = label_percent(scale = 1, suffix = "%"),
name = "Average Top-Box Percentage") +
# Custom x-axis to map numeric time back to visible years
scale_x_continuous(breaks = custom_breaks,
labels = c("2015", "2019", "2023"),
name = "Report Period (Year)") +
labs(title = "National Trend in Patient Satisfaction Over Time (2015 - 2023)",
subtitle = "Comparing Consistently High and Low Performing Measures",
color = "Measure") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
legend.position = "bottom")
print(trend_plot)
cat("\n--- EDA Complete ---\n")
--- EDA Complete ---
#
#install.packages("lme4")
#install.packages("car")
#install.packages("broom.mixed")
library(tidyverse)
library(lme4)
library(broom.mixed)
cat("\n--- 1. ANOVA: Top-Box Percentage by Measure Type ---\n")
--- 1. ANOVA: Top-Box Percentage by Measure Type ---
aov_measure_type <- aov(Top_Box_Pct ~ Type, data = HCAHPS_ANALYSIS_DATA)
summary_aov_type <- summary(aov_measure_type)
print(summary_aov_type)
Df Sum Sq Mean Sq F value Pr(>F)
Type 2 14320 7160 69.17 <2e-16 ***
Residuals 4577 473760 104
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
## Post-Hoc Test (Tukey's HSD): Check the P-value. If it's less than 0.05, the differences are significant.
if (summary_aov_type[[1]]$`Pr(>F)`[1] < 0.05) {
cat("\n-- Post-Hoc (Tukey HSD) for Measure Type (Measures that are significantly different) --\n")
tukey_type <- TukeyHSD(aov_measure_type)
print(tukey_type)
} else {
cat("\nANOVA P-value is > 0.05. No statistically significant difference between measure types found.\n")
}
-- Post-Hoc (Tukey HSD) for Measure Type (Measures that are significantly different) --
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Top_Box_Pct ~ Type, data = HCAHPS_ANALYSIS_DATA)
$Type
diff lwr upr p adj
Global Item-Composite Measure -0.8540757 -1.764102 0.05595103 0.0712129
Individual Item-Composite Measure -4.5560408 -5.466067 -3.64601404 0.0000000
Individual Item-Global Item -3.7019651 -4.816516 -2.58741451 0.0000000
# Visualization of Box Plot
aov_type_plot <- HCAHPS_ANALYSIS_DATA %>%
ggplot(aes(x = Type, y = Top_Box_Pct, fill = Type)) +
geom_boxplot(alpha = 0.8, outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.1, color = "black") +
labs(
title = "Patient Satisfaction (Top-Box %) Distribution by Type",
subtitle = "ANOVA analysis confirms if these observed differences are statistically significant.",
x = "HCAHPS Measure Category",
y = "Top-Box Percentage (%)",
fill = "Type"
) +
scale_y_continuous(limits = c(30, 100)) + # Set limits for better comparison
theme_minimal(base_size = 10) +
theme(plot.title = element_text(face = "bold"),
legend.position = "none")
print(aov_type_plot)
cat("\n--- ANOVA: Top-Box Percentage by Region ---\n")
--- ANOVA: Top-Box Percentage by Region ---
aov_region <- aov(Top_Box_Pct ~ Region, data = HCAHPS_ANALYSIS_DATA)
summary_aov_region <- summary(aov_region)
print(summary_aov_region)
Df Sum Sq Mean Sq F value Pr(>F)
Region 8 21235 2654.3 25.99 <2e-16 ***
Residuals 4571 466845 102.1
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Post-Hoc Test (Tukey's HSD) for Regional differences:
if (summary_aov_region[[1]]$`Pr(>F)`[1] < 0.05) {
cat("\n-- Post-Hoc (Tukey HSD) for Region --\n")
tukey_region <- TukeyHSD(aov_region)
print(tukey_region)
}
-- Post-Hoc (Tukey HSD) for Region --
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Top_Box_Pct ~ Region, data = HCAHPS_ANALYSIS_DATA)
$Region
diff lwr upr p adj
East South Central-East North Central -0.02166667 -2.2392475 2.1959142 1.0000000
Mid-Atlantic-East North Central -5.08370370 -7.4978999 -2.6695075 0.0000000
Mountain-East North Central -1.22583333 -3.1104147 0.6587480 0.5303374
New England-East North Central -1.21148148 -3.2132273 0.7902643 0.6290293
Pacific-East North Central -2.42888889 -4.5196442 -0.3381336 0.0095679
South Atlantic-East North Central -3.56125000 -5.4092340 -1.7132660 0.0000001
West North Central-East North Central 2.51365079 0.5779875 4.4493140 0.0018589
West South Central-East North Central 1.32277778 -0.8948031 3.5403586 0.6477020
Mid-Atlantic-East South Central -5.06203704 -7.5868639 -2.5372102 0.0000001
Mountain-East South Central -1.20416667 -3.2285318 0.8201984 0.6512329
New England-East South Central -1.18981481 -3.3236830 0.9440533 0.7277900
Pacific-East South Central -2.40722222 -4.6248031 -0.1896414 0.0217036
South Atlantic-East South Central -3.53958333 -5.5299231 -1.5492436 0.0000013
West North Central-East South Central 2.53531746 0.4633137 4.6073212 0.0046815
West South Central-East South Central 1.34444444 -0.9930910 3.6819799 0.6924107
Mountain-Mid-Atlantic 3.85787037 1.6198506 6.0958902 0.0000033
New England-Mid-Atlantic 3.87222222 1.5346868 6.2097577 0.0000102
Pacific-Mid-Atlantic 2.65481481 0.2406186 5.0690111 0.0187396
South Atlantic-Mid-Atlantic 1.52245370 -0.6848367 3.7297441 0.4456123
West North Central-Mid-Atlantic 7.59735450 5.3161535 9.8785555 0.0000000
West South Central-Mid-Atlantic 6.40648148 3.8816546 8.9313083 0.0000000
New England-Mountain 0.01435185 -1.7709703 1.7996740 1.0000000
Pacific-Mountain -1.20305556 -3.0876369 0.6815258 0.5568065
South Atlantic-Mountain -2.33541667 -3.9464518 -0.7243815 0.0002412
West North Central-Mountain 3.73948413 2.0285834 5.4503849 0.0000000
West South Central-Mountain 2.54861111 0.5242460 4.5729762 0.0030519
Pacific-New England -1.21740741 -3.2191532 0.7843384 0.6226460
South Atlantic-New England -2.34976852 -4.0964149 -0.6031221 0.0010141
West North Central-New England 3.72513228 1.8859692 5.5642953 0.0000001
West South Central-New England 2.53425926 0.4003911 4.6681274 0.0071310
South Atlantic-Pacific -1.13236111 -2.9803451 0.7156229 0.6127308
West North Central-Pacific 4.94253968 3.0068764 6.8782029 0.0000000
West South Central-Pacific 3.75166667 1.5340858 5.9692475 0.0000057
West North Central-South Atlantic 6.07490079 4.4043980 7.7454036 0.0000000
West South Central-South Atlantic 4.88402778 2.8936881 6.8743675 0.0000000
West South Central-West North Central -1.19087302 -3.2628767 0.8811307 0.6932527
region_aov_plot <- HCAHPS_ANALYSIS_DATA %>%
ggplot(aes(x = Region, y = Top_Box_Pct, fill = Region)) +
geom_boxplot(alpha = 0.7, show.legend = FALSE) +
labs(
title = "Patient Satisfaction Distribution Across US Regions",
subtitle = "Confirmed by ANOVA and Tukey Post-Hoc Test",
x = "US Census Region",
y = "Top-Box Percentage (%)"
) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(region_aov_plot)
aov_region <- aov(Top_Box_Pct ~ Region, data = HCAHPS_ANALYSIS_DATA)
print(summary(aov_region))
Df Sum Sq Mean Sq F value Pr(>F)
Region 8 21235 2654.3 25.99 <2e-16 ***
Residuals 4571 466845 102.1
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
This is to determine if states with higher response rates or completed surveys report different levels of patient satisfaction.
cat("\n--- Linear Regression: Predicting Top-Box % with Response Rate ---\n")
--- Linear Regression: Predicting Top-Box % with Response Rate ---
lm_response_rate <- lm(Top_Box_Pct ~ Average_Response_Rate,
data = HCAHPS_ANALYSIS_DATA)
print(lm_response_rate)
Call:
lm(formula = Top_Box_Pct ~ Average_Response_Rate, data = HCAHPS_ANALYSIS_DATA)
Coefficients:
(Intercept) Average_Response_Rate
60.6883 0.3981
##Scatter Plot with Regression Line
plot_response_rate <- HCAHPS_ANALYSIS_DATA %>%
# Filter out potential NA values that might have been introduced during cleaning
filter(!is.na(Average_Response_Rate)) %>%
ggplot(aes(x = Average_Response_Rate, y = Top_Box_Pct)) +
geom_point(alpha = 0.3, color = "#0072B2") +
# Add the linear regression line
geom_smooth(method = "lm", color = "#D55E00", linewidth = 1.2, se = TRUE) +
labs(
title = "Does Response Rate Predict Patient Satisfaction?",
subtitle = paste("R-squared:", round(summary_lm_response$r.squared, 4), " | P-value for Slope:",
format.pval(summary_lm_response$coefficients[2, 4], digits = 3)),
x = "Average Survey Response Rate (%)",
y = "Average Top-Box Percentage (%)"
) +
scale_y_continuous(labels = scales::label_percent(scale = 1, suffix = "%")) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
print(plot_response_rate)
cat("\n--- Linear Regression: Predicting Top-Box % with Completed Surveys (Proxy) ---\n")
--- Linear Regression: Predicting Top-Box % with Completed Surveys (Proxy) ---
lm_survey_count <- lm(Top_Box_Pct ~ Median_Surveys_Proxy,
data = HCAHPS_ANALYSIS_DATA)
print(lm_survey_count)
Call:
lm(formula = Top_Box_Pct ~ Median_Surveys_Proxy, data = HCAHPS_ANALYSIS_DATA)
Coefficients:
(Intercept) Median_Surveys_Proxy
74.23045 -0.01012
##Scatter Plot with Regression Line
plot_survey_count <- HCAHPS_ANALYSIS_DATA %>%
filter(!is.na(Median_Surveys_Proxy)) %>%
ggplot(aes(x = Median_Surveys_Proxy, y = Top_Box_Pct)) +
geom_point(alpha = 0.3, color = "#0072B2") +
# Add the linear regression line
geom_smooth(method = "lm", color = "#D55E00", linewidth = 1.2, se = TRUE) +
labs(
title = "Does Survey Volume Predict Patient Satisfaction?",
subtitle = paste("R-squared:", round(summary_lm_surveys$r.squared, 4), " | P-value for Slope:",
format.pval(summary_lm_surveys$coefficients[2, 4], digits = 3)),
x = "Median Completed Surveys (Numeric Proxy)",
y = "Average Top-Box Percentage (%)"
) +
scale_y_continuous(labels = scales::label_percent(scale = 1, suffix = "%")) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
print(plot_survey_count)
#
This is to quantify the average annual change in Top-Box Percentage over the entire period. The ‘Numeric_Time’ variable is used as the predictor. The coefficient of Numeric_Time represents the change per unit of time (approximately 1 year).
cat("\n--- 5. Time Series Regression: Trend Over Time ---\n")
--- 5. Time Series Regression: Trend Over Time ---
lm_time_trend <- lm(Top_Box_Pct ~ Numeric_Time, data = HCAHPS_ANALYSIS_DATA)
summary_lm_time <- summary(lm_time_trend)
print(summary_lm_time)
Call:
lm(formula = Top_Box_Pct ~ Numeric_Time, data = HCAHPS_ANALYSIS_DATA)
Residuals:
Min 1Q Median 3Q Max
-29.737 -6.231 0.465 8.263 19.870
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 76.3385 2.9294 26.059 <2e-16 ***
Numeric_Time -0.1011 0.0591 -1.711 0.0871 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.32 on 4578 degrees of freedom
Multiple R-squared: 0.0006393, Adjusted R-squared: 0.000421
F-statistic: 2.929 on 1 and 4578 DF, p-value: 0.08709
national_trend_data <- HCAHPS_ANALYSIS_DATA %>%
group_by(Numeric_Time, Start_Date) %>%
summarise(Avg_Top_Box_Pct = mean(Top_Box_Pct, na.rm = TRUE), .groups = 'drop') %>%
mutate(Start_Date = as.Date(Start_Date, format = "%Y-%m-%d"))
time_trend_plot <- national_trend_data %>%
ggplot(aes(x = Start_Date, y = Avg_Top_Box_Pct)) +
geom_line(color = "#10b981", linewidth = 1) +
geom_point(color = "#059669", size = 2) +
geom_smooth(method = "lm", se = TRUE, color = "#ef4444", linewidth = 0.5, linetype = "dashed") +
labs(
title = "National Patient Satisfaction Trend with Linear Fit",
subtitle = "The slope of the dashed line corresponds to the Numeric_Time coefficient.",
x = "Reporting Period",
y = "Average Top-Box Percentage (%)"
) +
theme_light(base_size = 14)
print(time_trend_plot)
lm_time_trend <- lm(Top_Box_Pct ~ Numeric_Time, data = HCAHPS_ANALYSIS_DATA)
print(summary(lm_time_trend))
Call:
lm(formula = Top_Box_Pct ~ Numeric_Time, data = HCAHPS_ANALYSIS_DATA)
Residuals:
Min 1Q Median 3Q Max
-29.737 -6.231 0.465 8.263 19.870
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 76.3385 2.9294 26.059 <2e-16 ***
Numeric_Time -0.1011 0.0591 -1.711 0.0871 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.32 on 4578 degrees of freedom
Multiple R-squared: 0.0006393, Adjusted R-squared: 0.000421
F-statistic: 2.929 on 1 and 4578 DF, p-value: 0.08709
cat("\n--- 3. Relationship Visualization: Top-Box % vs. Average Response Rate ---\n")
--- 3. Relationship Visualization: Top-Box % vs. Average Response Rate ---
# Plot the simple linear regression of Top-Box % vs. Average_Response_Rate.
response_rate_plot <- HCAHPS_ANALYSIS_DATA %>%
ggplot(aes(x = Average_Response_Rate, y = Top_Box_Pct)) +
# Use transparency for geom_point because data points overlap
geom_point(alpha = 0.2, color = "#2563eb") +
geom_smooth(method = "lm", color = "#4c1d95", fill = "#57d2fe") + # Add regression line
labs(
title = "Does Response Rate Predict Patient Satisfaction?",
subtitle = "Scatter plot with fitted linear regression line (Model 2.1)",
x = "Average Response Rate (%)",
y = "Top-Box Percentage (%)"
) +
theme_minimal(base_size = 10)
print(response_rate_plot)
lm_response_rate <- lm(Top_Box_Pct ~ Average_Response_Rate, data = HCAHPS_ANALYSIS_DATA)
print(summary(lm_response_rate))
Call:
lm(formula = Top_Box_Pct ~ Average_Response_Rate, data = HCAHPS_ANALYSIS_DATA)
Residuals:
Min 1Q Median 3Q Max
-28.4699 -6.1736 0.5404 7.9713 20.0966
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 60.68831 0.85483 71.00 <2e-16 ***
Average_Response_Rate 0.39809 0.03148 12.65 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.15 on 4578 degrees of freedom
Multiple R-squared: 0.03376, Adjusted R-squared: 0.03355
F-statistic: 160 on 1 and 4578 DF, p-value: < 2.2e-16