setwd("C:/Users/qfd738-4251/Desktop/My Class Stuff")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(ggplot2)
#Load data (adjusted skip since headers off)
raw_data <-read_excel("2025-County-Health-Rankings-Texas-Data-v3.xlsx", sheet = "Select Measure Data", skip = 1)
## New names:
## • `Unreliable` -> `Unreliable...4`
## • `95% CI - Low` -> `95% CI - Low...7`
## • `95% CI - High` -> `95% CI - High...8`
## • `National Z-Score` -> `National Z-Score...9`
## • `95% CI - Low` -> `95% CI - Low...39`
## • `95% CI - High` -> `95% CI - High...40`
## • `National Z-Score` -> `National Z-Score...41`
## • `Unreliable` -> `Unreliable...42`
## • `95% CI - Low` -> `95% CI - Low...44`
## • `95% CI - High` -> `95% CI - High...45`
## • `National Z-Score` -> `National Z-Score...46`
## • `95% CI - Low` -> `95% CI - Low...69`
## • `95% CI - High` -> `95% CI - High...70`
## • `National Z-Score` -> `National Z-Score...71`
## • `95% CI - Low` -> `95% CI - Low...73`
## • `95% CI - High` -> `95% CI - High...74`
## • `National Z-Score` -> `National Z-Score...75`
## • `National Z-Score` -> `National Z-Score...77`
## • `National Z-Score` -> `National Z-Score...84`
## • `National Z-Score` -> `National Z-Score...86`
## • `National Z-Score` -> `National Z-Score...90`
## • `National Z-Score` -> `National Z-Score...94`
## • `National Z-Score` -> `National Z-Score...98`
## • `National Z-Score` -> `National Z-Score...100`
## • `National Z-Score` -> `National Z-Score...107`
## • `95% CI - Low` -> `95% CI - Low...115`
## • `95% CI - High` -> `95% CI - High...116`
## • `National Z-Score` -> `National Z-Score...117`
## • `95% CI - Low` -> `95% CI - Low...119`
## • `95% CI - High` -> `95% CI - High...120`
## • `National Z-Score` -> `National Z-Score...130`
## • `95% CI - Low` -> `95% CI - Low...132`
## • `95% CI - High` -> `95% CI - High...133`
## • `National Z-Score` -> `National Z-Score...134`
## • `95% CI - Low` -> `95% CI - Low...152`
## • `95% CI - High` -> `95% CI - High...153`
## • `National Z-Score` -> `National Z-Score...154`
## • `National Z-Score` -> `National Z-Score...156`
## • `National Z-Score` -> `National Z-Score...158`
## • `95% CI - Low` -> `95% CI - Low...161`
## • `95% CI - High` -> `95% CI - High...162`
## • `National Z-Score` -> `National Z-Score...163`
## • `National Z-Score` -> `National Z-Score...165`
## • `Population` -> `Population...167`
## • `95% CI - Low` -> `95% CI - Low...169`
## • `95% CI - High` -> `95% CI - High...170`
## • `National Z-Score` -> `National Z-Score...171`
## • `Population` -> `Population...173`
## • `95% CI - Low` -> `95% CI - Low...175`
## • `95% CI - High` -> `95% CI - High...176`
## • `National Z-Score` -> `National Z-Score...177`
## • `National Z-Score` -> `National Z-Score...181`
## • `National Z-Score` -> `National Z-Score...185`
## • `95% CI - Low` -> `95% CI - Low...187`
## • `95% CI - High` -> `95% CI - High...188`
## • `National Z-Score` -> `National Z-Score...189`
## • `95% CI - Low` -> `95% CI - Low...197`
## • `95% CI - High` -> `95% CI - High...198`
## • `National Z-Score` -> `National Z-Score...199`
## • `National Z-Score` -> `National Z-Score...223`
## • `National Z-Score` -> `National Z-Score...225`
addl_data<-read_excel("2025-County-Health-Rankings-Texas-Data-v3.xlsx", sheet = "Additional Measure Data", skip = 1)
## New names:
## • `95% CI - Low` -> `95% CI - Low...5`
## • `95% CI - High` -> `95% CI - High...6`
## • `# Deaths` -> `# Deaths...28`
## • `95% CI - Low` -> `95% CI - Low...30`
## • `95% CI - High` -> `95% CI - High...31`
## • `# Deaths` -> `# Deaths...53`
## • `95% CI - Low` -> `95% CI - Low...55`
## • `95% CI - High` -> `95% CI - High...56`
## • `# Deaths` -> `# Deaths...78`
## • `95% CI - Low` -> `95% CI - Low...80`
## • `95% CI - High` -> `95% CI - High...81`
## • `95% CI - Low` -> `95% CI - Low...104`
## • `95% CI - High` -> `95% CI - High...105`
## • `95% CI - Low` -> `95% CI - Low...107`
## • `95% CI - High` -> `95% CI - High...108`
## • `95% CI - Low` -> `95% CI - Low...112`
## • `95% CI - High` -> `95% CI - High...113`
## • `95% CI - Low` -> `95% CI - Low...115`
## • `95% CI - High` -> `95% CI - High...116`
## • `# Deaths` -> `# Deaths...117`
## • `95% CI - Low` -> `95% CI - Low...119`
## • `95% CI - High` -> `95% CI - High...120`
## • `95% CI - Low` -> `95% CI - Low...144`
## • `95% CI - High` -> `95% CI - High...145`
## • `95% CI - Low` -> `95% CI - Low...151`
## • `95% CI - High` -> `95% CI - High...152`
## • `95% CI - Low` -> `95% CI - Low...154`
## • `95% CI - High` -> `95% CI - High...155`
## • `95% CI - Low` -> `95% CI - Low...180`
## • `95% CI - High` -> `95% CI - High...181`
## • `95% CI - Low` -> `95% CI - Low...185`
## • `95% CI - High` -> `95% CI - High...186`
## • `95% CI - Low` -> `95% CI - Low...189`
## • `95% CI - High` -> `95% CI - High...190`
## • `95% CI - Low` -> `95% CI - Low...213`
## • `95% CI - High` -> `95% CI - High...214`
## • `95% CI - Low` -> `95% CI - Low...216`
## • `95% CI - High` -> `95% CI - High...217`
## • `95% CI - Low` -> `95% CI - Low...220`
## • `95% CI - High` -> `95% CI - High...221`
## • `95% CI - Low` -> `95% CI - Low...224`
## • `95% CI - High` -> `95% CI - High...225`
## • `95% CI - Low` -> `95% CI - Low...231`
## • `95% CI - High` -> `95% CI - High...232`
## • `95% CI - Low` -> `95% CI - Low...235`
## • `95% CI - High` -> `95% CI - High...236`
## • `Average Grade Performance` -> `Average Grade Performance...246`
## • `Average Grade Performance (AIAN)` -> `Average Grade Performance
## (AIAN)...247`
## • `Average Grade Performance (Asian)` -> `Average Grade Performance
## (Asian)...248`
## • `Average Grade Performance (Black)` -> `Average Grade Performance
## (Black)...249`
## • `Average Grade Performance (Hispanic)` -> `Average Grade Performance
## (Hispanic)...250`
## • `Average Grade Performance (White)` -> `Average Grade Performance
## (White)...251`
## • `Average Grade Performance` -> `Average Grade Performance...252`
## • `Average Grade Performance (AIAN)` -> `Average Grade Performance
## (AIAN)...253`
## • `Average Grade Performance (Asian)` -> `Average Grade Performance
## (Asian)...254`
## • `Average Grade Performance (Black)` -> `Average Grade Performance
## (Black)...255`
## • `Average Grade Performance (Hispanic)` -> `Average Grade Performance
## (Hispanic)...256`
## • `Average Grade Performance (White)` -> `Average Grade Performance
## (White)...257`
## • `Segregation Index` -> `Segregation Index...258`
## • `95% CI - Low` -> `95% CI - Low...265`
## • `95% CI - High` -> `95% CI - High...266`
## • `95% CI - Low` -> `95% CI - Low...268`
## • `95% CI - High` -> `95% CI - High...269`
## • `Segregation Index` -> `Segregation Index...287`
## • `95% CI - Low` -> `95% CI - Low...289`
## • `95% CI - High` -> `95% CI - High...290`
## • `95% CI - Low` -> `95% CI - Low...314`
## • `95% CI - High` -> `95% CI - High...315`
## • `95% CI - Low` -> `95% CI - Low...339`
## • `95% CI - High` -> `95% CI - High...340`
## • `95% CI - Low` -> `95% CI - Low...363`
## • `95% CI - High` -> `95% CI - High...364`
## • `95% CI - Low` -> `95% CI - Low...366`
## • `95% CI - High` -> `95% CI - High...367`
## • `95% CI - Low` -> `95% CI - Low...386`
## • `95% CI - High` -> `95% CI - High...387`
## • `95% CI - Low` -> `95% CI - Low...391`
## • `95% CI - High` -> `95% CI - High...392`
Sheets selected, NAs removed, and Data Combined
## Sheets Selected and NAs Removed
select_keep <- raw_data %>% dplyr::select(FIPS, State, County, 'Preventable Hospitalization Rate','% Vaccinated','% with Annual Mammogram') %>% drop_na()
addl_keep<- addl_data %>% dplyr::select(FIPS, State, County, '% Rural', '% 65 and Over', '% Food Insecure') %>% drop_na()
## Data Combined
combined_df <- select_keep %>% dplyr::inner_join(addl_keep, by = c("FIPS", "State", "County")) %>% tidyr::drop_na()
## Two sheets were selected. Select Measure Data sheet has 239 observational variables, while additional Measure Data sheet has 254 observational variables. After NAs are dropped, the combined data sheet is left with 239 observational variables.
nrow(select_keep)
## [1] 239
nrow(addl_keep)
## [1] 254
nrow(combined_df)
## [1] 239
Dependent Variable
dv <- combined_df %>% dplyr::select (`Preventable Hospitalization Rate`)
pastecs::stat.desc(dv)
## Preventable Hospitalization Rate
## nbr.val 2.390000e+02
## nbr.null 0.000000e+00
## nbr.na 0.000000e+00
## min 9.060000e+02
## max 6.214000e+03
## range 5.308000e+03
## sum 7.255060e+05
## median 3.023000e+03
## mean 3.035590e+03
## SE.mean 6.307588e+01
## CI.mean.0.95 1.242583e+02
## var 9.508773e+05
## std.dev 9.751294e+02
## coef.var 3.212322e-01
round(t(pastecs::stat.desc(dv)), 2)
## nbr.val nbr.null nbr.na min max range sum
## Preventable Hospitalization Rate 239 0 0 906 6214 5308 725506
## median mean SE.mean CI.mean.0.95 var
## Preventable Hospitalization Rate 3023 3035.59 63.08 124.26 950877.3
## std.dev coef.var
## Preventable Hospitalization Rate 975.13 0.32
Independent Variables
ivs <- combined_df %>%
dplyr::select(
`% Rural`,
`% 65 and Over`,
`% Food Insecure`,
`% Vaccinated`,
`% with Annual Mammogram` )
pastecs::stat.desc(ivs)
## % Rural % 65 and Over % Food Insecure % Vaccinated
## nbr.val 2.390000e+02 239.0000000 239.0000000 239.0000000
## nbr.null 0.000000e+00 0.0000000 0.0000000 0.0000000
## nbr.na 0.000000e+00 0.0000000 0.0000000 0.0000000
## min 5.782581e-01 9.2394441 10.6000000 13.0000000
## max 1.000000e+02 45.2586207 28.5000000 54.0000000
## range 9.942174e+01 36.0191766 17.9000000 41.0000000
## sum 1.517925e+04 4510.1772943 4141.6000000 8526.0000000
## median 6.727475e+01 18.0983442 16.9000000 36.0000000
## mean 6.351151e+01 18.8710347 17.3288703 35.6736402
## SE.mean 2.235534e+00 0.3569098 0.1904404 0.5910705
## CI.mean.0.95 4.403960e+00 0.7031057 0.3751641 1.1643980
## var 1.194429e+03 30.4449165 8.6679445 83.4980838
## std.dev 3.456051e+01 5.5176912 2.9441373 9.1377286
## coef.var 5.441614e-01 0.2923894 0.1698978 0.2561479
## % with Annual Mammogram
## nbr.val 239.0000000
## nbr.null 0.0000000
## nbr.na 0.0000000
## min 14.0000000
## max 55.0000000
## range 41.0000000
## sum 8285.0000000
## median 36.0000000
## mean 34.6652720
## SE.mean 0.4980399
## CI.mean.0.95 0.9811293
## var 59.2824444
## std.dev 7.6995094
## coef.var 0.2221102
round(t(pastecs::stat.desc(ivs)), 2)
## nbr.val nbr.null nbr.na min max range sum
## % Rural 239 0 0 0.58 100.00 99.42 15179.25
## % 65 and Over 239 0 0 9.24 45.26 36.02 4510.18
## % Food Insecure 239 0 0 10.60 28.50 17.90 4141.60
## % Vaccinated 239 0 0 13.00 54.00 41.00 8526.00
## % with Annual Mammogram 239 0 0 14.00 55.00 41.00 8285.00
## median mean SE.mean CI.mean.0.95 var std.dev
## % Rural 67.27 63.51 2.24 4.40 1194.43 34.56
## % 65 and Over 18.10 18.87 0.36 0.70 30.44 5.52
## % Food Insecure 16.90 17.33 0.19 0.38 8.67 2.94
## % Vaccinated 36.00 35.67 0.59 1.16 83.50 9.14
## % with Annual Mammogram 36.00 34.67 0.50 0.98 59.28 7.70
## coef.var
## % Rural 0.54
## % 65 and Over 0.29
## % Food Insecure 0.17
## % Vaccinated 0.26
## % with Annual Mammogram 0.22
## Histogram for Independent Variables
Descriptive Statistics for Dependent and Independent Variables
desc_all <- pastecs::stat.desc(combined_df %>% dplyr::select(where(is.numeric)))
round(t(desc_all), 2)
## nbr.val nbr.null nbr.na min max range
## Preventable Hospitalization Rate 239 0 0 906.00 6214.00 5308.00
## % Vaccinated 239 0 0 13.00 54.00 41.00
## % with Annual Mammogram 239 0 0 14.00 55.00 41.00
## % Rural 239 0 0 0.58 100.00 99.42
## % 65 and Over 239 0 0 9.24 45.26 36.02
## % Food Insecure 239 0 0 10.60 28.50 17.90
## sum median mean SE.mean CI.mean.0.95
## Preventable Hospitalization Rate 725506.00 3023.00 3035.59 63.08 124.26
## % Vaccinated 8526.00 36.00 35.67 0.59 1.16
## % with Annual Mammogram 8285.00 36.00 34.67 0.50 0.98
## % Rural 15179.25 67.27 63.51 2.24 4.40
## % 65 and Over 4510.18 18.10 18.87 0.36 0.70
## % Food Insecure 4141.60 16.90 17.33 0.19 0.38
## var std.dev coef.var
## Preventable Hospitalization Rate 950877.31 975.13 0.32
## % Vaccinated 83.50 9.14 0.26
## % with Annual Mammogram 59.28 7.70 0.22
## % Rural 1194.43 34.56 0.54
## % 65 and Over 30.44 5.52 0.29
## % Food Insecure 8.67 2.94 0.17
## Multiple Histograms (Faceted View)
combined_df %>% dplyr::select(dplyr::where(is.numeric)) %>% pivot_longer(everything()) %>% ggplot(aes(x = value)) + geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "steelblue", color = "black", alpha = 0.6) + geom_density(color = "red",linewidth = 1) + facet_wrap(~ name, scales = "free") + labs(title = "Histograms with Density Curves for Regression Variables", x = "Value", y = "Density") + theme_minimal()

## Dependent Variable: Preventable Hospitalization Rate. After removing NAs, Preventable Hospitalization Rate has 239 observations with no missing values, so the dependent variable is complete in the dataset. Its minimum is 906 and its maximum is 6,214, which is a very widespread and suggests substantial differences in preventable hospital use across the study areas. The mean is about 3,035.59 and the median is 3,023, so the distribution is centered around roughly 3,000 and is fairly balanced around the middle. The standard deviation is 975.13 and the coefficient of variation is 0.32, which shows moderate relative variability compared with its average level.
## Independent Variables: % Rural ranges from 0.58 to 100, with a mean of 63.51 and a median of 67.27. That suggests most observations are fairly rural, but the dataset still includes some fully urban or nearly urban places, so there is strong geographic diversity. % 65 and Over ranges from 9.24 to 45.26, with a mean of 18.87 and a median of 18.10. This suggests the typical area has about one-fifth of its population aged 65+, and the variation is moderate, with some places much older than others. % Food Insecure ranges from 10.60 to 28.50, with a mean of 17.33 and a median of 16.90. This indicates that food insecurity is present in every observation and differs meaningfully across areas, though not as dramatically as the rural measure. % Vaccinated ranges from 13 to 54, with a mean of 35.67 and a median of 36. This shows vaccination coverage varies widely across the sample, but the center is around the mid-30s. % with Annual Mammogram ranges from 14 to 55, with a mean of 34.67 and a median of 36. This suggests screening rates are also quite variable, but the average area has roughly one-third of eligible women receiving annual mammograms.
## Overall, the descriptive statistics for the dependent and independent variables show a dataset with complete numeric data and meaningful variation across all variables. The dependent variable has substantial spread, which is useful for modeling because there is enough variation to explain. The independent variables capture structural and preventive health factors, and their ranges suggest the sample includes places that differ a lot in rurality, age structure, food security, and screening or vaccination behavior. Thus, the table proved by RStudio supports a study of whether community characteristics and preventive care patterns are associated with avoidable hospitalizations.
Fit the Model, Linear Regression
model <- lm (`Preventable Hospitalization Rate` ~ `% Rural` + `% 65 and Over` + `% Food Insecure` + `% Vaccinated` + `% with Annual Mammogram`, data = combined_df)
summary(model)
##
## Call:
## lm(formula = `Preventable Hospitalization Rate` ~ `% Rural` +
## `% 65 and Over` + `% Food Insecure` + `% Vaccinated` + `% with Annual Mammogram`,
## data = combined_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2394.72 -569.37 -60.89 569.63 2747.51
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2910.867 637.371 4.567 8.02e-06 ***
## `% Rural` 2.653 2.345 1.131 0.25907
## `% 65 and Over` -39.506 14.188 -2.784 0.00580 **
## `% Food Insecure` 38.970 23.799 1.638 0.10288
## `% Vaccinated` 24.788 9.461 2.620 0.00937 **
## `% with Annual Mammogram` -24.747 11.786 -2.100 0.03683 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 939.3 on 233 degrees of freedom
## Multiple R-squared: 0.09155, Adjusted R-squared: 0.07206
## F-statistic: 4.696 on 5 and 233 DF, p-value: 0.0004178
Check Assumptions
## Linearity Assumption
plot(model, which = 1)

## Normality of Residuals
plot(model, which = 2)

## QQ Plot
qqnorm(resid(model))
qqline(resid(model), col= "red")

## Formal Test
shapiro.test(model$residuals) # Shapiro-Wilk normality test
##
## Shapiro-Wilk normality test
##
## data: model$residuals
## W = 0.99319, p-value = 0.3451
bptest(model) # Breusch–Pagan test
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 22.071, df = 5, p-value = 0.0005076
## Homoscedasticity (Scale-Location Plot)
plot(model, which = 3)

## Independence of Errors
durbinWatsonTest(model) # Durbin-Watson test
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01995222 2.039256 0.772
## Alternative hypothesis: rho != 0
## Multicollinearity (IVs not overly correlated)
vif(model)
## `% Rural` `% 65 and Over` `% Food Insecure`
## 1.772142 1.653112 1.324183
## `% Vaccinated` `% with Annual Mammogram`
## 2.015996 2.221236
cor(combined_df[, c("Preventable Hospitalization Rate", "% Rural", "% 65 and Over", "% Food Insecure", "% Vaccinated", "% with Annual Mammogram")], use = "complete.obs")
## Preventable Hospitalization Rate % Rural
## Preventable Hospitalization Rate 1.00000000 -0.08814304
## % Rural -0.08814304 1.00000000
## % 65 and Over -0.19924257 0.56778173
## % Food Insecure 0.12834961 0.02080714
## % Vaccinated 0.06075425 -0.36031667
## % with Annual Mammogram -0.13352325 -0.13297853
## % 65 and Over % Food Insecure % Vaccinated
## Preventable Hospitalization Rate -0.19924257 0.12834961 0.06075425
## % Rural 0.56778173 0.02080714 -0.36031667
## % 65 and Over 1.00000000 0.09892629 -0.08028886
## % Food Insecure 0.09892629 1.00000000 -0.23855757
## % Vaccinated -0.08028886 -0.23855757 1.00000000
## % with Annual Mammogram 0.11303549 -0.44145581 0.65261658
## % with Annual Mammogram
## Preventable Hospitalization Rate -0.1335233
## % Rural -0.1329785
## % 65 and Over 0.1130355
## % Food Insecure -0.4414558
## % Vaccinated 0.6526166
## % with Annual Mammogram 1.0000000
## Influential Observations / Outliers
plot(model, which = 4) # Cook’s Distance

## Rainbow test
raintest(model)
##
## Rainbow test
##
## data: model
## Rain = 0.86188, df1 = 120, df2 = 113, p-value = 0.7888
## This study checked the main linear regression assumptions using residual plots and formal tests. Specifically, the residuals vs. Fitted plot for linearity, the Q-Q plot plus the Shapiro-Wilk test for normality, the scale-location plot plus the Breusch-Pagan test for homoscedasticity, the Durbin-Watson test for Independence of errors, VIF for multicollinearity, and Cook’s distance/outlier diagnostics for influential points.
## The Shapiro-Wilk test has a W = 0.99319, p-value = 0.3451, so the residuals do not significantly depart from normality. That means the normality assumption is reasonably met. The Breusch-Pagan test and the diagnostic plots appear to show no strong fan shame, so homoscedasticity appears acceptable based on the visual diagnostics in the document. The Durbin-Watson test was used to check independence of errors and it showed that there is no indication that autocorrelation was a problem. The histograms suggest the outcome is roughly bell0shaped, while some predictors are skewed but still usable in a linear model. The regression model also appears to be valid overall because it shows an F-statistic of 4.696 with p-value = 0.0004178, which means the model explains a statistically significant amount of variation in preventable hospitalization rate. The VIF check suggests that the predictors were not overly collinear, so multicollinearity does not appear to be a major issue. Moreover, the rainbow test result supports linearity. With p-value = 0.7888, the result strongly suggests that the relationship is adequately linear. In sum, the data and model meet the assumptions for linear regression. The pattern appears acceptable, with no strong curved structure indicating a clear violation. In sum, the data and model meet the assumptions for linear regression. The pattern appears acceptable, with no strong curved structure indicating a clear violation.