#Loading packages and setting working directory
library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lmtest)

## Warning: package 'lmtest' was built under R version 4.5.2

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.2     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

setwd("C:/Users/andie/Downloads")
NH_ProviderInfo_Jul2025 <- read_excel("NH_ProviderInfo_Jul2025.xlsx")

#Selecting data I am interested in exploring further
nursing.home<- NH_ProviderInfo_Jul2025 |> select("Total Weighted Health Survey Score", "Ownership Type", "Number of Facilities in Chain", "Total nursing staff turnover")

#Changing "Ownership Type" from categorical to binary value. I am creating three categories: for profit, government, and nonprofit.
nursing.home2 <- nursing.home |> mutate(Ownership_Type_Num = case_when(
`Ownership Type` %in% c(
"For profit - Corporation",
"For profit - Individual",
"For profit - Limited Liability company",
"For profit - Partnership"
) ~ 1,
`Ownership Type` %in% c(
"Government - City",
"Government - City/county",
"Government - County",
"Government - Federal",
"Government - Hospital district",
"Government - State"
) ~ 2,
`Ownership Type` %in% c(
"Non profit - Church related",
"Non profit - Corporation",
"Non profit - Other"
) ~ 3,
TRUE ~ NA_real_
)
)
#Now converting this into a binary variable:
nursing.home3 <- nursing.home2 |>
  mutate(
    FOR_PROFIT = ifelse(Ownership_Type_Num == 1, 1, 0),
    GOVERNMENT = ifelse(Ownership_Type_Num == 2, 1, 0),
    NON_PROFIT = ifelse(Ownership_Type_Num == 3, 1, 0)
  )

#I'll tidy the data a bit by removing the "Ownership Type" and "Ownership_Type_Num" variables: 
nursing.home4 <- nursing.home3 |> select("Total Weighted Health Survey Score", "Number of Facilities in Chain", "Total nursing staff turnover", "FOR_PROFIT", "GOVERNMENT", "NON_PROFIT")

Before proceeding, I want to examine the “Number of Facilities in Chain” variable:

summary(nursing.home4$`Number of Facilities in Chain`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2.00   12.00   26.00   53.92   64.00  303.00    4685

The “Number of Facilities in Chain” has 4,685 NA values. These NA values represent non-chain nursing homes, or where the number of facilities in chain = 1. For my analysis, I will change the NA values to 1 to represent non-chain facilities.

nursing.home5 <- nursing.home4 |> replace_na(list("Number of Facilities in Chain" = 1))
summary(nursing.home5$`Number of Facilities in Chain`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00   13.00   37.11   39.00  303.00

#As a final step, I am removing NA values from the data. Total Weighted Health Survey Score has 56 missing values, and Total nursing staff turnover has 1105.

nursing.home.clean <- nursing.home5 |> filter(!is.na(`Total Weighted Health Survey Score`), !is.na(`Total nursing staff turnover`))

Linear Model

I will use the Total Weighted Health Survey Score as my dependent variable. Number of facilities in chain, Total nursing staff turnover, and ownership type (for profit, government, or nonprofit) will be my independent variables. Let’s create a linear model:

model <-lm(`Total Weighted Health Survey Score`~`Number of Facilities in Chain`+`Total nursing staff turnover`+FOR_PROFIT+GOVERNMENT+NON_PROFIT,data=nursing.home.clean)

Let’s visualize how this looks:

plot(model)

It does not appear linear. We can check with math:

raintest(model)

## 
##  Rainbow test
## 
## data:  model
## Rain = 1.4504, df1 = 6820, df2 = 6813, p-value < 2.2e-16

The data is not linear.

library(car)

## Warning: package 'car' was built under R version 4.5.2

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.5.2

## 
## Attaching package: 'car'

## The following object is masked from 'package:purrr':
## 
##     some

## The following object is masked from 'package:dplyr':
## 
##     recode

durbinWatsonTest(model)

##  lag Autocorrelation D-W Statistic p-value
##    1        0.108564      1.782862       0
##  Alternative hypothesis: rho != 0

A p-value less than .05 means the errors are not independent.

bptest(model)

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 146.46, df = 4, p-value < 2.2e-16

The model is heteroscedastic.

ks.test(model$residuals, "pnorm")

## Warning in ks.test.default(model$residuals, "pnorm"): ties should not be
## present for the one-sample Kolmogorov-Smirnov test

## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  model$residuals
## D = 0.63746, p-value < 2.2e-16
## alternative hypothesis: two-sided

The residuals are not normal.

Also, I can not run the vif command because I have variables with perfect multicollarity. I will need to address this and attempt to mitigate the non-normality of my data.

#Removing the government binary variable from my data set: 
nursing.home6 <- nursing.home5 |> select("Total Weighted Health Survey Score", "Number of Facilities in Chain", "Total nursing staff turnover", "FOR_PROFIT", "NON_PROFIT")

#Removing NA values from the data; Total Weighted Health Survey Score has 56 missing values, and Total nursing staff turnover has 1105.
nursing.home.clean2 <- nursing.home6 |> filter(!is.na(`Total Weighted Health Survey Score`), !is.na(`Total nursing staff turnover`))

Next, I need to check if my variables Total Health Survey Score, Number of Facilities in Chain, and Total nursing staff turnover have normal distributions.

hist(nursing.home.clean2$`Total Weighted Health Survey Score`)

hist(nursing.home.clean2$`Number of Facilities in Chain`)

hist(nursing.home.clean2$`Total nursing staff turnover`)

The Total Health Survey Score and Number of Facilities in Chain do not have normal distributions. Total nursing staff turnover has a fairly normal distribution. To make my data more normal, I will use a square root transformation on The Total Health Survey Score and Number of Certified Beds. From previous homework, I know that the Total Health Survey Score contains the score of 0; therefore, I cannot use a log transformation.

nursing.home.transformed <- nursing.home.clean2 |> mutate(across(c(`Total Weighted Health Survey Score`, `Number of Facilities in Chain`), ~ sqrt(.x), .names = "{.col}_sqrt"))

Now let’s look at how the variables look:

hist(nursing.home.transformed$`Total Weighted Health Survey Score_sqrt`)

hist(nursing.home.transformed$`Number of Facilities in Chain_sqrt`)

More normal, but still not normal. Let’s see if it meets assumptions of a linear model now.

#I am going to create another dataframe that drops the non-transformed Total Health Survey Score and Number of Certified Beds.
nursing.home.transformed2 <- nursing.home.transformed |> select(FOR_PROFIT, NON_PROFIT, "Total nursing staff turnover", "Number of Facilities in Chain_sqrt", "Total Weighted Health Survey Score_sqrt")

#Next, I'll create a new linear model:
model.2 <- lm(`Total Weighted Health Survey Score_sqrt`~`Total nursing staff turnover`+FOR_PROFIT+NON_PROFIT+`Number of Facilities in Chain_sqrt`, data=nursing.home.transformed2)

Tests–

raintest(model.2)

## 
##  Rainbow test
## 
## data:  model.2
## Rain = 1.2989, df1 = 6820, df2 = 6814, p-value < 2.2e-16

durbinWatsonTest(model.2)

##  lag Autocorrelation D-W Statistic p-value
##    1       0.1506703      1.698633       0
##  Alternative hypothesis: rho != 0

bptest(model.2)

## 
##  studentized Breusch-Pagan test
## 
## data:  model.2
## BP = 178.65, df = 4, p-value < 2.2e-16

ks.test(model.2$residuals, "pnorm")

## Warning in ks.test.default(model.2$residuals, "pnorm"): ties should not be
## present for the one-sample Kolmogorov-Smirnov test

## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  model.2$residuals
## D = 0.32515, p-value < 2.2e-16
## alternative hypothesis: two-sided

vif(model.2)

##       `Total nursing staff turnover`                           FOR_PROFIT 
##                             1.025007                             3.427019 
##                           NON_PROFIT `Number of Facilities in Chain_sqrt` 
##                             3.371102                             1.107229

My model is not linear, the errors are not independent, the model is heteroscedastic, and the residuals are not normal. However, my model does meet the assumption of no multicollinearity.

Generalized Linear Model

nursing.home.glm <- glm(`Total Weighted Health Survey Score`~`Number of Facilities in Chain`+`Total nursing staff turnover`+FOR_PROFIT+NON_PROFIT,data=nursing.home.clean2, family = "gaussian")

summary(nursing.home.glm)

## 
## Call:
## glm(formula = `Total Weighted Health Survey Score` ~ `Number of Facilities in Chain` + 
##     `Total nursing staff turnover` + FOR_PROFIT + NON_PROFIT, 
##     family = "gaussian", data = nursing.home.clean2)
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2.815820   4.038978   0.697   0.4857    
## `Number of Facilities in Chain` -0.006149   0.013046  -0.471   0.6374    
## `Total nursing staff turnover`   1.382603   0.055133  25.078  < 2e-16 ***
## FOR_PROFIT                      25.526445   3.347589   7.625 2.59e-14 ***
## NON_PROFIT                      -6.539697   3.676829  -1.779   0.0753 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 8633.608)
## 
##     Null deviance: 126856869  on 13638  degrees of freedom
## Residual deviance: 117710613  on 13634  degrees of freedom
## AIC: 162329
## 
## Number of Fisher Scoring iterations: 2

#Bootleg p-value test
126856869-117710613

## [1] 9146256

13638-13634

## [1] 4

9146256 is greater than 4, suggesting that the model is signficant.

Creating Graphics and Descriptive Statistics

ggplot(nursing.home.clean2, aes(x=`Number of Facilities in Chain`))+geom_histogram(bins=15)+labs(title="Number of Facilities in Chain", subtitle="From the CMS Provider Information Dataset")

ggplot(nursing.home.clean2, aes(x=`Total nursing staff turnover`))+geom_histogram(bins=15)+labs(title="Total Nursing Staff Turnover", subtitle="From the CMS Provider Information Dataset")

ggplot(nursing.home.clean2, aes(x=`Total Weighted Health Survey Score`))+geom_histogram(bins=15)+labs(title="Total Weighted Health Survey Score", subtitle="From the CMS Provider Information Dataset")

#Counting number of for-profit, non-profit, and government nursing homes
nursing.home4 |> count(NON_PROFIT)

## # A tibble: 2 × 2
##   NON_PROFIT     n
##        <dbl> <int>
## 1          0 11777
## 2          1  2975

nursing.home4 |> count(FOR_PROFIT)

## # A tibble: 2 × 2
##   FOR_PROFIT     n
##        <dbl> <int>
## 1          0  3936
## 2          1 10816

nursing.home4 |> count(GOVERNMENT)

## # A tibble: 2 × 2
##   GOVERNMENT     n
##        <dbl> <int>
## 1          0 13791
## 2          1   961

summary(nursing.home.clean2$`Total nursing staff turnover`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.60   36.70   46.30   46.88   56.50  100.00

summary(nursing.home.clean2$`Total Weighted Health Survey Score`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   28.00   56.00   85.04  107.00 1505.25

summary(nursing.home.clean2$`Number of Facilities in Chain`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00   14.00   38.77   41.00  303.00

#Testing testing
nursing.home.box.plot <- nursing.home |> 
  mutate(
    Ownership_Type_Condensed = case_when(
      `Ownership Type` %in% c(
        "For profit - Corporation",
        "For profit - Individual",
        "For profit - Limited Liability company",
        "For profit - Partnership"
      ) ~ "For-Profit",
      `Ownership Type` %in% c(
        "Government - City",
        "Government - City/county",
        "Government - County",
        "Government - Federal",
        "Government - Hospital district",
        "Government - State"
      ) ~ "Government",
      `Ownership Type` %in% c(
        "Non profit - Church related",
        "Non profit - Corporation",
        "Non profit - Other"
      ) ~ "Non-Profit",
      TRUE ~ NA_character_
    )
  )

ggplot(nursing.home.box.plot, aes(x= Ownership_Type_Condensed, y=`Total Weighted Health Survey Score`))+ geom_boxplot() + labs(title= "Health Survey Scores by Ownership Type", x= "Ownership Type", y= "Total Weighted Health Survey Score")

## Warning: Removed 56 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Research Paper Draft

Andie

2025-11-16

Linear Model

Generalized Linear Model

Creating Graphics and Descriptive Statistics