This is the data cleaning file for generating ipw for the CQC locations selecting into Skills for Care data set. The overall steps is as follows
select_inselect_in and
calculate the ipwestablishment_id and
ipwlibrary(tidyverse) # package for data cleaning and plotting
library(readxl) # package for reading excel file
library(broom) # extracting model summary as data frame
library(modelsummary) # deriving model tables
library(scales) # label percent
library(lubridate) # working with dates
# environment setup to run ordered logit properly
options(contrasts = rep("contr.treatment", 2))
#import the raw data file
cqc_skills <- read_excel("data/SfC_CQC.xlsx")
skills_raw <- read_csv("data/Skills_raw.csv")
#import the cleaned skills for care data
cleaned_cqc_skills <- read_csv("data/cqc_sfc_cleaned.csv")
#select the CQC data that has Skills for Care establishment id
cqc <- cqc_skills %>%
mutate(select_in = ifelse(!is.na(establishmentid), 1, 0)) %>%
mutate(inherited = ifelse(inherited == "Y", TRUE, FALSE),
rating = recode(rating,
"Insufficient evidence to rate" = "NA",
"Requires improvement" = "Req improv")) %>%
# set the order of the values in the factors
mutate(form = fct_relevel(form, "FPO"),
# assume the order of the ratings as follows but need to double check with the source
rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding"))) %>%
# adding the rating data coded as numerical
mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
rating == "Req improv" ~ 2,
rating == "Good" ~ 3,
rating == "Outstanding" ~ 4)) %>%
mutate(category = case_when(primary_cat == "Community based adult social care services" ~ "community",
primary_cat == "Residential social care" ~ "residential",
TRUE ~ as.character(NA)),
# deriving year column and dummy variable for before_covid
year = year(publication_date),
Year = factor(year),
after_covid = ifelse(year >= 2020, TRUE, FALSE),
before_covid = ifelse(year <= 2019, TRUE, FALSE))
head(cqc)
## # A tibble: 6 × 18
## form cic_type care_home primary_cat region service_group domain rating
## <fct> <chr> <chr> <chr> <chr> <chr> <chr> <ord>
## 1 FPO NA Y Residential socia… East … Overall Safe Good
## 2 FPO NA Y Residential socia… East … Overall Effec… Good
## 3 FPO NA Y Residential socia… East … Overall Caring Good
## 4 FPO NA Y Residential socia… East … Overall Respo… Good
## 5 FPO NA Y Residential socia… East … Overall Well-… Good
## 6 FPO NA Y Residential socia… East … Overall Overa… Good
## # ℹ 10 more variables: publication_date <dttm>, inherited <lgl>,
## # establishmentid <dbl>, select_in <dbl>, rating_num <dbl>, category <chr>,
## # year <dbl>, Year <fct>, after_covid <lgl>, before_covid <lgl>
cqc_overall <- cqc %>%
filter(domain == "Overall")
model_select <- glm(select_in ~ form + category + region +
publication_date + inherited + rating_num,
data = cqc_overall,
family = binomial(link = "logit"))
tidy(model_select)
## # A tibble: 18 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -7.20e- 1 9.95e- 1 -0.724 4.69e- 1
## 2 formCIC 5.31e- 1 2.27e- 1 2.34 1.92e- 2
## 3 formGOV 2.59e+ 0 1.92e- 1 13.5 1.04e-41
## 4 formIND -5.19e- 1 7.81e- 2 -6.65 2.92e-11
## 5 formNA 1.08e- 1 8.18e- 1 0.132 8.95e- 1
## 6 formNPO 7.82e- 1 5.24e- 2 14.9 2.12e-50
## 7 categoryresidential -5.24e- 2 3.92e- 2 -1.34 1.81e- 1
## 8 regionEast of England 1.56e- 1 7.96e- 2 1.95 5.06e- 2
## 9 regionLondon -3.70e- 1 8.17e- 2 -4.53 5.99e- 6
## 10 regionNorth East 1.17e- 1 1.07e- 1 1.09 2.74e- 1
## 11 regionNorth West 1.19e- 1 7.82e- 2 1.52 1.29e- 1
## 12 regionSouth East 8.48e- 2 7.21e- 2 1.18 2.40e- 1
## 13 regionSouth West 2.59e- 1 7.85e- 2 3.30 9.57e- 4
## 14 regionWest Midlands 5.33e- 2 7.93e- 2 0.672 5.01e- 1
## 15 regionYorkshire and The Humber 1.00e- 1 8.33e- 2 1.20 2.28e- 1
## 16 publication_date -1.51e-10 6.00e-10 -0.252 8.01e- 1
## 17 inheritedTRUE 4.40e- 1 7.86e- 2 5.60 2.13e- 8
## 18 rating_num 2.67e- 1 4.24e- 2 6.31 2.85e-10
ipw_data <- augment_columns(model_select,
cqc_overall,
type.predict = "response") %>%
rename(propensity = .fitted) %>%
mutate(ipw = (select_in / propensity) + (1 - select_in) / (1 - propensity)) %>%
select(establishmentid, ipw) %>%
filter(establishmentid != "NA") %>%
group_by(establishmentid) %>%
slice(1)
model_select_short <- glm(select_in ~ form + rating_num,
data = cqc_overall,
family = binomial(link = "logit"))
tidy(model_select_short)
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.952 0.113 -8.42 3.88e-17
## 2 formCIC 0.534 0.225 2.38 1.74e- 2
## 3 formGOV 2.60 0.191 13.6 4.28e-42
## 4 formIND -0.538 0.0772 -6.97 3.22e-12
## 5 formNA 0.145 0.818 0.177 8.59e- 1
## 6 formNPO 0.779 0.0518 15.0 3.45e-51
## 7 rating_num 0.285 0.0387 7.36 1.87e-13
ipw_data_short <- augment_columns(model_select_short,
cqc_overall,
type.predict = "response") %>%
rename(propensity = .fitted) %>%
mutate(ipw_short = (select_in / propensity) + (1 - select_in) / (1 - propensity)) %>%
select(establishmentid, ipw_short) %>%
filter(establishmentid != "NA") %>%
group_by(establishmentid) %>%
slice(1)
ipw_data <- ipw_data %>%
left_join(ipw_data_short, by = "establishmentid")
cqc_overall <- cqc_overall %>%
left_join(ipw_data, by = "establishmentid")
cleaned_sfc <- cleaned_cqc_skills %>%
left_join(ipw_data, by = "establishmentid") %>%
mutate(inherited = ifelse(inherited == "Y", TRUE, FALSE),
rating = recode(rating,
"Insufficient evidence to rate" = "NA",
"Requires improvement" = "Req improv")) %>%
# set the order of the values in the factors
mutate(form = fct_relevel(form, "FPO"),
# assume the order of the ratings as follows but need to double check with the source
rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding"))) %>%
# adding the rating data coded as numerical
mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
rating == "Req improv" ~ 2,
rating == "Good" ~ 3,
rating == "Outstanding" ~ 4)) %>%
mutate(category = case_when(primary_cat == "Community based adult social care services" ~ "community",
primary_cat == "Residential social care" ~ "residential",
TRUE ~ as.character(NA)),
# deriving year column and dummy variable for before_covid
year = year(publication_date),
Year = factor(year),
after_covid = ifelse(year >= 2020, TRUE, FALSE),
before_covid = ifelse(year <= 2019, TRUE, FALSE))
# write.csv(cleaned_sfc, file = "data/cleaned_sfc_ipw.csv")
ipw_for_graphing <- cqc_overall %>%
mutate(SfC = ifelse(select_in == 1, TRUE, FALSE),
weighted_rating_num = rating_num * ipw,
weighted_rating_num_short = rating_num * ipw_short)
datasummary(form * (1 + rating + rating_num + weighted_rating_num+ weighted_rating_num_short) ~ (1 + SfC ) *(1 + Mean),
data = ipw_for_graphing,
fmt = 2)
| NA | SfC | |||||||
|---|---|---|---|---|---|---|---|---|
| All | FALSE | TRUE | ||||||
| form | All | Mean | All | Mean | All | Mean | ||
| FPO | All | 10698.00 | 5740.00 | 4958.00 | ||||
| rating | Inadequate | 129.00 | 81.00 | 48.00 | ||||
| Req improv | 1481.00 | 875.00 | 606.00 | |||||
| Good | 7617.00 | 4011.00 | 3606.00 | |||||
| Outstanding | 453.00 | 198.00 | 255.00 | |||||
| rating_num | 10698.00 | 2.87 | 5740.00 | 2.84 | 4958.00 | 2.90 | ||
| weighted_rating_num | 10698.00 | 6.14 | 5740.00 | 4958.00 | 6.14 | |||
| weighted_rating_num_short | 10698.00 | 6.14 | 5740.00 | 4958.00 | 6.14 | |||
| CIC | All | 84.00 | 33.00 | 51.00 | ||||
| rating | Inadequate | 0.00 | 0.00 | 0.00 | ||||
| Req improv | 8.00 | 3.00 | 5.00 | |||||
| Good | 68.00 | 28.00 | 40.00 | |||||
| Outstanding | 8.00 | 2.00 | 6.00 | |||||
| rating_num | 84.00 | 3.00 | 33.00 | 2.97 | 51.00 | 3.02 | ||
| weighted_rating_num | 84.00 | 4.88 | 33.00 | 51.00 | 4.88 | |||
| weighted_rating_num_short | 84.00 | 4.93 | 33.00 | 51.00 | 4.93 | |||
| GOV | All | 405.00 | 30.00 | 375.00 | ||||
| rating | Inadequate | 0.00 | 0.00 | 0.00 | ||||
| Req improv | 38.00 | 3.00 | 35.00 | |||||
| Good | 331.00 | 26.00 | 305.00 | |||||
| Outstanding | 20.00 | 1.00 | 19.00 | |||||
| rating_num | 405.00 | 2.95 | 30.00 | 2.93 | 375.00 | 2.96 | ||
| weighted_rating_num | 405.00 | 3.20 | 30.00 | 375.00 | 3.20 | |||
| weighted_rating_num_short | 405.00 | 3.20 | 30.00 | 375.00 | 3.20 | |||
| IND | All | 859.00 | 576.00 | 283.00 | ||||
| rating | Inadequate | 12.00 | 10.00 | 2.00 | ||||
| Req improv | 140.00 | 103.00 | 37.00 | |||||
| Good | 640.00 | 419.00 | 221.00 | |||||
| Outstanding | 21.00 | 8.00 | 13.00 | |||||
| rating_num | 859.00 | 2.82 | 576.00 | 2.79 | 283.00 | 2.90 | ||
| weighted_rating_num | 859.00 | 8.38 | 576.00 | 283.00 | 8.38 | |||
| weighted_rating_num_short | 859.00 | 8.46 | 576.00 | 283.00 | 8.46 | |||
| NA | All | 6.00 | 3.00 | 3.00 | ||||
| rating | Inadequate | 0.00 | 0.00 | 0.00 | ||||
| Req improv | 1.00 | 0.00 | 1.00 | |||||
| Good | 5.00 | 3.00 | 2.00 | |||||
| Outstanding | 0.00 | 0.00 | 0.00 | |||||
| rating_num | 6.00 | 2.83 | 3.00 | 3.00 | 3.00 | 2.67 | ||
| weighted_rating_num | 6.00 | 5.43 | 3.00 | 3.00 | 5.43 | |||
| weighted_rating_num_short | 6.00 | 5.42 | 3.00 | 3.00 | 5.42 | |||
| NPO | All | 2066.00 | 703.00 | 1363.00 | ||||
| rating | Inadequate | 12.00 | 5.00 | 7.00 | ||||
| Req improv | 178.00 | 71.00 | 107.00 | |||||
| Good | 1685.00 | 568.00 | 1117.00 | |||||
| Outstanding | 105.00 | 28.00 | 77.00 | |||||
| rating_num | 2066.00 | 2.95 | 703.00 | 2.92 | 1363.00 | 2.97 | ||
| weighted_rating_num | 2066.00 | 4.47 | 703.00 | 1363.00 | 4.47 | |||
| weighted_rating_num_short | 2066.00 | 4.47 | 703.00 | 1363.00 | 4.47 | |||
sfc_count <- ipw_for_graphing %>%
filter(select_in == TRUE) %>%
group_by(form, rating) %>%
summarize(SfC = n(),
SfC_weighted = sum(ipw, na.rm = TRUE),
SfC_weighted_short = sum(ipw_short, na.rm = TRUE)) %>%
ungroup() %>%
filter(form!="NA")
sfc_count
## # A tibble: 22 × 5
## form rating SfC SfC_weighted SfC_weighted_short
## <fct> <ord> <int> <dbl> <dbl>
## 1 FPO Inadequate 48 141. 142.
## 2 FPO Req improv 606 1486. 1492.
## 3 FPO Good 3606 7588. 7581.
## 4 FPO Outstanding 255 462. 466.
## 5 FPO <NA> 443 451. 445.
## 6 CIC Req improv 5 9.56 9.29
## 7 CIC Good 40 65.0 65.8
## 8 CIC Outstanding 6 8.69 8.81
## 9 GOV Req improv 35 38.8 38.8
## 10 GOV Good 305 329. 329.
## # ℹ 12 more rows
cqc_count <- ipw_for_graphing %>%
group_by(form, rating) %>%
summarize(CQC = n()) %>%
left_join(sfc_count, by = c("form", "rating")) %>%
ungroup() %>%
filter(form!="NA") %>%
mutate(ratio = SfC/CQC) %>%
ungroup()
## `summarise()` has grouped output by 'form'. You can override using the
## `.groups` argument.
cqc_count
## # A tibble: 22 × 7
## form rating CQC SfC SfC_weighted SfC_weighted_short ratio
## <fct> <ord> <int> <int> <dbl> <dbl> <dbl>
## 1 FPO Inadequate 129 48 141. 142. 0.372
## 2 FPO Req improv 1481 606 1486. 1492. 0.409
## 3 FPO Good 7617 3606 7588. 7581. 0.473
## 4 FPO Outstanding 453 255 462. 466. 0.563
## 5 FPO <NA> 1018 443 451. 445. 0.435
## 6 CIC Req improv 8 5 9.56 9.29 0.625
## 7 CIC Good 68 40 65.0 65.8 0.588
## 8 CIC Outstanding 8 6 8.69 8.81 0.75
## 9 GOV Req improv 38 35 38.8 38.8 0.921
## 10 GOV Good 331 305 329. 329. 0.921
## # ℹ 12 more rows
sfc_joint <- ipw_for_graphing %>%
filter(select_in == TRUE) %>%
group_by(form) %>%
summarize(SfC = n())
cqc_joint <- ipw_for_graphing %>%
group_by(form) %>%
summarize(CQC = n()) %>%
left_join(sfc_joint, by = "form") %>%
ungroup() %>%
filter(form!="NA") %>%
mutate(ratio = SfC/CQC)
cqc_joint
## # A tibble: 5 × 4
## form CQC SfC ratio
## <fct> <int> <int> <dbl>
## 1 FPO 10698 4958 0.463
## 2 CIC 84 51 0.607
## 3 GOV 405 375 0.926
## 4 IND 859 283 0.329
## 5 NPO 2066 1363 0.660
model_simple <- lm(rating_num ~ form + category + Year + region,
data = filter(cleaned_sfc, domain == "Overall"),
weights = ipw)
tidy(model_simple)
## # A tibble: 19 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.05 0.119 25.6 1.27e-137
## 2 formCIC 0.0435 0.0672 0.647 5.18e- 1
## 3 formGOV 0.0302 0.0316 0.956 3.39e- 1
## 4 formIND -0.0296 0.0225 -1.31 1.89e- 1
## 5 formNPO 0.0400 0.0151 2.64 8.26e- 3
## 6 categoryresidential -0.0801 0.0115 -6.94 4.42e- 12
## 7 Year2017 0.0558 0.119 0.468 6.40e- 1
## 8 Year2018 0.0542 0.118 0.458 6.47e- 1
## 9 Year2019 -0.0751 0.118 -0.636 5.25e- 1
## 10 Year2020 -0.201 0.119 -1.69 9.01e- 2
## 11 Year2021 -0.571 0.119 -4.81 1.58e- 6
## 12 regionEast of England -0.0220 0.0236 -0.931 3.52e- 1
## 13 regionLondon -0.0365 0.0241 -1.52 1.29e- 1
## 14 regionNorth East 0.116 0.0312 3.73 1.96e- 4
## 15 regionNorth West 0.0196 0.0232 0.847 3.97e- 1
## 16 regionSouth East -0.00191 0.0213 -0.0893 9.29e- 1
## 17 regionSouth West 0.0772 0.0232 3.33 8.79e- 4
## 18 regionWest Midlands -0.0476 0.0236 -2.01 4.40e- 2
## 19 regionYorkshire and The Humber -0.0696 0.0245 -2.84 4.57e- 3
model_cqc <- lm(rating_num ~ form + category + Year + region,
data = filter(cqc, domain == "Overall"))
tidy(model_cqc)
## # A tibble: 20 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.99 0.0681 43.9 0
## 2 formCIC 0.0581 0.0471 1.23 2.18e- 1
## 3 formGOV 0.0297 0.0223 1.33 1.82e- 1
## 4 formIND -0.0610 0.0158 -3.87 1.10e- 4
## 5 formNA -0.00593 0.175 -0.0338 9.73e- 1
## 6 formNPO 0.0361 0.0107 3.39 7.08e- 4
## 7 categoryresidential -0.0680 0.00812 -8.37 6.21e-17
## 8 Year2017 0.0677 0.0684 0.990 3.22e- 1
## 9 Year2018 0.0626 0.0674 0.929 3.53e- 1
## 10 Year2019 -0.0546 0.0672 -0.812 4.17e- 1
## 11 Year2020 -0.182 0.0676 -2.69 7.07e- 3
## 12 Year2021 -0.542 0.0679 -7.99 1.44e-15
## 13 regionEast of England 0.0343 0.0166 2.07 3.86e- 2
## 14 regionLondon 0.0162 0.0168 0.960 3.37e- 1
## 15 regionNorth East 0.130 0.0221 5.90 3.73e- 9
## 16 regionNorth West 0.0469 0.0163 2.89 3.90e- 3
## 17 regionSouth East 0.0420 0.0150 2.80 5.04e- 3
## 18 regionSouth West 0.114 0.0163 7.01 2.44e-12
## 19 regionWest Midlands -0.0362 0.0165 -2.19 2.87e- 2
## 20 regionYorkshire and The Humber -0.0358 0.0173 -2.07 3.84e- 2