This is the data cleaning file for generating ipw for the CQC locations selecting into Skills for Care data set. The overall steps is as follows
select_inselect_in and
calculate the ipwestablishment_id and
ipwlibrary(tidyverse) # package for data cleaning and plotting
library(readxl) # package for reading excel file
library(broom) # extracting model summary as data frame
library(modelsummary) # deriving model tables
library(scales) # label percent
library(lubridate) # working with dates
# environment setup to run ordered logit properly
options(contrasts = rep("contr.treatment", 2))
#import the raw data file
cqc_skills <- read_excel("data/SfC_CQC.xlsx")
skills_raw <- read_csv("data/Skills_raw.csv")
#import the cleaned skills for care data
cleaned_cqc_skills <- read_csv("data/cqc_sfc_cleaned.csv")
#select the CQC data that has Skills for Care establishment id
cqc <- cqc_skills %>%
mutate(select_in = ifelse(!is.na(establishmentid), 1, 0)) %>%
mutate(inherited = ifelse(inherited == "Y", TRUE, FALSE),
rating = recode(rating,
"Insufficient evidence to rate" = "NA",
"Requires improvement" = "Req improv")) %>%
# set the order of the values in the factors
mutate(form = fct_relevel(form, "FPO"),
# assume the order of the ratings as follows but need to double check with the source
rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding"))) %>%
# adding the rating data coded as numerical
mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
rating == "Req improv" ~ 2,
rating == "Good" ~ 3,
rating == "Outstanding" ~ 4)) %>%
mutate(category = case_when(primary_cat == "Community based adult social care services" ~ "community",
primary_cat == "Residential social care" ~ "residential",
TRUE ~ as.character(NA)),
# deriving year column and dummy variable for before_covid
year = year(publication_date),
Year = factor(year),
after_covid = ifelse(year >= 2020, TRUE, FALSE),
before_covid = ifelse(year <= 2019, TRUE, FALSE))
head(cqc)
## # A tibble: 6 × 18
## form cic_type care_home primary_cat region service_group domain rating
## <fct> <chr> <chr> <chr> <chr> <chr> <chr> <ord>
## 1 FPO NA Y Residential socia… East … Overall Safe Good
## 2 FPO NA Y Residential socia… East … Overall Effec… Good
## 3 FPO NA Y Residential socia… East … Overall Caring Good
## 4 FPO NA Y Residential socia… East … Overall Respo… Good
## 5 FPO NA Y Residential socia… East … Overall Well-… Good
## 6 FPO NA Y Residential socia… East … Overall Overa… Good
## # ℹ 10 more variables: publication_date <dttm>, inherited <lgl>,
## # establishmentid <dbl>, select_in <dbl>, rating_num <dbl>, category <chr>,
## # year <dbl>, Year <fct>, after_covid <lgl>, before_covid <lgl>
cqc_overall <- cqc %>%
filter(domain == "Overall")
model_select <- glm(select_in ~ form + category + region +
publication_date + inherited + rating_num,
data = cqc_overall,
family = binomial(link = "logit"))
tidy(model_select)
## # A tibble: 18 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -7.20e- 1 9.95e- 1 -0.724 4.69e- 1
## 2 formCIC 5.31e- 1 2.27e- 1 2.34 1.92e- 2
## 3 formGOV 2.59e+ 0 1.92e- 1 13.5 1.04e-41
## 4 formIND -5.19e- 1 7.81e- 2 -6.65 2.92e-11
## 5 formNA 1.08e- 1 8.18e- 1 0.132 8.95e- 1
## 6 formNPO 7.82e- 1 5.24e- 2 14.9 2.12e-50
## 7 categoryresidential -5.24e- 2 3.92e- 2 -1.34 1.81e- 1
## 8 regionEast of England 1.56e- 1 7.96e- 2 1.95 5.06e- 2
## 9 regionLondon -3.70e- 1 8.17e- 2 -4.53 5.99e- 6
## 10 regionNorth East 1.17e- 1 1.07e- 1 1.09 2.74e- 1
## 11 regionNorth West 1.19e- 1 7.82e- 2 1.52 1.29e- 1
## 12 regionSouth East 8.48e- 2 7.21e- 2 1.18 2.40e- 1
## 13 regionSouth West 2.59e- 1 7.85e- 2 3.30 9.57e- 4
## 14 regionWest Midlands 5.33e- 2 7.93e- 2 0.672 5.01e- 1
## 15 regionYorkshire and The Humber 1.00e- 1 8.33e- 2 1.20 2.28e- 1
## 16 publication_date -1.51e-10 6.00e-10 -0.252 8.01e- 1
## 17 inheritedTRUE 4.40e- 1 7.86e- 2 5.60 2.13e- 8
## 18 rating_num 2.67e- 1 4.24e- 2 6.31 2.85e-10
ipw_data <- augment_columns(model_select,
cqc_overall,
type.predict = "response") %>%
rename(propensity = .fitted) %>%
mutate(ipw = (select_in / propensity) + (1 - select_in) / (1 - propensity)) %>%
select(establishmentid, ipw) %>%
filter(establishmentid != "NA") %>%
group_by(establishmentid) %>%
slice(1)
cleaned_sfc <- cleaned_cqc_skills %>%
left_join(ipw_data, by = "establishmentid") %>%
mutate(inherited = ifelse(inherited == "Y", TRUE, FALSE),
rating = recode(rating,
"Insufficient evidence to rate" = "NA",
"Requires improvement" = "Req improv")) %>%
# set the order of the values in the factors
mutate(form = fct_relevel(form, "FPO"),
# assume the order of the ratings as follows but need to double check with the source
rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding"))) %>%
# adding the rating data coded as numerical
mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
rating == "Req improv" ~ 2,
rating == "Good" ~ 3,
rating == "Outstanding" ~ 4)) %>%
mutate(category = case_when(primary_cat == "Community based adult social care services" ~ "community",
primary_cat == "Residential social care" ~ "residential",
TRUE ~ as.character(NA)),
# deriving year column and dummy variable for before_covid
year = year(publication_date),
Year = factor(year),
after_covid = ifelse(year >= 2020, TRUE, FALSE),
before_covid = ifelse(year <= 2019, TRUE, FALSE))
head(cleaned_sfc)
## # A tibble: 6 × 89
## ...1 form cic_type care_home primary_cat region service_group
## <dbl> <fct> <chr> <chr> <chr> <chr> <chr>
## 1 1 FPO <NA> Y Residential social care East Mid… Overall
## 2 2 FPO <NA> Y Residential social care East Mid… Overall
## 3 3 FPO <NA> Y Residential social care East Mid… Overall
## 4 4 FPO <NA> Y Residential social care East Mid… Overall
## 5 5 FPO <NA> Y Residential social care East Mid… Overall
## 6 6 FPO <NA> Y Residential social care East Mid… Overall
## # ℹ 82 more variables: publication_date <date>, inherited <lgl>,
## # establishmentid <dbl>, ParentSubSingle <dbl>, isparent <dbl>,
## # estabcreateddate <chr>, Starters_Leavers_FILTER <dbl>, totalstarters <dbl>,
## # totalleavers <dbl>, totalstaff <dbl>, totalvacancies <dbl>, ORGWRKS <dbl>,
## # ORGWRKSGP <dbl>, esttype <dbl>, mainstid <dbl>, MAINSERGP1 <dbl>,
## # MAINSERGP2 <dbl>, jr28flag <dbl>, jr28perm <dbl>, jr28temp <dbl>,
## # jr28pool <dbl>, jr28agcy <dbl>, jr28oth <dbl>, jr28emp <dbl>, …
# write.csv(cleaned_sfc, file = "data/cleaned_sfc_ipw.csv")
ipw_for_graphing <- augment_columns(model_select,
cqc_overall,
type.predict = "response") %>%
rename(propensity = .fitted) %>%
mutate(ipw = (select_in / propensity) + (1 - select_in) / (1 - propensity)) %>%
mutate(SfC = ifelse(select_in == 1, TRUE, FALSE))
datasummary(form * (1 + rating + rating_num) ~ (1 + SfC ) *(1 + Mean),
data = ipw_for_graphing,
fmt = 2)
| NA | SfC | |||||||
|---|---|---|---|---|---|---|---|---|
| All | FALSE | TRUE | ||||||
| form | All | Mean | All | Mean | All | Mean | ||
| FPO | All | 9678.00 | 5164.00 | 4514.00 | ||||
| rating | Inadequate | 129.00 | 81.00 | 48.00 | ||||
| Req improv | 1480.00 | 875.00 | 605.00 | |||||
| Good | 7616.00 | 4010.00 | 3606.00 | |||||
| Outstanding | 453.00 | 198.00 | 255.00 | |||||
| rating_num | 9678.00 | 2.87 | 5164.00 | 2.84 | 4514.00 | 2.90 | ||
| CIC | All | 84.00 | 33.00 | 51.00 | ||||
| rating | Inadequate | 0.00 | 0.00 | 0.00 | ||||
| Req improv | 8.00 | 3.00 | 5.00 | |||||
| Good | 68.00 | 28.00 | 40.00 | |||||
| Outstanding | 8.00 | 2.00 | 6.00 | |||||
| rating_num | 84.00 | 3.00 | 33.00 | 2.97 | 51.00 | 3.02 | ||
| GOV | All | 388.00 | 30.00 | 358.00 | ||||
| rating | Inadequate | 0.00 | 0.00 | 0.00 | ||||
| Req improv | 38.00 | 3.00 | 35.00 | |||||
| Good | 330.00 | 26.00 | 304.00 | |||||
| Outstanding | 20.00 | 1.00 | 19.00 | |||||
| rating_num | 388.00 | 2.95 | 30.00 | 2.93 | 358.00 | 2.96 | ||
| IND | All | 813.00 | 540.00 | 273.00 | ||||
| rating | Inadequate | 12.00 | 10.00 | 2.00 | ||||
| Req improv | 140.00 | 103.00 | 37.00 | |||||
| Good | 640.00 | 419.00 | 221.00 | |||||
| Outstanding | 21.00 | 8.00 | 13.00 | |||||
| rating_num | 813.00 | 2.82 | 540.00 | 2.79 | 273.00 | 2.90 | ||
| NA | All | 6.00 | 3.00 | 3.00 | ||||
| rating | Inadequate | 0.00 | 0.00 | 0.00 | ||||
| Req improv | 1.00 | 0.00 | 1.00 | |||||
| Good | 5.00 | 3.00 | 2.00 | |||||
| Outstanding | 0.00 | 0.00 | 0.00 | |||||
| rating_num | 6.00 | 2.83 | 3.00 | 3.00 | 3.00 | 2.67 | ||
| NPO | All | 1979.00 | 671.00 | 1308.00 | ||||
| rating | Inadequate | 12.00 | 5.00 | 7.00 | ||||
| Req improv | 178.00 | 71.00 | 107.00 | |||||
| Good | 1685.00 | 568.00 | 1117.00 | |||||
| Outstanding | 104.00 | 27.00 | 77.00 | |||||
| rating_num | 1979.00 | 2.95 | 671.00 | 2.92 | 1308.00 | 2.97 | ||
sfc_count <- ipw_for_graphing %>%
filter(select_in == TRUE) %>%
group_by(form, rating) %>%
summarize(SfC = n())
## `summarise()` has grouped output by 'form'. You can override using the
## `.groups` argument.
cqc_count <- ipw_for_graphing %>%
group_by(form, rating) %>%
summarize(CQC = n()) %>%
left_join(sfc_count, by = c("form", "rating")) %>%
ungroup() %>%
filter(form!="NA") %>%
mutate(ratio = SfC/CQC)
## `summarise()` has grouped output by 'form'. You can override using the
## `.groups` argument.
cqc_count
## # A tibble: 18 × 5
## form rating CQC SfC ratio
## <fct> <ord> <int> <int> <dbl>
## 1 FPO Inadequate 129 48 0.372
## 2 FPO Req improv 1480 605 0.409
## 3 FPO Good 7616 3606 0.473
## 4 FPO Outstanding 453 255 0.563
## 5 CIC Req improv 8 5 0.625
## 6 CIC Good 68 40 0.588
## 7 CIC Outstanding 8 6 0.75
## 8 GOV Req improv 38 35 0.921
## 9 GOV Good 330 304 0.921
## 10 GOV Outstanding 20 19 0.95
## 11 IND Inadequate 12 2 0.167
## 12 IND Req improv 140 37 0.264
## 13 IND Good 640 221 0.345
## 14 IND Outstanding 21 13 0.619
## 15 NPO Inadequate 12 7 0.583
## 16 NPO Req improv 178 107 0.601
## 17 NPO Good 1685 1117 0.663
## 18 NPO Outstanding 104 77 0.740
sfc_joint <- ipw_for_graphing %>%
filter(select_in == TRUE) %>%
group_by(form) %>%
summarize(SfC = n())
cqc_joint <- ipw_for_graphing %>%
group_by(form) %>%
summarize(CQC = n()) %>%
left_join(sfc_joint, by = "form") %>%
ungroup() %>%
filter(form!="NA") %>%
mutate(ratio = SfC/CQC)
cqc_joint
## # A tibble: 5 × 4
## form CQC SfC ratio
## <fct> <int> <int> <dbl>
## 1 FPO 9678 4514 0.466
## 2 CIC 84 51 0.607
## 3 GOV 388 358 0.923
## 4 IND 813 273 0.336
## 5 NPO 1979 1308 0.661
model_simple <- lm(rating_num ~ form + category + Year + region,
data = filter(cleaned_sfc, domain == "Overall"),
weights = ipw)
tidy(model_simple)
## # A tibble: 19 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.05 0.119 25.6 1.27e-137
## 2 formCIC 0.0435 0.0672 0.647 5.18e- 1
## 3 formGOV 0.0302 0.0316 0.956 3.39e- 1
## 4 formIND -0.0296 0.0225 -1.31 1.89e- 1
## 5 formNPO 0.0400 0.0151 2.64 8.26e- 3
## 6 categoryresidential -0.0801 0.0115 -6.94 4.42e- 12
## 7 Year2017 0.0558 0.119 0.468 6.40e- 1
## 8 Year2018 0.0542 0.118 0.458 6.47e- 1
## 9 Year2019 -0.0751 0.118 -0.636 5.25e- 1
## 10 Year2020 -0.201 0.119 -1.69 9.01e- 2
## 11 Year2021 -0.571 0.119 -4.81 1.58e- 6
## 12 regionEast of England -0.0220 0.0236 -0.931 3.52e- 1
## 13 regionLondon -0.0365 0.0241 -1.52 1.29e- 1
## 14 regionNorth East 0.116 0.0312 3.73 1.96e- 4
## 15 regionNorth West 0.0196 0.0232 0.847 3.97e- 1
## 16 regionSouth East -0.00191 0.0213 -0.0893 9.29e- 1
## 17 regionSouth West 0.0772 0.0232 3.33 8.79e- 4
## 18 regionWest Midlands -0.0476 0.0236 -2.01 4.40e- 2
## 19 regionYorkshire and The Humber -0.0696 0.0245 -2.84 4.57e- 3
model_cqc <- lm(rating_num ~ form + category + Year + region,
data = filter(cqc, domain == "Overall"))
tidy(model_cqc)
## # A tibble: 20 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.99 0.0681 43.9 0
## 2 formCIC 0.0581 0.0471 1.23 2.18e- 1
## 3 formGOV 0.0297 0.0223 1.33 1.82e- 1
## 4 formIND -0.0610 0.0158 -3.87 1.10e- 4
## 5 formNA -0.00593 0.175 -0.0338 9.73e- 1
## 6 formNPO 0.0361 0.0107 3.39 7.08e- 4
## 7 categoryresidential -0.0680 0.00812 -8.37 6.21e-17
## 8 Year2017 0.0677 0.0684 0.990 3.22e- 1
## 9 Year2018 0.0626 0.0674 0.929 3.53e- 1
## 10 Year2019 -0.0546 0.0672 -0.812 4.17e- 1
## 11 Year2020 -0.182 0.0676 -2.69 7.07e- 3
## 12 Year2021 -0.542 0.0679 -7.99 1.44e-15
## 13 regionEast of England 0.0343 0.0166 2.07 3.86e- 2
## 14 regionLondon 0.0162 0.0168 0.960 3.37e- 1
## 15 regionNorth East 0.130 0.0221 5.90 3.73e- 9
## 16 regionNorth West 0.0469 0.0163 2.89 3.90e- 3
## 17 regionSouth East 0.0420 0.0150 2.80 5.04e- 3
## 18 regionSouth West 0.114 0.0163 7.01 2.44e-12
## 19 regionWest Midlands -0.0362 0.0165 -2.19 2.87e- 2
## 20 regionYorkshire and The Humber -0.0358 0.0173 -2.07 3.84e- 2