Read me

This is the data cleaning file for generating ipw for the CQC locations selecting into Skills for Care data set. The overall steps is as follows

  1. Take the anonymized combined CQC-SfC data, generate a dummy variable based on whether SfC establishment_id is NA or not, select_in
  2. Filter out the Overall rating rows and calculate the ipw at the location level
  3. Run the logit regression model of select_in and calculate the ipw
  4. Derive the sub-dataframe of and establishment_id and ipw
  5. Import the cleaned SfC from last year
  6. Merge the ipw data-frame to the cleaned SfC data

Setting up

library(tidyverse) # package for data cleaning and plotting
library(readxl) # package for reading excel file
library(broom) # extracting model summary as data frame
library(modelsummary) # deriving model tables
library(scales) # label percent
library(lubridate) # working with dates
# environment setup to run ordered logit properly
options(contrasts = rep("contr.treatment", 2))

Import data

#import the raw data file
cqc_skills <- read_excel("data/SfC_CQC.xlsx")
skills_raw <- read_csv("data/Skills_raw.csv")
#import the cleaned skills for care data
cleaned_cqc_skills <- read_csv("data/cqc_sfc_cleaned.csv")

Data Cleaning for CQC_skills combined data set

#select the CQC data that has Skills for Care establishment id
cqc <- cqc_skills %>% 
  mutate(select_in = ifelse(!is.na(establishmentid), 1, 0)) %>% 
    mutate(inherited = ifelse(inherited == "Y", TRUE, FALSE),
         rating = recode(rating, 
                         "Insufficient evidence to rate" = "NA",
                         "Requires improvement" = "Req improv")) %>% 
  # set the order of the values in the factors 
  mutate(form = fct_relevel(form, "FPO"),
         
  # assume the order of the ratings as follows but need to double check with the source 
         rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding"))) %>%
  
  # adding the rating data coded as numerical
  mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
                                rating == "Req improv" ~ 2,
                                rating == "Good" ~ 3,
                                rating == "Outstanding" ~ 4)) %>% 
  
  mutate(category = case_when(primary_cat == "Community based adult social care services" ~ "community",
                              primary_cat == "Residential social care" ~ "residential",
                              TRUE ~ as.character(NA)),

         # deriving year column and dummy variable for before_covid
         year = year(publication_date),
         Year = factor(year),
         after_covid = ifelse(year >= 2020, TRUE, FALSE),
         before_covid = ifelse(year <= 2019, TRUE, FALSE))

head(cqc)
## # A tibble: 6 × 18
##   form  cic_type care_home primary_cat        region service_group domain rating
##   <fct> <chr>    <chr>     <chr>              <chr>  <chr>         <chr>  <ord> 
## 1 FPO   NA       Y         Residential socia… East … Overall       Safe   Good  
## 2 FPO   NA       Y         Residential socia… East … Overall       Effec… Good  
## 3 FPO   NA       Y         Residential socia… East … Overall       Caring Good  
## 4 FPO   NA       Y         Residential socia… East … Overall       Respo… Good  
## 5 FPO   NA       Y         Residential socia… East … Overall       Well-… Good  
## 6 FPO   NA       Y         Residential socia… East … Overall       Overa… Good  
## # ℹ 10 more variables: publication_date <dttm>, inherited <lgl>,
## #   establishmentid <dbl>, select_in <dbl>, rating_num <dbl>, category <chr>,
## #   year <dbl>, Year <fct>, after_covid <lgl>, before_covid <lgl>

derive overall rating sub data frame

cqc_overall <- cqc %>%
  filter(domain == "Overall")

Run logit model and calculate ipw

model_select <- glm(select_in ~ form + category + region + 
                    publication_date + inherited + rating_num,
                    data = cqc_overall,
                    family = binomial(link = "logit"))

tidy(model_select)
## # A tibble: 18 × 5
##    term                            estimate std.error statistic  p.value
##    <chr>                              <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                    -7.20e- 1  9.95e- 1    -0.724 4.69e- 1
##  2 formCIC                         5.31e- 1  2.27e- 1     2.34  1.92e- 2
##  3 formGOV                         2.59e+ 0  1.92e- 1    13.5   1.04e-41
##  4 formIND                        -5.19e- 1  7.81e- 2    -6.65  2.92e-11
##  5 formNA                          1.08e- 1  8.18e- 1     0.132 8.95e- 1
##  6 formNPO                         7.82e- 1  5.24e- 2    14.9   2.12e-50
##  7 categoryresidential            -5.24e- 2  3.92e- 2    -1.34  1.81e- 1
##  8 regionEast of England           1.56e- 1  7.96e- 2     1.95  5.06e- 2
##  9 regionLondon                   -3.70e- 1  8.17e- 2    -4.53  5.99e- 6
## 10 regionNorth East                1.17e- 1  1.07e- 1     1.09  2.74e- 1
## 11 regionNorth West                1.19e- 1  7.82e- 2     1.52  1.29e- 1
## 12 regionSouth East                8.48e- 2  7.21e- 2     1.18  2.40e- 1
## 13 regionSouth West                2.59e- 1  7.85e- 2     3.30  9.57e- 4
## 14 regionWest Midlands             5.33e- 2  7.93e- 2     0.672 5.01e- 1
## 15 regionYorkshire and The Humber  1.00e- 1  8.33e- 2     1.20  2.28e- 1
## 16 publication_date               -1.51e-10  6.00e-10    -0.252 8.01e- 1
## 17 inheritedTRUE                   4.40e- 1  7.86e- 2     5.60  2.13e- 8
## 18 rating_num                      2.67e- 1  4.24e- 2     6.31  2.85e-10

Calculate IPW

ipw_data <- augment_columns(model_select,
                            cqc_overall,
                            type.predict = "response") %>% 
            rename(propensity = .fitted) %>% 
            mutate(ipw = (select_in / propensity) + (1 - select_in) / (1 - propensity)) %>% 
            select(establishmentid, ipw) %>% 
            filter(establishmentid != "NA") %>% 
            group_by(establishmentid) %>% 
            slice(1)
model_select_short <- glm(select_in ~ form +  rating_num,
                    data = cqc_overall,
                    family = binomial(link = "logit"))

tidy(model_select_short)
## # A tibble: 7 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -0.952    0.113     -8.42  3.88e-17
## 2 formCIC        0.534    0.225      2.38  1.74e- 2
## 3 formGOV        2.60     0.191     13.6   4.28e-42
## 4 formIND       -0.538    0.0772    -6.97  3.22e-12
## 5 formNA         0.145    0.818      0.177 8.59e- 1
## 6 formNPO        0.779    0.0518    15.0   3.45e-51
## 7 rating_num     0.285    0.0387     7.36  1.87e-13
ipw_data_short <- augment_columns(model_select_short,
                            cqc_overall,
                            type.predict = "response") %>% 
            rename(propensity = .fitted) %>% 
            mutate(ipw_short = (select_in / propensity) + (1 - select_in) / (1 - propensity)) %>% 
            select(establishmentid, ipw_short) %>% 
            filter(establishmentid != "NA") %>% 
            group_by(establishmentid) %>% 
            slice(1)
ipw_data <- ipw_data %>% 
  left_join(ipw_data_short, by = "establishmentid")

cqc_overall <- cqc_overall %>%
  left_join(ipw_data, by = "establishmentid")

Merge the ipw data into the cleaned, CQC-Skills for care data

cleaned_sfc <- cleaned_cqc_skills %>% 
  left_join(ipw_data, by = "establishmentid") %>% 
    mutate(inherited = ifelse(inherited == "Y", TRUE, FALSE),
         rating = recode(rating, 
                         "Insufficient evidence to rate" = "NA",
                         "Requires improvement" = "Req improv")) %>% 
  # set the order of the values in the factors 
  mutate(form = fct_relevel(form, "FPO"),
         
  # assume the order of the ratings as follows but need to double check with the source 
         rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding"))) %>%
  
  # adding the rating data coded as numerical
  mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
                                rating == "Req improv" ~ 2,
                                rating == "Good" ~ 3,
                                rating == "Outstanding" ~ 4)) %>% 
  
  mutate(category = case_when(primary_cat == "Community based adult social care services" ~ "community",
                              primary_cat == "Residential social care" ~ "residential",
                              TRUE ~ as.character(NA)),

         # deriving year column and dummy variable for before_covid
         year = year(publication_date),
         Year = factor(year),
         after_covid = ifelse(year >= 2020, TRUE, FALSE),
         before_covid = ifelse(year <= 2019, TRUE, FALSE))

Save data

# write.csv(cleaned_sfc, file = "data/cleaned_sfc_ipw.csv")  

Data overview - derive CQC-SFC compare tables

build a data frame to derive the tables

ipw_for_graphing <- cqc_overall %>% 
            mutate(SfC = ifelse(select_in == 1, TRUE, FALSE),
                   weighted_rating_num = rating_num * ipw,
                   weighted_rating_num_short = rating_num * ipw_short) 
datasummary(form * (1 + rating + rating_num + weighted_rating_num+ weighted_rating_num_short) ~ (1 + SfC ) *(1 + Mean),
            data = ipw_for_graphing,
            fmt = 2)
NA SfC
All FALSE TRUE
form All Mean All Mean All Mean
FPO All 10698.00 5740.00 4958.00
rating Inadequate 129.00 81.00 48.00
Req improv 1481.00 875.00 606.00
Good 7617.00 4011.00 3606.00
Outstanding 453.00 198.00 255.00
rating_num 10698.00 2.87 5740.00 2.84 4958.00 2.90
weighted_rating_num 10698.00 6.14 5740.00 4958.00 6.14
weighted_rating_num_short 10698.00 6.14 5740.00 4958.00 6.14
CIC All 84.00 33.00 51.00
rating Inadequate 0.00 0.00 0.00
Req improv 8.00 3.00 5.00
Good 68.00 28.00 40.00
Outstanding 8.00 2.00 6.00
rating_num 84.00 3.00 33.00 2.97 51.00 3.02
weighted_rating_num 84.00 4.88 33.00 51.00 4.88
weighted_rating_num_short 84.00 4.93 33.00 51.00 4.93
GOV All 405.00 30.00 375.00
rating Inadequate 0.00 0.00 0.00
Req improv 38.00 3.00 35.00
Good 331.00 26.00 305.00
Outstanding 20.00 1.00 19.00
rating_num 405.00 2.95 30.00 2.93 375.00 2.96
weighted_rating_num 405.00 3.20 30.00 375.00 3.20
weighted_rating_num_short 405.00 3.20 30.00 375.00 3.20
IND All 859.00 576.00 283.00
rating Inadequate 12.00 10.00 2.00
Req improv 140.00 103.00 37.00
Good 640.00 419.00 221.00
Outstanding 21.00 8.00 13.00
rating_num 859.00 2.82 576.00 2.79 283.00 2.90
weighted_rating_num 859.00 8.38 576.00 283.00 8.38
weighted_rating_num_short 859.00 8.46 576.00 283.00 8.46
NA All 6.00 3.00 3.00
rating Inadequate 0.00 0.00 0.00
Req improv 1.00 0.00 1.00
Good 5.00 3.00 2.00
Outstanding 0.00 0.00 0.00
rating_num 6.00 2.83 3.00 3.00 3.00 2.67
weighted_rating_num 6.00 5.43 3.00 3.00 5.43
weighted_rating_num_short 6.00 5.42 3.00 3.00 5.42
NPO All 2066.00 703.00 1363.00
rating Inadequate 12.00 5.00 7.00
Req improv 178.00 71.00 107.00
Good 1685.00 568.00 1117.00
Outstanding 105.00 28.00 77.00
rating_num 2066.00 2.95 703.00 2.92 1363.00 2.97
weighted_rating_num 2066.00 4.47 703.00 1363.00 4.47
weighted_rating_num_short 2066.00 4.47 703.00 1363.00 4.47
sfc_count <- ipw_for_graphing %>%
  filter(select_in == TRUE) %>% 
  group_by(form, rating) %>%
  summarize(SfC = n(),
            SfC_weighted = sum(ipw, na.rm = TRUE),
            SfC_weighted_short = sum(ipw_short, na.rm = TRUE)) %>% 
  ungroup() %>% 
    filter(form!="NA")
sfc_count
## # A tibble: 22 × 5
##    form  rating        SfC SfC_weighted SfC_weighted_short
##    <fct> <ord>       <int>        <dbl>              <dbl>
##  1 FPO   Inadequate     48       141.               142.  
##  2 FPO   Req improv    606      1486.              1492.  
##  3 FPO   Good         3606      7588.              7581.  
##  4 FPO   Outstanding   255       462.               466.  
##  5 FPO   <NA>          443       451.               445.  
##  6 CIC   Req improv      5         9.56               9.29
##  7 CIC   Good           40        65.0               65.8 
##  8 CIC   Outstanding     6         8.69               8.81
##  9 GOV   Req improv     35        38.8               38.8 
## 10 GOV   Good          305       329.               329.  
## # ℹ 12 more rows
cqc_count <- ipw_for_graphing %>%
  group_by(form, rating) %>%
  summarize(CQC = n()) %>% 
  left_join(sfc_count, by = c("form", "rating")) %>% 
  ungroup() %>% 
  filter(form!="NA") %>%
  mutate(ratio = SfC/CQC) %>% 
  ungroup()
## `summarise()` has grouped output by 'form'. You can override using the
## `.groups` argument.
cqc_count
## # A tibble: 22 × 7
##    form  rating        CQC   SfC SfC_weighted SfC_weighted_short ratio
##    <fct> <ord>       <int> <int>        <dbl>              <dbl> <dbl>
##  1 FPO   Inadequate    129    48       141.               142.   0.372
##  2 FPO   Req improv   1481   606      1486.              1492.   0.409
##  3 FPO   Good         7617  3606      7588.              7581.   0.473
##  4 FPO   Outstanding   453   255       462.               466.   0.563
##  5 FPO   <NA>         1018   443       451.               445.   0.435
##  6 CIC   Req improv      8     5         9.56               9.29 0.625
##  7 CIC   Good           68    40        65.0               65.8  0.588
##  8 CIC   Outstanding     8     6         8.69               8.81 0.75 
##  9 GOV   Req improv     38    35        38.8               38.8  0.921
## 10 GOV   Good          331   305       329.               329.   0.921
## # ℹ 12 more rows
sfc_joint <- ipw_for_graphing %>%
  filter(select_in == TRUE) %>% 
  group_by(form) %>%
  summarize(SfC = n())
cqc_joint <- ipw_for_graphing %>%
  group_by(form) %>%
  summarize(CQC = n()) %>% 
  left_join(sfc_joint, by = "form") %>% 
  ungroup() %>% 
  filter(form!="NA") %>%
  mutate(ratio = SfC/CQC)
cqc_joint
## # A tibble: 5 × 4
##   form    CQC   SfC ratio
##   <fct> <int> <int> <dbl>
## 1 FPO   10698  4958 0.463
## 2 CIC      84    51 0.607
## 3 GOV     405   375 0.926
## 4 IND     859   283 0.329
## 5 NPO    2066  1363 0.660

Analysis with the sub-set with ipw

model_simple <- lm(rating_num ~ form + category + Year + region,
                   data = filter(cleaned_sfc, domain == "Overall"),
                   weights = ipw)

tidy(model_simple)
## # A tibble: 19 × 5
##    term                           estimate std.error statistic   p.value
##    <chr>                             <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                     3.05       0.119    25.6    1.27e-137
##  2 formCIC                         0.0435     0.0672    0.647  5.18e-  1
##  3 formGOV                         0.0302     0.0316    0.956  3.39e-  1
##  4 formIND                        -0.0296     0.0225   -1.31   1.89e-  1
##  5 formNPO                         0.0400     0.0151    2.64   8.26e-  3
##  6 categoryresidential            -0.0801     0.0115   -6.94   4.42e- 12
##  7 Year2017                        0.0558     0.119     0.468  6.40e-  1
##  8 Year2018                        0.0542     0.118     0.458  6.47e-  1
##  9 Year2019                       -0.0751     0.118    -0.636  5.25e-  1
## 10 Year2020                       -0.201      0.119    -1.69   9.01e-  2
## 11 Year2021                       -0.571      0.119    -4.81   1.58e-  6
## 12 regionEast of England          -0.0220     0.0236   -0.931  3.52e-  1
## 13 regionLondon                   -0.0365     0.0241   -1.52   1.29e-  1
## 14 regionNorth East                0.116      0.0312    3.73   1.96e-  4
## 15 regionNorth West                0.0196     0.0232    0.847  3.97e-  1
## 16 regionSouth East               -0.00191    0.0213   -0.0893 9.29e-  1
## 17 regionSouth West                0.0772     0.0232    3.33   8.79e-  4
## 18 regionWest Midlands            -0.0476     0.0236   -2.01   4.40e-  2
## 19 regionYorkshire and The Humber -0.0696     0.0245   -2.84   4.57e-  3
model_cqc <- lm(rating_num ~ form + category + Year + region,
                   data = filter(cqc, domain == "Overall"))

tidy(model_cqc)
## # A tibble: 20 × 5
##    term                           estimate std.error statistic  p.value
##    <chr>                             <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                     2.99      0.0681    43.9    0       
##  2 formCIC                         0.0581    0.0471     1.23   2.18e- 1
##  3 formGOV                         0.0297    0.0223     1.33   1.82e- 1
##  4 formIND                        -0.0610    0.0158    -3.87   1.10e- 4
##  5 formNA                         -0.00593   0.175     -0.0338 9.73e- 1
##  6 formNPO                         0.0361    0.0107     3.39   7.08e- 4
##  7 categoryresidential            -0.0680    0.00812   -8.37   6.21e-17
##  8 Year2017                        0.0677    0.0684     0.990  3.22e- 1
##  9 Year2018                        0.0626    0.0674     0.929  3.53e- 1
## 10 Year2019                       -0.0546    0.0672    -0.812  4.17e- 1
## 11 Year2020                       -0.182     0.0676    -2.69   7.07e- 3
## 12 Year2021                       -0.542     0.0679    -7.99   1.44e-15
## 13 regionEast of England           0.0343    0.0166     2.07   3.86e- 2
## 14 regionLondon                    0.0162    0.0168     0.960  3.37e- 1
## 15 regionNorth East                0.130     0.0221     5.90   3.73e- 9
## 16 regionNorth West                0.0469    0.0163     2.89   3.90e- 3
## 17 regionSouth East                0.0420    0.0150     2.80   5.04e- 3
## 18 regionSouth West                0.114     0.0163     7.01   2.44e-12
## 19 regionWest Midlands            -0.0362    0.0165    -2.19   2.87e- 2
## 20 regionYorkshire and The Humber -0.0358    0.0173    -2.07   3.84e- 2