Read me

This is the data cleaning file for generating ipw for the CQC locations selecting into Skills for Care data set. The overall steps is as follows

  1. Take the anonymized combined CQC-SfC data, generate a dummy variable based on whether SfC establishment_id is NA or not, select_in
  2. Filter out the Overall rating rows and calculate the ipw at the location level
  3. Run the logit regression model of select_in and calculate the ipw
  4. Derive the sub-dataframe of and establishment_id and ipw
  5. Import the cleaned SfC from last year
  6. Merge the ipw data-frame to the cleaned SfC data

Setting up

library(tidyverse) # package for data cleaning and plotting
library(readxl) # package for reading excel file
library(broom) # extracting model summary as data frame
library(modelsummary) # deriving model tables
library(scales) # label percent
library(lubridate) # working with dates
# environment setup to run ordered logit properly
options(contrasts = rep("contr.treatment", 2))

Import data

#import the raw data file
cqc_skills <- read_excel("data/SfC_CQC.xlsx")
skills_raw <- read_csv("data/Skills_raw.csv")
#import the cleaned skills for care data
cleaned_cqc_skills <- read_csv("data/cqc_sfc_cleaned.csv")

Data Cleaning for CQC_skills combined data set

#select the CQC data that has Skills for Care establishment id
cqc <- cqc_skills %>% 
  mutate(select_in = ifelse(!is.na(establishmentid), 1, 0)) %>% 
    mutate(inherited = ifelse(inherited == "Y", TRUE, FALSE),
         rating = recode(rating, 
                         "Insufficient evidence to rate" = "NA",
                         "Requires improvement" = "Req improv")) %>% 
  # set the order of the values in the factors 
  mutate(form = fct_relevel(form, "FPO"),
         
  # assume the order of the ratings as follows but need to double check with the source 
         rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding"))) %>%
  
  # adding the rating data coded as numerical
  mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
                                rating == "Req improv" ~ 2,
                                rating == "Good" ~ 3,
                                rating == "Outstanding" ~ 4)) %>% 
  
  mutate(category = case_when(primary_cat == "Community based adult social care services" ~ "community",
                              primary_cat == "Residential social care" ~ "residential",
                              TRUE ~ as.character(NA)),

         # deriving year column and dummy variable for before_covid
         year = year(publication_date),
         Year = factor(year),
         after_covid = ifelse(year >= 2020, TRUE, FALSE),
         before_covid = ifelse(year <= 2019, TRUE, FALSE))

head(cqc)
## # A tibble: 6 × 18
##   form  cic_type care_home primary_cat        region service_group domain rating
##   <fct> <chr>    <chr>     <chr>              <chr>  <chr>         <chr>  <ord> 
## 1 FPO   NA       Y         Residential socia… East … Overall       Safe   Good  
## 2 FPO   NA       Y         Residential socia… East … Overall       Effec… Good  
## 3 FPO   NA       Y         Residential socia… East … Overall       Caring Good  
## 4 FPO   NA       Y         Residential socia… East … Overall       Respo… Good  
## 5 FPO   NA       Y         Residential socia… East … Overall       Well-… Good  
## 6 FPO   NA       Y         Residential socia… East … Overall       Overa… Good  
## # ℹ 10 more variables: publication_date <dttm>, inherited <lgl>,
## #   establishmentid <dbl>, select_in <dbl>, rating_num <dbl>, category <chr>,
## #   year <dbl>, Year <fct>, after_covid <lgl>, before_covid <lgl>

derive overall rating sub data frame

cqc_overall <- cqc %>%
  filter(domain == "Overall")

Run logit model and calculate ipw

model_select <- glm(select_in ~ form + category + region + 
                    publication_date + inherited + rating_num,
                    data = cqc_overall,
                    family = binomial(link = "logit"))

tidy(model_select)
## # A tibble: 18 × 5
##    term                            estimate std.error statistic  p.value
##    <chr>                              <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                    -7.20e- 1  9.95e- 1    -0.724 4.69e- 1
##  2 formCIC                         5.31e- 1  2.27e- 1     2.34  1.92e- 2
##  3 formGOV                         2.59e+ 0  1.92e- 1    13.5   1.04e-41
##  4 formIND                        -5.19e- 1  7.81e- 2    -6.65  2.92e-11
##  5 formNA                          1.08e- 1  8.18e- 1     0.132 8.95e- 1
##  6 formNPO                         7.82e- 1  5.24e- 2    14.9   2.12e-50
##  7 categoryresidential            -5.24e- 2  3.92e- 2    -1.34  1.81e- 1
##  8 regionEast of England           1.56e- 1  7.96e- 2     1.95  5.06e- 2
##  9 regionLondon                   -3.70e- 1  8.17e- 2    -4.53  5.99e- 6
## 10 regionNorth East                1.17e- 1  1.07e- 1     1.09  2.74e- 1
## 11 regionNorth West                1.19e- 1  7.82e- 2     1.52  1.29e- 1
## 12 regionSouth East                8.48e- 2  7.21e- 2     1.18  2.40e- 1
## 13 regionSouth West                2.59e- 1  7.85e- 2     3.30  9.57e- 4
## 14 regionWest Midlands             5.33e- 2  7.93e- 2     0.672 5.01e- 1
## 15 regionYorkshire and The Humber  1.00e- 1  8.33e- 2     1.20  2.28e- 1
## 16 publication_date               -1.51e-10  6.00e-10    -0.252 8.01e- 1
## 17 inheritedTRUE                   4.40e- 1  7.86e- 2     5.60  2.13e- 8
## 18 rating_num                      2.67e- 1  4.24e- 2     6.31  2.85e-10

Calculate IPW

ipw_data <- augment_columns(model_select,
                            cqc_overall,
                            type.predict = "response") %>% 
            rename(propensity = .fitted) %>% 
            mutate(ipw = (select_in / propensity) + (1 - select_in) / (1 - propensity)) %>% 
            select(establishmentid, ipw) %>% 
            filter(establishmentid != "NA") %>% 
            group_by(establishmentid) %>% 
            slice(1)

Merge the ipw data into the cleaned, CQC-Skills for care data

cleaned_sfc <- cleaned_cqc_skills %>% 
  left_join(ipw_data, by = "establishmentid") %>% 
    mutate(inherited = ifelse(inherited == "Y", TRUE, FALSE),
         rating = recode(rating, 
                         "Insufficient evidence to rate" = "NA",
                         "Requires improvement" = "Req improv")) %>% 
  # set the order of the values in the factors 
  mutate(form = fct_relevel(form, "FPO"),
         
  # assume the order of the ratings as follows but need to double check with the source 
         rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding"))) %>%
  
  # adding the rating data coded as numerical
  mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
                                rating == "Req improv" ~ 2,
                                rating == "Good" ~ 3,
                                rating == "Outstanding" ~ 4)) %>% 
  
  mutate(category = case_when(primary_cat == "Community based adult social care services" ~ "community",
                              primary_cat == "Residential social care" ~ "residential",
                              TRUE ~ as.character(NA)),

         # deriving year column and dummy variable for before_covid
         year = year(publication_date),
         Year = factor(year),
         after_covid = ifelse(year >= 2020, TRUE, FALSE),
         before_covid = ifelse(year <= 2019, TRUE, FALSE))

head(cleaned_sfc)
## # A tibble: 6 × 89
##    ...1 form  cic_type care_home primary_cat             region    service_group
##   <dbl> <fct> <chr>    <chr>     <chr>                   <chr>     <chr>        
## 1     1 FPO   <NA>     Y         Residential social care East Mid… Overall      
## 2     2 FPO   <NA>     Y         Residential social care East Mid… Overall      
## 3     3 FPO   <NA>     Y         Residential social care East Mid… Overall      
## 4     4 FPO   <NA>     Y         Residential social care East Mid… Overall      
## 5     5 FPO   <NA>     Y         Residential social care East Mid… Overall      
## 6     6 FPO   <NA>     Y         Residential social care East Mid… Overall      
## # ℹ 82 more variables: publication_date <date>, inherited <lgl>,
## #   establishmentid <dbl>, ParentSubSingle <dbl>, isparent <dbl>,
## #   estabcreateddate <chr>, Starters_Leavers_FILTER <dbl>, totalstarters <dbl>,
## #   totalleavers <dbl>, totalstaff <dbl>, totalvacancies <dbl>, ORGWRKS <dbl>,
## #   ORGWRKSGP <dbl>, esttype <dbl>, mainstid <dbl>, MAINSERGP1 <dbl>,
## #   MAINSERGP2 <dbl>, jr28flag <dbl>, jr28perm <dbl>, jr28temp <dbl>,
## #   jr28pool <dbl>, jr28agcy <dbl>, jr28oth <dbl>, jr28emp <dbl>, …

Save data

# write.csv(cleaned_sfc, file = "data/cleaned_sfc_ipw.csv")  

Data overview - derive CQC-SFC compare tables

build a data frame to derive the tables

ipw_for_graphing <- augment_columns(model_select,
                            cqc_overall,
                            type.predict = "response") %>% 
            rename(propensity = .fitted) %>% 
            mutate(ipw = (select_in / propensity) + (1 - select_in) / (1 - propensity)) %>% 
            mutate(SfC = ifelse(select_in == 1, TRUE, FALSE)) 
datasummary(form * (1 + rating + rating_num) ~ (1 + SfC ) *(1 + Mean),
            data = ipw_for_graphing,
            fmt = 2)
NA SfC
All FALSE TRUE
form All Mean All Mean All Mean
FPO All 9678.00 5164.00 4514.00
rating Inadequate 129.00 81.00 48.00
Req improv 1480.00 875.00 605.00
Good 7616.00 4010.00 3606.00
Outstanding 453.00 198.00 255.00
rating_num 9678.00 2.87 5164.00 2.84 4514.00 2.90
CIC All 84.00 33.00 51.00
rating Inadequate 0.00 0.00 0.00
Req improv 8.00 3.00 5.00
Good 68.00 28.00 40.00
Outstanding 8.00 2.00 6.00
rating_num 84.00 3.00 33.00 2.97 51.00 3.02
GOV All 388.00 30.00 358.00
rating Inadequate 0.00 0.00 0.00
Req improv 38.00 3.00 35.00
Good 330.00 26.00 304.00
Outstanding 20.00 1.00 19.00
rating_num 388.00 2.95 30.00 2.93 358.00 2.96
IND All 813.00 540.00 273.00
rating Inadequate 12.00 10.00 2.00
Req improv 140.00 103.00 37.00
Good 640.00 419.00 221.00
Outstanding 21.00 8.00 13.00
rating_num 813.00 2.82 540.00 2.79 273.00 2.90
NA All 6.00 3.00 3.00
rating Inadequate 0.00 0.00 0.00
Req improv 1.00 0.00 1.00
Good 5.00 3.00 2.00
Outstanding 0.00 0.00 0.00
rating_num 6.00 2.83 3.00 3.00 3.00 2.67
NPO All 1979.00 671.00 1308.00
rating Inadequate 12.00 5.00 7.00
Req improv 178.00 71.00 107.00
Good 1685.00 568.00 1117.00
Outstanding 104.00 27.00 77.00
rating_num 1979.00 2.95 671.00 2.92 1308.00 2.97
sfc_count <- ipw_for_graphing %>%
  filter(select_in == TRUE) %>% 
  group_by(form, rating) %>%
  summarize(SfC = n())
## `summarise()` has grouped output by 'form'. You can override using the
## `.groups` argument.
cqc_count <- ipw_for_graphing %>%
  group_by(form, rating) %>%
  summarize(CQC = n()) %>% 
  left_join(sfc_count, by = c("form", "rating")) %>% 
  ungroup() %>% 
  filter(form!="NA") %>%
  mutate(ratio = SfC/CQC)
## `summarise()` has grouped output by 'form'. You can override using the
## `.groups` argument.
cqc_count
## # A tibble: 18 × 5
##    form  rating        CQC   SfC ratio
##    <fct> <ord>       <int> <int> <dbl>
##  1 FPO   Inadequate    129    48 0.372
##  2 FPO   Req improv   1480   605 0.409
##  3 FPO   Good         7616  3606 0.473
##  4 FPO   Outstanding   453   255 0.563
##  5 CIC   Req improv      8     5 0.625
##  6 CIC   Good           68    40 0.588
##  7 CIC   Outstanding     8     6 0.75 
##  8 GOV   Req improv     38    35 0.921
##  9 GOV   Good          330   304 0.921
## 10 GOV   Outstanding    20    19 0.95 
## 11 IND   Inadequate     12     2 0.167
## 12 IND   Req improv    140    37 0.264
## 13 IND   Good          640   221 0.345
## 14 IND   Outstanding    21    13 0.619
## 15 NPO   Inadequate     12     7 0.583
## 16 NPO   Req improv    178   107 0.601
## 17 NPO   Good         1685  1117 0.663
## 18 NPO   Outstanding   104    77 0.740
sfc_joint <- ipw_for_graphing %>%
  filter(select_in == TRUE) %>% 
  group_by(form) %>%
  summarize(SfC = n())
cqc_joint <- ipw_for_graphing %>%
  group_by(form) %>%
  summarize(CQC = n()) %>% 
  left_join(sfc_joint, by = "form") %>% 
  ungroup() %>% 
  filter(form!="NA") %>%
  mutate(ratio = SfC/CQC)
cqc_joint
## # A tibble: 5 × 4
##   form    CQC   SfC ratio
##   <fct> <int> <int> <dbl>
## 1 FPO    9678  4514 0.466
## 2 CIC      84    51 0.607
## 3 GOV     388   358 0.923
## 4 IND     813   273 0.336
## 5 NPO    1979  1308 0.661

Analysis with the sub-set with ipw

model_simple <- lm(rating_num ~ form + category + Year + region,
                   data = filter(cleaned_sfc, domain == "Overall"),
                   weights = ipw)

tidy(model_simple)
## # A tibble: 19 × 5
##    term                           estimate std.error statistic   p.value
##    <chr>                             <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                     3.05       0.119    25.6    1.27e-137
##  2 formCIC                         0.0435     0.0672    0.647  5.18e-  1
##  3 formGOV                         0.0302     0.0316    0.956  3.39e-  1
##  4 formIND                        -0.0296     0.0225   -1.31   1.89e-  1
##  5 formNPO                         0.0400     0.0151    2.64   8.26e-  3
##  6 categoryresidential            -0.0801     0.0115   -6.94   4.42e- 12
##  7 Year2017                        0.0558     0.119     0.468  6.40e-  1
##  8 Year2018                        0.0542     0.118     0.458  6.47e-  1
##  9 Year2019                       -0.0751     0.118    -0.636  5.25e-  1
## 10 Year2020                       -0.201      0.119    -1.69   9.01e-  2
## 11 Year2021                       -0.571      0.119    -4.81   1.58e-  6
## 12 regionEast of England          -0.0220     0.0236   -0.931  3.52e-  1
## 13 regionLondon                   -0.0365     0.0241   -1.52   1.29e-  1
## 14 regionNorth East                0.116      0.0312    3.73   1.96e-  4
## 15 regionNorth West                0.0196     0.0232    0.847  3.97e-  1
## 16 regionSouth East               -0.00191    0.0213   -0.0893 9.29e-  1
## 17 regionSouth West                0.0772     0.0232    3.33   8.79e-  4
## 18 regionWest Midlands            -0.0476     0.0236   -2.01   4.40e-  2
## 19 regionYorkshire and The Humber -0.0696     0.0245   -2.84   4.57e-  3
model_cqc <- lm(rating_num ~ form + category + Year + region,
                   data = filter(cqc, domain == "Overall"))

tidy(model_cqc)
## # A tibble: 20 × 5
##    term                           estimate std.error statistic  p.value
##    <chr>                             <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                     2.99      0.0681    43.9    0       
##  2 formCIC                         0.0581    0.0471     1.23   2.18e- 1
##  3 formGOV                         0.0297    0.0223     1.33   1.82e- 1
##  4 formIND                        -0.0610    0.0158    -3.87   1.10e- 4
##  5 formNA                         -0.00593   0.175     -0.0338 9.73e- 1
##  6 formNPO                         0.0361    0.0107     3.39   7.08e- 4
##  7 categoryresidential            -0.0680    0.00812   -8.37   6.21e-17
##  8 Year2017                        0.0677    0.0684     0.990  3.22e- 1
##  9 Year2018                        0.0626    0.0674     0.929  3.53e- 1
## 10 Year2019                       -0.0546    0.0672    -0.812  4.17e- 1
## 11 Year2020                       -0.182     0.0676    -2.69   7.07e- 3
## 12 Year2021                       -0.542     0.0679    -7.99   1.44e-15
## 13 regionEast of England           0.0343    0.0166     2.07   3.86e- 2
## 14 regionLondon                    0.0162    0.0168     0.960  3.37e- 1
## 15 regionNorth East                0.130     0.0221     5.90   3.73e- 9
## 16 regionNorth West                0.0469    0.0163     2.89   3.90e- 3
## 17 regionSouth East                0.0420    0.0150     2.80   5.04e- 3
## 18 regionSouth West                0.114     0.0163     7.01   2.44e-12
## 19 regionWest Midlands            -0.0362    0.0165    -2.19   2.87e- 2
## 20 regionYorkshire and The Humber -0.0358    0.0173    -2.07   3.84e- 2