Introduction

The current version include the data cleaning for location level CQC data downloaded Sep. 2021.

Load packages

library(tidyverse) # package for data cleaning and plotting
library(readxl)
library(modelsummary)
set.seed(5432)

Import Raw Data

# import location level full data
locations<- read_excel("D:/Archive/Socialcare UK/data/full_ratings.xlsx", sheet = 2)
# import CIC spin out coding
spinout <- read_csv("D:/Archive/Socialcare UK/data/spinout_list.csv")

Clean and filter out Gov and CIC providers

### Rename the columns 

locations_renamed <- locations %>% 
  rename(primary_cat = `Location Primary Inspection Category`,
         type = `Location Type`,
         region = `Location Region`,
         domain = Domain,
         rating = `Latest Rating`,
         inherited = `Inherited Rating (Y/N)`,
         provider_name = `Provider Name`,
         publication_date = `Publication Date`) 

Standardize the provider name

locations_std <- locations_renamed %>% 
# remove dots
  mutate(std_name = str_replace_all(provider_name, "\\.", "")) %>% 
# capitalize all
  mutate(std_name = str_to_upper(std_name)) %>%
# standardize to LTD
  mutate(std_name = str_replace_all(std_name, "LIMITED", "LTD")) %>% 
# standardize to CIC
  mutate(std_name = str_replace_all(std_name, "COMMUNITY INTEREST COMPANY", "CIC"))

filtering out CIC and GOV providers

## building the string detect conditions
keep_terms <- c("CIC", "NHS ", "FOUNDATION TRUST", "COUNCIL", " CC ", " MDC ", " DC ", " BC ", "BOROUGH")
keep_filter <- paste0("\\b(", paste(keep_terms, collapse = "|"), ")\\b")
## filtering out CICs and GOVs
locations_coded <- locations_std %>% 
  filter(str_detect(std_name, keep_filter)) %>% 
  filter(!str_detect(std_name, "BOROUGH GREEN\\s|BOROUGH ROAD\\s")) %>% 
  ## add the form and level columns
  mutate(form = ifelse(str_detect(std_name, "CIC"), "CIC", "GOV"),
         level = "location") %>% 
  left_join(spinout, by = "provider_name") %>% 
  mutate(spin_out = ifelse(is.na(spin_out), FALSE, TRUE))
nrow(locations_coded)
## [1] 19931
#check the spin-out coding distribution 

table(locations_coded$form, locations_coded$spin_out)
##      
##       FALSE  TRUE
##   CIC   829   433
##   GOV 18669     0

save data

#write_csv(locations_coded, "D:/Archive/Socialcare UK/data/locations_coded.csv")