The current version include the data cleaning for location level CQC data downloaded Sep. 2021.
library(tidyverse) # package for data cleaning and plotting
library(readxl)
library(modelsummary)
set.seed(5432)
# import location level full data
locations<- read_excel("D:/Archive/Socialcare UK/data/full_ratings.xlsx", sheet = 2)
# import CIC spin out coding
spinout <- read_csv("D:/Archive/Socialcare UK/data/spinout_list.csv")
### Rename the columns
locations_renamed <- locations %>%
rename(primary_cat = `Location Primary Inspection Category`,
type = `Location Type`,
region = `Location Region`,
domain = Domain,
rating = `Latest Rating`,
inherited = `Inherited Rating (Y/N)`,
provider_name = `Provider Name`,
publication_date = `Publication Date`)
locations_std <- locations_renamed %>%
# remove dots
mutate(std_name = str_replace_all(provider_name, "\\.", "")) %>%
# capitalize all
mutate(std_name = str_to_upper(std_name)) %>%
# standardize to LTD
mutate(std_name = str_replace_all(std_name, "LIMITED", "LTD")) %>%
# standardize to CIC
mutate(std_name = str_replace_all(std_name, "COMMUNITY INTEREST COMPANY", "CIC"))
## building the string detect conditions
keep_terms <- c("CIC", "NHS ", "FOUNDATION TRUST", "COUNCIL", " CC ", " MDC ", " DC ", " BC ", "BOROUGH")
keep_filter <- paste0("\\b(", paste(keep_terms, collapse = "|"), ")\\b")
## filtering out CICs and GOVs
locations_coded <- locations_std %>%
filter(str_detect(std_name, keep_filter)) %>%
filter(!str_detect(std_name, "BOROUGH GREEN\\s|BOROUGH ROAD\\s")) %>%
## add the form and level columns
mutate(form = ifelse(str_detect(std_name, "CIC"), "CIC", "GOV"),
level = "location") %>%
left_join(spinout, by = "provider_name") %>%
mutate(spin_out = ifelse(is.na(spin_out), FALSE, TRUE))
nrow(locations_coded)
## [1] 19931
#check the spin-out coding distribution
table(locations_coded$form, locations_coded$spin_out)
##
## FALSE TRUE
## CIC 829 433
## GOV 18669 0
#write_csv(locations_coded, "D:/Archive/Socialcare UK/data/locations_coded.csv")