Updated 2/19/2023 to add new variable age and save the
combined data to prepare for further analysis
library(tidyverse) # package for data cleaning and plotting
library(readxl) # package for reading excel file
library(broom) # extracting model summary as data frame
library(modelsummary) # deriving model tables
library(scales) # label percent
library(lubridate) # working with dates
# environment setup to run ordered logit properly
options(contrasts = rep("contr.treatment", 2))
#import the raw data file
cqc_skills <- read_excel("data/SfC_CQC.xlsx")
skills_raw <- read_csv("data/Skills_raw.csv")
#select the CQC data that has Skills for Care establishment id
CQC_sub <- cqc_skills %>%
filter(!is.na(establishmentid)) %>%
#restructure to wide data (1 row for each establishment id)
pivot_wider(names_from = domain, values_from = rating) %>%
mutate(form = ifelse(form == "NA", NA, form))
head(CQC_sub)
## # A tibble: 6 × 15
## form cic_type care_home primary_cat region service_group publication_date
## <chr> <chr> <chr> <chr> <chr> <chr> <dttm>
## 1 FPO NA Y Residential… East … Overall 2020-03-03 00:00:00
## 2 FPO NA Y Residential… South… Overall 2017-10-31 00:00:00
## 3 FPO NA Y Residential… East … Overall 2018-12-14 00:00:00
## 4 FPO NA Y Residential… West … Overall 2019-09-17 00:00:00
## 5 FPO NA N Community b… Yorks… Overall 2020-02-15 00:00:00
## 6 NPO NA Y Residential… South… Overall 2018-11-13 00:00:00
## # ℹ 8 more variables: inherited <chr>, establishmentid <dbl>, Safe <chr>,
## # Effective <chr>, Caring <chr>, Responsive <chr>, `Well-led` <chr>,
## # Overall <chr>
CQC_sub %>%
summarize(all_rows = n(),
unique_rows = n_distinct(establishmentid))
## # A tibble: 1 × 2
## all_rows unique_rows
## <int> <int>
## 1 7258 6799
This results show that some facilities have more than 1 historical rating records.
#write variable name list
columns <- c("ParentSubSingle", "isparent", "estabcreateddate", #basic info
"Starters_Leavers_FILTER", "totalstarters", "totalleavers", #staff
"totalstaff", "totalvacancies","ORGWRKS", "ORGWRKSGP", #staff
"esttype", "mainstid", "MAINSERGP1") #service descriptions
Notes for some variables:
ParentSubSingle: values: 0, 1, 2. Does not say which is
parent/sub/single. It is observed 2 corresponds to `isparent = 1”, so
should be parent.
Starters_Leavers_FILTER: Flag to define if a workplace
has reliable starters or leavers information. 1 = Include, 0 =
Exclude.
ORGWRKS: Sum of totalstaff across the
organisation.
ORGWRKSGP: Grouping of ORGWRKS
Value Label -1 Not allocated 991 Micro (1 to 9) 992 Small (10 to 49) 993 Medium (50 to 249) 994 Large (250 or more)
esttype: sector of the workplace
mainstid: The main service provided by the workplace
totalstarters and totalleavers: Total
number of permanent and temporary staff that started/left in the
previous 12 months.
skills_selected <- skills_raw %>%
select(establishmentid, all_of(columns), starts_with("jr28"), starts_with("jr29"),
matches("^st.*cap$"), matches("^st.*util$"))
Code Label
28 Any job role. This is the sum of all job roles submitted by the establishment.
29 Any direct care role. This is the sum of all direct care roles submitted by the establishment. (Note: this variable may be more directly related to service quality)
30 Any manager/supervisor role. This is the sum of all manager/supervisor roles submitted by the establishment.
31 Any regulated profession role. This is the sum of all regulated profession roles submitted by the establishment.
32 Any other roles. This is the sum of all other roles, those not included in groups 29,30 and 31, submitted by the establishment.
1 Senior Management
2 Middle Management
3 First Line Manager
4 Registered Manager
5 Supervisor
6 Social Worker
7 Senior Care Worker
8 Care Worker
9 Community Support and Outreach Work
10 Employment Support
11 Advice Guidance and Advocacy
15 Occupational Therapist
16 Registered Nurse
17 Allied Health Professional
22 Technician
23 Other care-providing job role
24 Managers and staff in care-related but not care-providing roles
25 Administrative or office staff not care-providing
26 Ancillary staff not care-providing
27 Other non-care-providing job roles
34 Activities worker or co-ordinator
35 Safeguarding and reviewing officer
36 Occupational therapist assistant
37 Nursing Associate
38 Nursing Assistant
39 Assessment officer
40 Care co-ordinator
41 Care navigator
42 Any Childrens/young peoples job role
Use this small dataset to check if each establishment ID has multiple rows
skills_raw %>%
select(establishmentid) %>%
summarise(count = n(),
count_unqice = n_distinct(establishmentid))
## # A tibble: 1 × 2
## count count_unqice
## <int> <int>
## 1 18956 18956
In the Skills for Care data set, each establishmentid
only corresponds to 1 row data.
# merge and recode data
merged_wide <- CQC_sub %>%
left_join(skills_selected, by = "establishmentid") %>%
mutate(main_service_type = factor(mainstid),
main_group1 = factor(MAINSERGP1),
main_service_group = case_when(
main_group1 == "1" ~ "Adult residential",
main_group1 == "3" ~ "Adult domicilary",
TRUE ~ as.character(NA)),
# derive truncated service types (where CICs exist) see table 1
service_type_selected = case_when(
main_service_type == "1" ~ "care home w/ nursing",
main_service_type == "2" ~ "care home w/o nursing",
main_service_type == "8" ~ "Domicilary Care",
main_service_type == "55" ~ "Supported Living",
# dropped 61 -- community based for learning disability
TRUE ~ as.character(NA)),
# recode size type to be more readable
size_type = case_when(
ORGWRKSGP == -1 ~ as.character(NA),
ORGWRKSGP == 991 ~ "micro",
ORGWRKSGP == 992 ~ "small",
ORGWRKSGP == 993 ~ "medium",
ORGWRKSGP == 994 ~ "large",
TRUE ~ as.character(NA)),
# derive age of the organization
start_date = dmy(estabcreateddate),
age = 2023-year(start_date)
)
Tabulation of service types by legal forms
#the distribution of service by by forms, we can only keep the types that have CIC providers?
table1 <- datasummary(main_service_type ~ form,
data = merged_wide, fmt = 0)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'
table1
| main_service_type | CIC | FPO | GOV | IND | NPO |
|---|---|---|---|---|---|
| 1 | 5 | 1106 | 32 | 35 | 189 |
| 2 | 8 | 2179 | 167 | 210 | 832 |
| 5 | 0 | 0 | 0 | 2 | 0 |
| 6 | 0 | 0 | 0 | 0 | 1 |
| 8 | 30 | 1501 | 94 | 49 | 184 |
| 14 | 0 | 0 | 0 | 0 | 1 |
| 17 | 0 | 8 | 32 | 1 | 4 |
| 54 | 0 | 65 | 15 | 0 | 48 |
| 55 | 6 | 207 | 30 | 0 | 127 |
| 60 | 0 | 3 | 0 | 0 | 1 |
| 61 | 2 | 11 | 1 | 1 | 3 |
| 62 | 0 | 1 | 0 | 0 | 2 |
| 64 | 0 | 27 | 3 | 0 | 0 |
| 67 | 0 | 5 | 0 | 0 | 1 |
| 69 | 0 | 5 | 4 | 0 | 3 |
| 70 | 0 | 3 | 0 | 0 | 0 |
| 72 | 0 | 1 | 0 | 0 | 8 |
| 75 | 0 | 0 | 0 | 0 | 1 |
Tabulation of total
table_main_group1 <- datasummary(main_group1 ~ form,
data = merged_wide, fmt = 0)
table_main_group1
| main_group1 | CIC | FPO | GOV | IND | NPO |
|---|---|---|---|---|---|
| 1 | 13 | 3285 | 199 | 247 | 1021 |
| 2 | 0 | 0 | 0 | 0 | 1 |
| 3 | 36 | 1773 | 139 | 49 | 359 |
| 4 | 0 | 8 | 32 | 1 | 5 |
| 5 | 0 | 3 | 0 | 0 | 2 |
| 9 | 2 | 52 | 8 | 1 | 9 |
| 10 | 0 | 1 | 0 | 0 | 8 |
Tabulation of total staff (mean and sd) by form and size type
cross_table <- datasummary(main_service_group * service_type_selected ~ form,
data = merged_wide, fmt = 0)
cross_table
| main_service_group | service_type_selected | CIC | FPO | GOV | IND | NPO |
|---|---|---|---|---|---|---|
| Adult domicilary | care home w/ nursing | 0 | 0 | 0 | 0 | 0 |
| care home w/o nursing | 0 | 0 | 0 | 0 | 0 | |
| Domicilary Care | 30 | 1501 | 94 | 49 | 184 | |
| Supported Living | 6 | 207 | 30 | 0 | 127 | |
| Adult residential | care home w/ nursing | 5 | 1106 | 32 | 35 | 189 |
| care home w/o nursing | 8 | 2179 | 167 | 210 | 832 | |
| Domicilary Care | 0 | 0 | 0 | 0 | 0 | |
| Supported Living | 0 | 0 | 0 | 0 | 0 |
Check the distribution of service types over service group
table2 <- datasummary(totalstaff*form*(Mean + SD) ~ size_type,
data = merged_wide, fmt = 0)
table2
| form | large | medium | micro | small | ||
|---|---|---|---|---|---|---|
| totalstaff | CIC | Mean | 84 | 67 | 9 | 27 |
| SD | 154 | 34 | 10 | |||
| FPO | Mean | 59 | 65 | 6 | 28 | |
| SD | 65 | 41 | 2 | 11 | ||
| GOV | Mean | 53 | 22 | 17 | ||
| SD | 60 | 15 | ||||
| IND | Mean | 58 | 6 | 24 | ||
| SD | 25 | 2 | 10 | |||
| NPO | Mean | 46 | 60 | 7 | 27 | |
| SD | 65 | 49 | 2 | 11 |
Employee turnover rate is the percent of employees who
leave a company within a specific time period. Turnover rate is commonly
calculated by month, quarter, or year and includes both voluntary and
involuntary losses. Monthly and quarterly turnover rates are commonly
expressed as averages, while annual turnover rate is usually
cumulative.
\[ \text{Employee turnover rate} = \frac{\text{Number of employees who left}}{\text{Average number of employees}} \times 100 \]
\[ \text{Average number of employees} = \frac{\text{(Headcount at the begining of the timeframe + headcount at the end of the timeframe)}}{2} \]
Strictly speaking, we do not have the exact begin/end staff number for the n_left employees in the last year
count of employees left in last 12 month/latest total staff by service type, total staff/ number of vacancy - not useful due to a lot of missing value
Any other ideas? Please specify exact variable rather than vague concept
# derive new variables
merged_recoded_wide <- merged_wide %>%
# mutate NA only for jr28flag = 1 and jr29 = 1, respectively
mutate(across(starts_with("jr28"),
~ ifelse(jr28flag == 1, ifelse(is.na(.x), 0, .x), .x))) %>%
mutate(across(starts_with("jr29"),
~ ifelse(jr29flag == 1, ifelse(is.na(.x), 0, .x), .x))) %>%
mutate(across(starts_with("st"),
~ ifelse(is.na(.x), 0, .x))) %>%
mutate(totalstaff = ifelse(is.na(totalstaff), 0, totalstaff),
turnover = totalleavers/totalstaff*100,
turnover = ifelse(turnover < 0, NA, turnover),
turnover = ifelse(is.na(turnover) | turnover == Inf, NA, turnover)) %>%
rowwise() %>%
mutate(capacity = sum(across(matches("^st.*cap$"))),
utility = sum(across(matches("^st.*util$")))) %>%
ungroup() %>%
mutate(staff_level = totalstaff/utility*100,
staff_level = ifelse(staff_level == Inf, NA, staff_level))
For now, haven’t accounted for the vacancies. Because there seems to be quite some missing values. Need to check in more detail later.
table_staff <- datasummary(totalstaff * (Mean+Median) ~ form,
data = filter(merged_recoded_wide, !is.na(main_service_group)), fmt = 1, na.rm = T, output = "staff2.docx")
table_staff
## [1] "staff2.docx"
ggplot(filter(merged_recoded_wide, !is.na(main_service_group)),
aes(x = form, y = totalstaff)) +
geom_boxplot() +
coord_cartesian(ylim =c(0, 500)) +
facet_wrap(vars(main_service_group))
table4 <- datasummary(turnover * service_type_selected *(Mean + SD)~ form,
data = merged_recoded_wide, fmt = 1, na.rm = T)
table4
| service_type_selected | CIC | FPO | GOV | IND | NPO | ||
|---|---|---|---|---|---|---|---|
| turnover | care home w/ nursing | Mean | 40.3 | 34.9 | 14.3 | 18.3 | 27.3 |
| SD | 28.7 | 34.4 | 17.1 | 16.0 | 26.0 | ||
| care home w/o nursing | Mean | 9.1 | 28.9 | 13.5 | 18.9 | 21.6 | |
| SD | 12.9 | 34.2 | 11.6 | 27.1 | 26.6 | ||
| Domicilary Care | Mean | 30.7 | 37.7 | 12.3 | 28.9 | 26.2 | |
| SD | 29.7 | 59.2 | 9.0 | 35.6 | 33.3 | ||
| Supported Living | Mean | 10.7 | 26.1 | 10.6 | 33.7 | ||
| SD | 30.4 | 10.7 | 65.7 |
Prepare data for analysis
#pivot longer back
merged_long <- merged_recoded_wide %>%
pivot_longer(cols = c(Safe, Effective, Caring, Responsive, `Well-led`, Overall),
names_to = "domain", values_to = "rating")
#select relevant columns, rename and relabel
merged_long_ordered <- merged_long %>%
# recode legal form types to be more readable / easier to present
mutate(inherited = ifelse(inherited == "Y", TRUE, FALSE),
rating = recode(rating,
"Insufficient evidence to rate" = "NA",
"Requires improvement" = "Req improv")) %>%
# set the order of the values in the factors
mutate(form = ordered(form, levels = c("FPO", "NPO", "GOV", "CIC", "IND")),
# assume the order of the ratings as follows but need to double check with the source
rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding"))) %>%
# adding the rating data coded as numerical
mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
rating == "Req improv" ~ 2,
rating == "Good" ~ 3,
rating == "Outstanding" ~ 4)) %>%
mutate(category = case_when(primary_cat == "Community based adult social care services" ~ "community",
primary_cat == "Residential social care" ~ "residential",
TRUE ~ as.character(NA)),
# deriving year column and dummy variable for before_covid
year = year(publication_date),
after_covid = ifelse(year >= 2020, TRUE, FALSE),
before_covid = ifelse(year <= 2019, TRUE, FALSE)) %>%
# converting the ordinal variable to numerical
mutate(rating_num = case_when(rating == "Inadequate" ~ 1,
rating == "Req improv" ~ 2,
rating == "Good" ~ 3,
rating == "Outstanding" ~ 4)) %>%
# derive the rating dummy
mutate(rating_higher = ifelse(rating_num > 2, 1, 0))
# show first several rows of the data set derived
head(merged_long_ordered)
## # A tibble: 6 × 84
## form cic_type care_home primary_cat region service_group publication_date
## <ord> <chr> <chr> <chr> <chr> <chr> <dttm>
## 1 FPO NA Y Residential… East … Overall 2020-03-03 00:00:00
## 2 FPO NA Y Residential… East … Overall 2020-03-03 00:00:00
## 3 FPO NA Y Residential… East … Overall 2020-03-03 00:00:00
## 4 FPO NA Y Residential… East … Overall 2020-03-03 00:00:00
## 5 FPO NA Y Residential… East … Overall 2020-03-03 00:00:00
## 6 FPO NA Y Residential… East … Overall 2020-03-03 00:00:00
## # ℹ 77 more variables: inherited <lgl>, establishmentid <dbl>,
## # ParentSubSingle <dbl>, isparent <dbl>, estabcreateddate <chr>,
## # Starters_Leavers_FILTER <dbl>, totalstarters <dbl>, totalleavers <dbl>,
## # totalstaff <dbl>, totalvacancies <dbl>, ORGWRKS <dbl>, ORGWRKSGP <dbl>,
## # esttype <dbl>, mainstid <dbl>, MAINSERGP1 <dbl>, jr28flag <dbl>,
## # jr28perm <dbl>, jr28temp <dbl>, jr28pool <dbl>, jr28agcy <dbl>,
## # jr28oth <dbl>, jr28emp <dbl>, jr28work <dbl>, jr28strt <dbl>, …
table3 <- datasummary(staff_level * main_service_group *(Mean + SD) ~ form,
data = merged_long_ordered, fmt = 1, na.rm = T)
table3
| main_service_group | FPO | NPO | GOV | CIC | IND | ||
|---|---|---|---|---|---|---|---|
| staff_level | Adult domicilary | Mean | 112.6 | 229.9 | 215.8 | 137.6 | 71.5 |
| SD | 187.2 | 464.1 | 330.9 | 133.6 | 57.4 | ||
| Adult residential | Mean | 166.7 | 204.7 | 255.1 | 352.7 | 152.6 | |
| SD | 160.6 | 118.6 | 168.4 | 210.9 | 313.2 |
write.csv(merged_long_ordered, file = "data/cqc_sfc_cleaned.csv")