data-analysis

Introduction

The current version re-run the codes of the data analysis.

# environment setup to run ordered logit properly
options(contrasts = rep("contr.treatment", 2))

Load packages

library(tidyverse) # package for data cleaning and plotting
library(readxl)
library(modelsummary)
library(ordinal) # package for ordinal logit regression
library(brant) # brant test for the parallel assumption for ordered logit
library(MASS) # models that work with the brant test
library(broom) # extracting model summary as data frame
library(modelsummary) # deriving model tables
library(scales) # label percent
library(lubridate) # working with dates
library(marginaleffects) #to calculate marginal effects
library(gt) # to format tables
library(here) # work with directory
set.seed(5432)

Merging CQC and financial data

# import location level full data
rating<- read_csv(here("cleaned_data","cic_all_ratings_2019.csv"))
finance <- read_csv(here("cleaned_data","cls_finance.csv"))

finance1 <- finance %>% 
  mutate(id_digit = as.numeric(str_extract(project_id, "\\d+"))) %>%
  arrange(id_digit)

# checking the largest number for the project_id in the finance data set 
summary(finance1$id_digit)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   228.0   460.0   610.1   913.2  5181.0

#merging the data
cic2019 <- rating %>% 
  left_join(finance1, by = "project_id")

data cleanning

#select relevant columns, rename and relabel 
cic_cleaned <- cic2019 %>% 
  # recode legal form types to be more readable / easier to present
  mutate(# inherited = ifelse(inherited == "Y", TRUE, FALSE),
         rating = recode(rating, 
                         "Insufficient evidence to rate" = "NA",
                         "Requires improvement" = "Req improv"),
         date = ymd(publication_date)) %>% 
  
  # assign order in the rating levels
  mutate(rating = ordered(rating, levels = c("Inadequate","Req improv", "Good", "Outstanding")),
         social_care = ifelse(type == "Social Care Org", "social care", "healthcare")) %>% 
  
  
  # creating a new dummy variable for facility category
  mutate(founded = as.numeric(founded),
         year = year(date),
         age = year - founded, 
         Year = factor(year)) %>% 
  mutate(cls = ifelse(CLS == 1, "CLS", "CLG"),
         totalequity = as.numeric(totalequity),
         totalequity_std = scale(totalequity, center = TRUE, scale = TRUE)) %>% 
  
   mutate(share_size = case_when(
    totalissueshares <= 20 ~ "small",
    totalissueshares >= 100000 ~ "large",
    TRUE ~ NA_character_
  ))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `founded = as.numeric(founded)`.
## Caused by warning:
## ! NAs introduced by coercion

Check the distribution of the data for share_size

cic_cleaned %>% 
  count(share_size)

## # A tibble: 3 × 2
##   share_size     n
##   <chr>      <int>
## 1 large         23
## 2 small        464
## 3 <NA>        1409

Total counts in full data set

count_full <- cic_cleaned %>% 
  mutate(overall = ifelse(domain == "Overall", 1, 0)) %>% 
  summarize(count_provider = n_distinct(provider_name),
            count_location = n_distinct(location_name)-1,
            count_overall_rating = sum(overall),
            count_rating = n()) 

count_full

## # A tibble: 1 × 4
##   count_provider count_location count_overall_rating count_rating
##            <int>          <dbl>                <dbl>        <int>
## 1            103            170                  551         1896

count_by_form <- cic_cleaned %>% 
  mutate(overall = ifelse(domain == "Overall", 1, 0)) %>% 
  group_by(cls) %>% 
  summarize(count_provider = n_distinct(provider_name),
            count_location = n_distinct(location_name)-1,
            count_overall_rating = sum(overall),
            count_rating = n()) 

count_by_form

## # A tibble: 3 × 5
##   cls   count_provider count_location count_overall_rating count_rating
##   <chr>          <int>          <dbl>                <dbl>        <int>
## 1 CLG               47             56                  104          527
## 2 CLS               56            114                  447         1354
## 3 <NA>               3              2                    0           15

count_by_level_form <- cic_cleaned %>% 
  mutate(overall = ifelse(domain == "Overall", 1, 0)) %>% 
  group_by(cls, level) %>% 
  summarize(count_provider = n_distinct(provider_name),
            count_location = n_distinct(location_name), 
 # the count_location for provider should be manually adjusted to 0
            count_overall_rating = sum(overall),
            count_rating = n())

## `summarise()` has grouped output by 'cls'. You can override using the `.groups`
## argument.

count_by_level_form

## # A tibble: 5 × 6
## # Groups:   cls [3]
##   cls   level    count_provider count_location count_overall_rating count_rating
##   <chr> <chr>             <int>          <int>                <dbl>        <int>
## 1 CLG   location             46             56                   75          353
## 2 CLG   provider              3              1                   29          174
## 3 CLS   location             51            114                  395         1042
## 4 CLS   provider             10              1                   52          312
## 5 <NA>  location              3              3                    0           15

count_by_level <- cic_cleaned %>% 
  mutate(overall = ifelse(domain == "Overall", 1, 0)) %>% 
  group_by(level) %>% 
  summarize(count_provider = n_distinct(provider_name),
            count_location = n_distinct(location_name), 
 # the count_location for provider should be manually adjusted to 0
            count_overall_rating = sum(overall),
            count_rating = n()) 

count_by_level

## # A tibble: 2 × 5
##   level    count_provider count_location count_overall_rating count_rating
##   <chr>             <int>          <int>                <dbl>        <int>
## 1 location             97            170                  470         1410
## 2 provider             13              1                   81          486

count_by_level_type <- cic_cleaned %>% 
  mutate(overall = ifelse(domain == "Overall", 1, 0)) %>% 
  group_by(social_care, level) %>% 
  summarize(count_provider = n_distinct(provider_name),
            count_location = n_distinct(location_name), 
 # the count_location for provider should be manually adjusted to 0
            count_overall_rating = sum(overall),
            count_rating = n())

## `summarise()` has grouped output by 'social_care'. You can override using the
## `.groups` argument.

count_by_level_type

## # A tibble: 3 × 6
## # Groups:   social_care [2]
##   social_care level    count_provider count_location count_overall_rating
##   <chr>       <chr>             <int>          <int>                <dbl>
## 1 healthcare  location             38             82                  374
## 2 healthcare  provider             13              1                   81
## 3 social care location             61             88                   96
## # ℹ 1 more variable: count_rating <int>

datasummary_crosstab(
  formula = cls ~ rating,
  data = cic_cleaned
)

tinytable_teri27jmghitw0ljwdgq

cls		Inadequate	Req improv	Good	Outstanding	All
CLG	N	13	50	412	52	527
	% row	2.5	9.5	78.2	9.9	100.0
CLS	N	44	98	1078	134	1354
	% row	3.2	7.2	79.6	9.9	100.0
All	N	57	151	1502	186	1896
	% row	3.0	8.0	79.2	9.8	100.0

datasummary_crosstab(
  formula = cls * spinout ~ rating,
  data = cic_cleaned
)

tinytable_d30yies06ylidy0es72y

cls	spinout		Inadequate	Req improv	Good	Outstanding	All
CLG	0	N	10	27	254	32	323
		% row	3.1	8.4	78.6	9.9	100.0
	1	N	3	23	158	20	204
		% row	1.5	11.3	77.5	9.8	100.0
CLS	0	N	25	46	643	92	806
		% row	3.1	5.7	79.8	11.4	100.0
	1	N	19	52	435	42	548
		% row	3.5	9.5	79.4	7.7	100.0
	All	N	57	151	1502	186	1896
		% row	3.0	8.0	79.2	9.8	100.0

regression analysis

models without equity variable

model_order_overall <- clm(rating ~ cls  + spinout + social_care + age + dissolved,
                data = filter(cic_cleaned, domain == "Overall"),
                link = "logit")

model_order_safe <- clm(rating ~ cls  + spinout + social_care + age + dissolved,
                data = filter(cic_cleaned, domain == "Safe"),
                link = "logit")
model_order_effective <- clm(rating ~ cls  + spinout + social_care + age + dissolved,
                data = filter(cic_cleaned, domain == "Effective"),
                link = "logit")
model_order_caring <- clm(rating ~ cls  + spinout + social_care + age + dissolved,
                data = filter(cic_cleaned, domain == "Caring"),
                link = "logit")
model_order_well_led <- clm(rating ~ cls  + spinout + social_care + age + dissolved,
                data = filter(cic_cleaned, domain == "Well-led"),
                link = "logit")
model_order_responsive <- clm(rating ~ cls  + spinout + social_care + age + dissolved,
                data = filter(cic_cleaned, domain == "Responsive"),
                link = "logit")

ordinal_models <-
  modelsummary(
    list(
      "overall" = model_order_overall,
      "safe" = model_order_safe,
      "effective" = model_order_effective,
      "caring" = model_order_caring,
      "well-led" = model_order_well_led,
      "responsive" = model_order_responsive
    ),
    coef_omit = "region",
    exponentiate = F,
    statistic = "({p.value}) {stars}")
ordinal_models

tinytable_k502zsxz8pkka343ympg

	overall	safe	effective	caring	well-led	responsive
Inadequate\|Req improv	-3.485	-3.363	-3.775		-3.510
	(<0.001) ***	(<0.001) ***	(<0.001) ***		(<0.001) ***
Req improv\|Good	-2.496	-1.536	-2.316	-5.337	-2.021	-3.910
	(<0.001) ***	(0.003) **	(<0.001) ***	(<0.001) ***	(<0.001) ***	(<0.001) ***
Good\|Outstanding	1.435	4.103	2.803	0.674	1.885	1.485
	(<0.001) ***	(<0.001) ***	(<0.001) ***	(0.202)	(<0.001) ***	(0.007) **
clsCLS	0.483	0.564	0.273	-0.329	-0.056	-0.135
	(0.072) +	(0.102)	(0.451)	(0.364)	(0.857)	(0.730)
spinout	-0.386	-0.305	-0.446	-0.189	-0.325	-0.681
	(0.064) +	(0.370)	(0.214)	(0.588)	(0.277)	(0.082) +
social_caresocial care	0.096	0.947	0.153	-0.817	0.043	-0.093
	(0.716)	(0.016) *	(0.677)	(0.040) *	(0.888)	(0.810)
age	-0.119	-0.045	-0.050	-0.070	-0.023	-0.064
	(<0.001) ***	(0.354)	(0.275)	(0.207)	(0.587)	(0.247)
dissolved	0.212	-0.972	0.370	0.129	0.740	0.439
	(0.607)	(0.032) *	(0.530)	(0.817)	(0.131)	(0.458)
Num.Obs.	540	261	261	261	261	261
AIC	906.6	340.0	332.5	274.7	445.5	269.3
BIC	940.9	368.5	361.0	299.6	474.1	294.3
RMSE	2.45	2.17	2.23	1.55	2.42	1.40

models with equity variables

Due to the large range/dispersion of the fiancial data. I standardize the totalequity variable to enable the models to run.

summary(cic_cleaned$totalequity)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   -23436    53934   653284  2892894  3613364 36881694       57

eq_order_overall <- clm(rating ~ cls  + spinout + social_care + age + dissolved + totalequity_std,
                data = filter(cic_cleaned, domain == "Overall"),
                link = "logit")

eq_order_safe <- clm(rating ~ cls  + spinout + social_care + age + dissolved + totalequity_std,
                data = filter(cic_cleaned, domain == "Safe"),
                link = "logit")
eq_order_effective <- clm(rating ~ cls  + spinout + social_care + age + dissolved + totalequity_std,
                data = filter(cic_cleaned, domain == "Effective"),
                link = "logit")
eq_order_caring <- clm(rating ~ cls  + spinout + social_care + age + dissolved + totalequity_std,
                data = filter(cic_cleaned, domain == "Caring"),
                link = "logit")
eq_order_well_led <- clm(rating ~ cls  + spinout + social_care + age + dissolved + totalequity_std,
                data = filter(cic_cleaned, domain == "Well-led"),
                link = "logit")
eq_order_responsive <- clm(rating ~ cls  + spinout + social_care + age + dissolved + totalequity_std,
                data = filter(cic_cleaned, domain == "Responsive"),
                link = "logit")

eq_models <-
  modelsummary(
    list(
      "overall" = eq_order_overall,
      "safe" = eq_order_safe,
      "effective" = eq_order_effective,
      "caring" = eq_order_caring,
      "well-led" = eq_order_well_led,
      "responsive" = eq_order_responsive
    ),
    coef_omit = "region",
    exponentiate = F,
    statistic = "({p.value}) {stars}")
eq_models

tinytable_n48qs45djadyft9jwtr4

	overall	safe	effective	caring	well-led	responsive
Inadequate\|Req improv	-3.632	-3.792	-3.868		-3.763
	(<0.001) ***	(<0.001) ***	(<0.001) ***		(<0.001) ***
Req improv\|Good	-2.670	-1.992	-2.404	-5.353	-2.328	-4.106
	(<0.001) ***	(<0.001) ***	(<0.001) ***	(<0.001) ***	(<0.001) ***	(<0.001) ***
Good\|Outstanding	1.308	3.868	2.713	0.651	1.652	1.316
	(<0.001) ***	(<0.001) ***	(<0.001) ***	(0.232)	(<0.001) ***	(0.019) *
clsCLS	0.495	0.611	0.246	-0.311	-0.028	-0.151
	(0.066) +	(0.083) +	(0.496)	(0.392)	(0.929)	(0.698)
spinout	-0.507	-0.682	-0.545	-0.191	-0.530	-0.858
	(0.026) *	(0.066) +	(0.150)	(0.602)	(0.099) +	(0.041) *
social_caresocial care	0.081	0.960	0.184	-0.842	-0.004	-0.074
	(0.760)	(0.017) *	(0.619)	(0.035) *	(0.989)	(0.849)
age	-0.132	-0.078	-0.057	-0.072	-0.047	-0.079
	(<0.001) ***	(0.094) +	(0.219)	(0.204)	(0.294)	(0.154)
dissolved	0.240	-0.857	0.449	0.104	0.812	0.541
	(0.566)	(0.066) +	(0.450)	(0.853)	(0.104)	(0.364)
totalequity_std	0.092	0.446	0.163	-0.038	0.174	0.238
	(0.356)	(0.027) *	(0.382)	(0.851)	(0.274)	(0.219)
Num.Obs.	538	259	259	259	259	259
AIC	895.3	327.6	333.0	275.6	436.8	269.2
BIC	933.8	359.6	365.1	304.1	468.8	297.7
RMSE	2.45	2.16	2.23	1.55	2.41	1.40

regression analysis comparing CIC with big vs. small shares

Check the distribution of the data for share_size

cic_cleaned %>% 
  count(share_size)

## # A tibble: 3 × 2
##   share_size     n
##   <chr>      <int>
## 1 large         23
## 2 small        464
## 3 <NA>        1409

Only CLS has large vs. small value as expected

# Crosstab between `cls` and `share_size`
datasummary_crosstab(cls ~ share_size, data = cic_cleaned)

tinytable_49luib1np3wiy4e4ku8n

cls		large	small	All
CLG	N	0	0	527
	% row	0.0	0.0	100.0
CLS	N	23	464	1354
	% row	1.7	34.3	100.0
All	N	23	464	1896
	% row	1.2	24.5	100.0

Spint out can perfectly predict whether one is small or not, so droping that in the model

# Crosstab between `spinout` and `social_care`
datasummary_crosstab(spinout ~ share_size, data = cic_cleaned)

tinytable_rhb0nohc18bz8kowqwbk

spinout		large	small	All
0	N	23	390	1144
	% row	2.0	34.1	100.0
1	N	0	74	752
	% row	0.0	9.8	100.0
All	N	23	464	1896
	% row	1.2	24.5	100.0

healthcare perfectly predicts share_size as well

# Crosstab between `dissolved` and `share_size`
datasummary_crosstab(healthcare ~ share_size, data = cic_cleaned)

tinytable_uirc000rjp2pxqdab8m8

healthcare		large	small	All
0	N	0	138	594
	% row	0.0	23.2	100.0
1	N	23	326	1257
	% row	1.8	25.9	100.0
999	N	0	0	24
	% row	0.0	0.0	100.0
All	N	23	464	1896
	% row	1.2	24.5	100.0

To sum, all large share CICs are cls, non-spinout and healthcare category

models without equity variable

model_order_overall_size <- clm(rating ~  share_size + age,
                data = filter(cic_cleaned, domain == "Overall"),
                link = "logit")

model_order_safe_size <- clm(rating ~  share_size + age ,
                data = filter(cic_cleaned, domain == "Safe"),
                link = "logit")
model_order_effective_size <- clm(rating ~  share_size + age ,
                data = filter(cic_cleaned, domain == "Effective"),
                link = "logit")
model_order_caring_size <- clm(rating ~  share_size + age ,
                data = filter(cic_cleaned, domain == "Caring"),
                link = "logit")
model_order_well_led_size <- clm(rating ~  share_size + age ,
                data = filter(cic_cleaned, domain == "Well-led"),
                link = "logit")
model_order_responsive_size <- clm(rating ~  share_size + age ,
                data = filter(cic_cleaned, domain == "Responsive"),
                link = "logit")

ordinal_models_size <-
  modelsummary(
    list(
      "overall" = model_order_overall_size,
      "safe" = model_order_safe_size,
      "effective" = model_order_effective_size,
      "caring" = model_order_caring_size,
      "well-led" = model_order_well_led_size,
      "responsive" = model_order_responsive_size
    ),
    coef_omit = "region",
    exponentiate = F,
    statistic = "({p.value}) {stars}")
ordinal_models_size

tinytable_qe86kq61p3dzox81e50l

	overall	safe	effective	caring	well-led	responsive
Inadequate\|Req improv	-5.006		-5.618		-2.414
	(<0.001) ***		(<0.001) ***		(0.100)
Req improv\|Good	-3.639	-0.066	-3.723		-0.100	-4.167
	(<0.001) ***	(0.963)	(0.005) **		(0.937)	(0.003) **
Good\|Outstanding	-0.242	4.229	0.453	0.922	3.311	0.870
	(0.765)	(0.006) **	(0.704)	(0.461)	(0.014) *	(0.477)
share_sizesmall	-1.325	1.898	-1.285	-1.196	1.560	-1.355
	(0.111)	(0.201)	(0.309)	(0.368)	(0.238)	(0.298)
age	-0.043	-0.023	-0.040	0.057	0.029	0.054
	(0.365)	(0.715)	(0.523)	(0.363)	(0.614)	(0.393)
Num.Obs.	207	56	56	56	56	56
AIC	384.6	83.2	92.4	57.4	109.3	72.6
BIC	401.2	91.3	102.5	63.4	119.4	80.7
RMSE	2.62	1.36	2.45	0.78	2.58	1.57

data-analysis

Meng Ye

2024-08-24

Introduction

Load packages

Merging CQC and financial data

data cleanning

regression analysis

models without equity variable

models with equity variables

regression analysis comparing CIC with big vs. small shares

models without equity variable