INSTALL PACKAGES AND LOAD DATASET TO ENVIRONMENT, DELETING ORIGINAL FILENAME AND RENAMING IT.

library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(broom)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(ggplot2)


CPSWaitingForAdoptionFY2014_2023 <- read_csv("CPSWaitingForAdoptionFY2014-2023.csv")
## Rows: 12158 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Region, Gender, Race/Ethnicity, Age Group
## dbl (3): Fiscal Year, Chidlren Waiting on Adoption 31 August, Average Months...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
waitadopt <- CPSWaitingForAdoptionFY2014_2023

rm(CPSWaitingForAdoptionFY2014_2023)

RENAME VARIABLES FOR EASY ANALYSIS

waitadopt$Ethnicity <- waitadopt$`Race/Ethnicity`
waitadopt$Agegroup <- waitadopt$`Age Group`
waitadopt$Avgmonths <- waitadopt$`Average Months since Termination of Parental Rights`

EXPAND DATASET FROM GROUPED NUMBER OF CHILDREN TO INDIVIDUAL CHILDREN FOR EACH OBSERVATION

expwaitadopt <- waitadopt %>%
  uncount(`Chidlren Waiting on Adoption 31 August`)

EXPLORE AVERAGE MONTHS SINCE TERMINATION OF PARENTAL RIGHTS VARIABLE BY GROUPING IT INTO ATTRIBUTES.

termparent <- expwaitadopt %>%
  group_by(Avgmonths) %>%
  summarise(count = n())

TRANSFORM GENDER VARIABLE BY CHANGING “UNKNOWN” TO “NA” AND RECODING IT FOR “FEMALE” AS “1”, AND “MALE” AS “0”.

expwaitadopt$Gender[expwaitadopt$Gender == 'Unknown'] <- NA

expwaitadopt <- expwaitadopt %>% mutate(Gendervar=ifelse(Gender=='Female',1,0))

RUN A LINEAR MODEL TO EXPLAIN DEPENDENT VARIABLE BY THREE INDEPENDENT VARIABLES

waitadopt_model <- lm(Avgmonths~Agegroup+Gendervar+Ethnicity, data = expwaitadopt)
summary(waitadopt_model)
## 
## Call:
## lm(formula = Avgmonths ~ Agegroup + Gendervar + Ethnicity, data = expwaitadopt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.996  -5.497  -0.838   3.845 137.862 
## 
## Coefficients:
##                               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                   43.70596    0.12568  347.754  < 2e-16 ***
## Agegroup6-12 Years Old       -19.86273    0.11284 -176.025  < 2e-16 ***
## AgegroupBirth to 5 Years Old -30.19234    0.11028 -273.771  < 2e-16 ***
## Gendervar                     -3.88663    0.08739  -44.472  < 2e-16 ***
## EthnicityAnglo                -6.65879    0.12435  -53.547  < 2e-16 ***
## EthnicityAsian                -6.95241    0.80777   -8.607  < 2e-16 ***
## EthnicityHispanic             -4.59830    0.11428  -40.236  < 2e-16 ***
## EthnicityNative American      -7.54141    1.34698   -5.599 2.17e-08 ***
## EthnicityOther                -4.18548    0.21236  -19.709  < 2e-16 ***
## EthnicityUnknown              -6.54164    0.47732  -13.705  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11 on 63628 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5573, Adjusted R-squared:  0.5573 
## F-statistic:  8901 on 9 and 63628 DF,  p-value: < 2.2e-16

ASSUMPTIONS FOR LINEAR REGRESSION: 1) LINEARITY a) Plot:

plot(waitadopt_model,which=1)

  1. Rainbow Test:
raintest(waitadopt_model)
## 
##  Rainbow test
## 
## data:  waitadopt_model
## Rain = 1.0186, df1 = 31819, df2 = 31809, p-value = 0.05028
  1. INDEPENDENCE OF ERRORS DURBIN-WATSON TEST:
durbinWatsonTest(waitadopt_model)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.6810428     0.6379019       0
##  Alternative hypothesis: rho != 0

RUN A ROBUST LINEAR MODEL TO MITIGATE FOR NON-INDEPENDENCE OF ERRORS:

waitadopt_robmod <- rlm(Avgmonths~Agegroup+Gendervar+Ethnicity, data = expwaitadopt)
summary(waitadopt_robmod)
## 
## Call: rlm(formula = Avgmonths ~ Agegroup + Gendervar + Ethnicity, data = expwaitadopt)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -39.1649  -4.3244  -0.2835   4.3565 141.1551 
## 
## Coefficients:
##                              Value     Std. Error t value  
## (Intercept)                    39.8749    0.0873   456.6220
## Agegroup6-12 Years Old        -19.2350    0.0784  -245.3306
## AgegroupBirth to 5 Years Old  -28.1467    0.0766  -367.3202
## Gendervar                      -2.6743    0.0607   -44.0414
## EthnicityAnglo                 -4.9788    0.0864   -57.6223
## EthnicityAsian                 -8.0437    0.5613   -14.3316
## EthnicityHispanic              -3.1713    0.0794   -39.9377
## EthnicityNative American       -4.8115    0.9359    -5.1410
## EthnicityOther                 -3.9537    0.1476   -26.7944
## EthnicityUnknown               -5.4141    0.3317   -16.3245
## 
## Residual standard error: 6.43 on 63628 degrees of freedom
##   (1 observation deleted due to missingness)

SECOND RESIDUALS VS FITTED CHART PLOTTING THE ROBUST LINEAR MODEL.

plot(waitadopt_robmod,which=1)

  1. HOMOSCEDASTICITY SCALE-LOCATION PLOT TO VISUALIZE HOMOSCEDASTICITY:
plot(waitadopt_robmod,which=3)

BREUSH-PAGAN TEST TO TEST FOR HOMOSCEDASTICITY:

bptest(waitadopt_robmod)
## 
##  studentized Breusch-Pagan test
## 
## data:  waitadopt_robmod
## BP = 5578.5, df = 9, p-value < 2.2e-16

SQUARE ROOT TRANSFORMATION:

waitadopt_robmod_sqrt <- rlm(sqrt(Avgmonths) ~ Agegroup + Gendervar + Ethnicity, data = expwaitadopt)
summary(waitadopt_robmod_sqrt)
## 
## Call: rlm(formula = sqrt(Avgmonths) ~ Agegroup + Gendervar + Ethnicity, 
##     data = expwaitadopt)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -6.168642 -0.603159 -0.005139  0.592455  7.267119 
## 
## Coefficients:
##                              Value     Std. Error t value  
## (Intercept)                     6.5032    0.0111   587.3120
## Agegroup6-12 Years Old         -1.9595    0.0099  -197.0995
## AgegroupBirth to 5 Years Old   -3.2979    0.0097  -339.4195
## Gendervar                      -0.3346    0.0077   -43.4570
## EthnicityAnglo                 -0.6539    0.0110   -59.6816
## EthnicityAsian                 -1.0811    0.0712   -15.1917
## EthnicityHispanic              -0.3963    0.0101   -39.3618
## EthnicityNative American       -0.6819    0.1187    -5.7458
## EthnicityOther                 -0.5514    0.0187   -29.4731
## EthnicityUnknown               -0.8533    0.0421   -20.2907
## 
## Residual standard error: 0.8857 on 63628 degrees of freedom
##   (1 observation deleted due to missingness)

TO VISUALIZE HOMOSCEDASTICITY AFTER SQUARE-ROOT TRANSFORMATION:

plot(waitadopt_robmod_sqrt, which=3)

  1. NORMALITY OF RESIDUALS Q-Q PLOT TO VISUALIZE NORMALITY OF RESIDUALS
plot(waitadopt_robmod_sqrt, which=2)

  1. MULTICOLLINEARITY VARIANCE INFLATION FACTOR CALCULATION ON MODEL
vif(waitadopt_robmod_sqrt)
##               GVIF Df GVIF^(1/(2*Df))
## Agegroup  1.006844  2        1.001707
## Gendervar 1.001005  1        1.000503
## Ethnicity 1.006547  6        1.000544

EXPLORING MODEL FOR INTERPRETATION: COMPLETE DATASET

summary(waitadopt_robmod_sqrt)
## 
## Call: rlm(formula = sqrt(Avgmonths) ~ Agegroup + Gendervar + Ethnicity, 
##     data = expwaitadopt)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -6.168642 -0.603159 -0.005139  0.592455  7.267119 
## 
## Coefficients:
##                              Value     Std. Error t value  
## (Intercept)                     6.5032    0.0111   587.3120
## Agegroup6-12 Years Old         -1.9595    0.0099  -197.0995
## AgegroupBirth to 5 Years Old   -3.2979    0.0097  -339.4195
## Gendervar                      -0.3346    0.0077   -43.4570
## EthnicityAnglo                 -0.6539    0.0110   -59.6816
## EthnicityAsian                 -1.0811    0.0712   -15.1917
## EthnicityHispanic              -0.3963    0.0101   -39.3618
## EthnicityNative American       -0.6819    0.1187    -5.7458
## EthnicityOther                 -0.5514    0.0187   -29.4731
## EthnicityUnknown               -0.8533    0.0421   -20.2907
## 
## Residual standard error: 0.8857 on 63628 degrees of freedom
##   (1 observation deleted due to missingness)

CREATING ALTERNATIVE MODELS WITH FEWER VARIABLES MODEL EXPLAINED BY AGE

agesqrtrlm <- rlm(sqrt(Avgmonths) ~ Agegroup, data = expwaitadopt)
summary(agesqrtrlm)
## 
## Call: rlm(formula = sqrt(Avgmonths) ~ Agegroup, data = expwaitadopt)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5.93632 -0.59229 -0.01932  0.60203  7.51842 
## 
## Coefficients:
##                              Value     Std. Error t value  
## (Intercept)                     5.9363    0.0078   763.4419
## Agegroup6-12 Years Old         -1.9471    0.0103  -188.9802
## AgegroupBirth to 5 Years Old   -3.2898    0.0101  -327.0692
## 
## Residual standard error: 0.8827 on 63636 degrees of freedom

MODEL EXPLAINED BY GENDER

gendersqrtrlm <- rlm(sqrt(Avgmonths) ~ Gendervar, data = expwaitadopt)
summary(gendersqrtrlm)
## 
## Call: rlm(formula = sqrt(Avgmonths) ~ Gendervar, data = expwaitadopt)
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0542 -1.1940 -0.3186  1.2523  9.4006 
## 
## Coefficients:
##             Value    Std. Error t value 
## (Intercept)   4.0542   0.0092   439.1225
## Gendervar    -0.3165   0.0135   -23.5196
## 
## Residual standard error: 1.8 on 63636 degrees of freedom
##   (1 observation deleted due to missingness)

GRAPH PLOTTING TIME BY AGE AND GENDER

ggplot(expwaitadopt, aes(x = Agegroup, y = Avgmonths, color = Gender)) +
  geom_col() +
  labs(title = "Waiting for adoption by age and gender", x = "Age Group", y = "Average months waiting for adoption", fill = "Gender") +
  geom_smooth(method = "rlm", se = TRUE) +
  scale_color_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

MODEL EXPLAINED BY ETHNICITY

ethnsqrtrlm <- rlm(sqrt(Avgmonths) ~ Ethnicity, data = expwaitadopt)
summary(ethnsqrtrlm)
## 
## Call: rlm(formula = sqrt(Avgmonths) ~ Ethnicity, data = expwaitadopt)
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2784 -1.1784 -0.3157  1.2553  9.7315 
## 
## Coefficients:
##                          Value    Std. Error t value 
## (Intercept)                4.2784   0.0142   301.7726
## EthnicityAnglo            -0.5738   0.0189   -30.3448
## EthnicityAsian            -0.7119   0.1228    -5.7960
## EthnicityHispanic         -0.3665   0.0174   -21.0875
## EthnicityNative American  -0.4422   0.2049    -2.1586
## EthnicityOther            -0.7910   0.0323   -24.4966
## EthnicityUnknown          -1.5871   0.0725   -21.8997
## 
## Residual standard error: 1.772 on 63632 degrees of freedom

GRAPH PLOTTING TIME BY ETHNICITY

ggplot(expwaitadopt, aes(x = Ethnicity, y = Avgmonths, color = Gender)) +
  geom_col() +
  labs(title = "Waiting for adoption by ethnicity and gender", x = "Ethnicity", y = "Average months waiting for adoption") +
  geom_smooth(method = "rlm", se = TRUE) +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

GRAPH PLOTTING TIME BY ETHNICITY AND

ggplot(expwaitadopt, aes(x = Ethnicity, y = Avgmonths, color = `Age Group`)) +
  geom_col() +
  labs(title = "Waiting for adoption by ethnicity and age group", x = "Ethnicity", y = "Average months waiting for adoption") +
  geom_smooth(method = "rlm", se = TRUE) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'