INSTALL PACKAGES AND LOAD DATASET TO ENVIRONMENT, DELETING ORIGINAL FILENAME AND RENAMING IT.
library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2)
CPSWaitingForAdoptionFY2014_2023 <- read_csv("CPSWaitingForAdoptionFY2014-2023.csv")
## Rows: 12158 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Region, Gender, Race/Ethnicity, Age Group
## dbl (3): Fiscal Year, Chidlren Waiting on Adoption 31 August, Average Months...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
waitadopt <- CPSWaitingForAdoptionFY2014_2023
rm(CPSWaitingForAdoptionFY2014_2023)
RENAME VARIABLES FOR EASY ANALYSIS
waitadopt$Ethnicity <- waitadopt$`Race/Ethnicity`
waitadopt$Agegroup <- waitadopt$`Age Group`
waitadopt$Avgmonths <- waitadopt$`Average Months since Termination of Parental Rights`
EXPAND DATASET FROM GROUPED NUMBER OF CHILDREN TO INDIVIDUAL CHILDREN FOR EACH OBSERVATION
expwaitadopt <- waitadopt %>%
uncount(`Chidlren Waiting on Adoption 31 August`)
EXPLORE AVERAGE MONTHS SINCE TERMINATION OF PARENTAL RIGHTS VARIABLE BY GROUPING IT INTO ATTRIBUTES.
termparent <- expwaitadopt %>%
group_by(Avgmonths) %>%
summarise(count = n())
TRANSFORM GENDER VARIABLE BY CHANGING “UNKNOWN” TO “NA” AND RECODING IT FOR “FEMALE” AS “1”, AND “MALE” AS “0”.
expwaitadopt$Gender[expwaitadopt$Gender == 'Unknown'] <- NA
expwaitadopt <- expwaitadopt %>% mutate(Gendervar=ifelse(Gender=='Female',1,0))
RUN A LINEAR MODEL TO EXPLAIN DEPENDENT VARIABLE BY THREE INDEPENDENT VARIABLES
waitadopt_model <- lm(Avgmonths~Agegroup+Gendervar+Ethnicity, data = expwaitadopt)
summary(waitadopt_model)
##
## Call:
## lm(formula = Avgmonths ~ Agegroup + Gendervar + Ethnicity, data = expwaitadopt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.996 -5.497 -0.838 3.845 137.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.70596 0.12568 347.754 < 2e-16 ***
## Agegroup6-12 Years Old -19.86273 0.11284 -176.025 < 2e-16 ***
## AgegroupBirth to 5 Years Old -30.19234 0.11028 -273.771 < 2e-16 ***
## Gendervar -3.88663 0.08739 -44.472 < 2e-16 ***
## EthnicityAnglo -6.65879 0.12435 -53.547 < 2e-16 ***
## EthnicityAsian -6.95241 0.80777 -8.607 < 2e-16 ***
## EthnicityHispanic -4.59830 0.11428 -40.236 < 2e-16 ***
## EthnicityNative American -7.54141 1.34698 -5.599 2.17e-08 ***
## EthnicityOther -4.18548 0.21236 -19.709 < 2e-16 ***
## EthnicityUnknown -6.54164 0.47732 -13.705 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11 on 63628 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.5573, Adjusted R-squared: 0.5573
## F-statistic: 8901 on 9 and 63628 DF, p-value: < 2.2e-16
ASSUMPTIONS FOR LINEAR REGRESSION: 1) LINEARITY a) Plot:
plot(waitadopt_model,which=1)
raintest(waitadopt_model)
##
## Rainbow test
##
## data: waitadopt_model
## Rain = 1.0186, df1 = 31819, df2 = 31809, p-value = 0.05028
durbinWatsonTest(waitadopt_model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.6810428 0.6379019 0
## Alternative hypothesis: rho != 0
RUN A ROBUST LINEAR MODEL TO MITIGATE FOR NON-INDEPENDENCE OF ERRORS:
waitadopt_robmod <- rlm(Avgmonths~Agegroup+Gendervar+Ethnicity, data = expwaitadopt)
summary(waitadopt_robmod)
##
## Call: rlm(formula = Avgmonths ~ Agegroup + Gendervar + Ethnicity, data = expwaitadopt)
## Residuals:
## Min 1Q Median 3Q Max
## -39.1649 -4.3244 -0.2835 4.3565 141.1551
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 39.8749 0.0873 456.6220
## Agegroup6-12 Years Old -19.2350 0.0784 -245.3306
## AgegroupBirth to 5 Years Old -28.1467 0.0766 -367.3202
## Gendervar -2.6743 0.0607 -44.0414
## EthnicityAnglo -4.9788 0.0864 -57.6223
## EthnicityAsian -8.0437 0.5613 -14.3316
## EthnicityHispanic -3.1713 0.0794 -39.9377
## EthnicityNative American -4.8115 0.9359 -5.1410
## EthnicityOther -3.9537 0.1476 -26.7944
## EthnicityUnknown -5.4141 0.3317 -16.3245
##
## Residual standard error: 6.43 on 63628 degrees of freedom
## (1 observation deleted due to missingness)
SECOND RESIDUALS VS FITTED CHART PLOTTING THE ROBUST LINEAR MODEL.
plot(waitadopt_robmod,which=1)
plot(waitadopt_robmod,which=3)
BREUSH-PAGAN TEST TO TEST FOR HOMOSCEDASTICITY:
bptest(waitadopt_robmod)
##
## studentized Breusch-Pagan test
##
## data: waitadopt_robmod
## BP = 5578.5, df = 9, p-value < 2.2e-16
SQUARE ROOT TRANSFORMATION:
waitadopt_robmod_sqrt <- rlm(sqrt(Avgmonths) ~ Agegroup + Gendervar + Ethnicity, data = expwaitadopt)
summary(waitadopt_robmod_sqrt)
##
## Call: rlm(formula = sqrt(Avgmonths) ~ Agegroup + Gendervar + Ethnicity,
## data = expwaitadopt)
## Residuals:
## Min 1Q Median 3Q Max
## -6.168642 -0.603159 -0.005139 0.592455 7.267119
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 6.5032 0.0111 587.3120
## Agegroup6-12 Years Old -1.9595 0.0099 -197.0995
## AgegroupBirth to 5 Years Old -3.2979 0.0097 -339.4195
## Gendervar -0.3346 0.0077 -43.4570
## EthnicityAnglo -0.6539 0.0110 -59.6816
## EthnicityAsian -1.0811 0.0712 -15.1917
## EthnicityHispanic -0.3963 0.0101 -39.3618
## EthnicityNative American -0.6819 0.1187 -5.7458
## EthnicityOther -0.5514 0.0187 -29.4731
## EthnicityUnknown -0.8533 0.0421 -20.2907
##
## Residual standard error: 0.8857 on 63628 degrees of freedom
## (1 observation deleted due to missingness)
TO VISUALIZE HOMOSCEDASTICITY AFTER SQUARE-ROOT TRANSFORMATION:
plot(waitadopt_robmod_sqrt, which=3)
plot(waitadopt_robmod_sqrt, which=2)
vif(waitadopt_robmod_sqrt)
## GVIF Df GVIF^(1/(2*Df))
## Agegroup 1.006844 2 1.001707
## Gendervar 1.001005 1 1.000503
## Ethnicity 1.006547 6 1.000544
EXPLORING MODEL FOR INTERPRETATION: COMPLETE DATASET
summary(waitadopt_robmod_sqrt)
##
## Call: rlm(formula = sqrt(Avgmonths) ~ Agegroup + Gendervar + Ethnicity,
## data = expwaitadopt)
## Residuals:
## Min 1Q Median 3Q Max
## -6.168642 -0.603159 -0.005139 0.592455 7.267119
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 6.5032 0.0111 587.3120
## Agegroup6-12 Years Old -1.9595 0.0099 -197.0995
## AgegroupBirth to 5 Years Old -3.2979 0.0097 -339.4195
## Gendervar -0.3346 0.0077 -43.4570
## EthnicityAnglo -0.6539 0.0110 -59.6816
## EthnicityAsian -1.0811 0.0712 -15.1917
## EthnicityHispanic -0.3963 0.0101 -39.3618
## EthnicityNative American -0.6819 0.1187 -5.7458
## EthnicityOther -0.5514 0.0187 -29.4731
## EthnicityUnknown -0.8533 0.0421 -20.2907
##
## Residual standard error: 0.8857 on 63628 degrees of freedom
## (1 observation deleted due to missingness)
CREATING ALTERNATIVE MODELS WITH FEWER VARIABLES MODEL EXPLAINED BY AGE
agesqrtrlm <- rlm(sqrt(Avgmonths) ~ Agegroup, data = expwaitadopt)
summary(agesqrtrlm)
##
## Call: rlm(formula = sqrt(Avgmonths) ~ Agegroup, data = expwaitadopt)
## Residuals:
## Min 1Q Median 3Q Max
## -5.93632 -0.59229 -0.01932 0.60203 7.51842
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 5.9363 0.0078 763.4419
## Agegroup6-12 Years Old -1.9471 0.0103 -188.9802
## AgegroupBirth to 5 Years Old -3.2898 0.0101 -327.0692
##
## Residual standard error: 0.8827 on 63636 degrees of freedom
MODEL EXPLAINED BY GENDER
gendersqrtrlm <- rlm(sqrt(Avgmonths) ~ Gendervar, data = expwaitadopt)
summary(gendersqrtrlm)
##
## Call: rlm(formula = sqrt(Avgmonths) ~ Gendervar, data = expwaitadopt)
## Residuals:
## Min 1Q Median 3Q Max
## -4.0542 -1.1940 -0.3186 1.2523 9.4006
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 4.0542 0.0092 439.1225
## Gendervar -0.3165 0.0135 -23.5196
##
## Residual standard error: 1.8 on 63636 degrees of freedom
## (1 observation deleted due to missingness)
GRAPH PLOTTING TIME BY AGE AND GENDER
ggplot(expwaitadopt, aes(x = Agegroup, y = Avgmonths, color = Gender)) +
geom_col() +
labs(title = "Waiting for adoption by age and gender", x = "Age Group", y = "Average months waiting for adoption", fill = "Gender") +
geom_smooth(method = "rlm", se = TRUE) +
scale_color_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
MODEL EXPLAINED BY ETHNICITY
ethnsqrtrlm <- rlm(sqrt(Avgmonths) ~ Ethnicity, data = expwaitadopt)
summary(ethnsqrtrlm)
##
## Call: rlm(formula = sqrt(Avgmonths) ~ Ethnicity, data = expwaitadopt)
## Residuals:
## Min 1Q Median 3Q Max
## -4.2784 -1.1784 -0.3157 1.2553 9.7315
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 4.2784 0.0142 301.7726
## EthnicityAnglo -0.5738 0.0189 -30.3448
## EthnicityAsian -0.7119 0.1228 -5.7960
## EthnicityHispanic -0.3665 0.0174 -21.0875
## EthnicityNative American -0.4422 0.2049 -2.1586
## EthnicityOther -0.7910 0.0323 -24.4966
## EthnicityUnknown -1.5871 0.0725 -21.8997
##
## Residual standard error: 1.772 on 63632 degrees of freedom
GRAPH PLOTTING TIME BY ETHNICITY
ggplot(expwaitadopt, aes(x = Ethnicity, y = Avgmonths, color = Gender)) +
geom_col() +
labs(title = "Waiting for adoption by ethnicity and gender", x = "Ethnicity", y = "Average months waiting for adoption") +
geom_smooth(method = "rlm", se = TRUE) +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
GRAPH PLOTTING TIME BY ETHNICITY AND
ggplot(expwaitadopt, aes(x = Ethnicity, y = Avgmonths, color = `Age Group`)) +
geom_col() +
labs(title = "Waiting for adoption by ethnicity and age group", x = "Ethnicity", y = "Average months waiting for adoption") +
geom_smooth(method = "rlm", se = TRUE) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'