library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.0.10 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(testit)
library(tigris)
## Warning: package 'tigris' was built under R version 4.2.3
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(stringr)
library(httr)
## Warning: package 'httr' was built under R version 4.2.3
library(here)
## here() starts at C:/Users/Mahmuda Sultana/Desktop/CTDATA/CTData
library(ggplot2)
library(dplyr)
# Load the technical-exercise survey extract.
# No column types are supplied, so read_csv() infers them from the file.
survey_csv <- "C:/Users/Mahmuda Sultana/Desktop/CTDATA/CTData_Research-Analyst_Technical-Exercise_DATA.CSV"
data <- read_csv(file = survey_csv)
## Rows: 1803 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): SMI
## dbl (6): CT.DEMO.005, CT.POLICY.001, CT.POLICY.002.a, CT.POLICY.003.a, CT.PO...
## lgl (1): CT.POLICY.006.a
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows as a quick check of the import
data %>%
  head(n = 6)
## # A tibble: 6 × 8
## SMI CT.DEMO.005 CT.POLICY.001 CT.POLICY.002.a CT.POLICY.003.a
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Above 85% of SMI 1 1 1 NA
## 2 Below 30% of SMI 2 1 1 1
## 3 Between 60-85% of S… 1 1 0 NA
## 4 Between 30-60% of S… 1 1 1 NA
## 5 Between 30-60% of S… 2 1 1 1
## 6 Below 30% of SMI 1 0 NA NA
## # ℹ 3 more variables: CT.POLICY.004.a <dbl>, CT.POLICY.005.a <dbl>,
## # CT.POLICY.006.a <lgl>
Descriptive analysis gives a first look at the data. The goal is to provide a simple summary of each variable, which helps in understanding patterns, trends, and the overall distribution of its values.
# SMI (income band): number of respondents in each band, including the
# NA band, and each band's percentage of all rows
smi_summary <- data %>%
  count(SMI, name = "n") %>%
  mutate(percentage = prop.table(n) * 100)
print(smi_summary)
## # A tibble: 5 × 3
## SMI n percentage
## <chr> <int> <dbl>
## 1 Above 85% of SMI 577 32.0
## 2 Below 30% of SMI 367 20.4
## 3 Between 30-60% of SMI 478 26.5
## 4 Between 60-85% of SMI 269 14.9
## 5 <NA> 112 6.21
# CT.DEMO.005 (number of children under 6): respondents per value,
# NA included, with each value's percentage of all rows
demo_summary <- data %>%
  count(CT.DEMO.005, name = "n") %>%
  mutate(percentage = prop.table(n) * 100)
print(demo_summary)
## # A tibble: 6 × 3
## CT.DEMO.005 n percentage
## <dbl> <int> <dbl>
## 1 1 1155 64.1
## 2 2 554 30.7
## 3 3 71 3.94
## 4 4 7 0.388
## 5 5 7 0.388
## 6 NA 9 0.499
# CT.POLICY.001 (looked for non-parental care for children under 6):
# respondents per answer, NA included, with percentage of all rows
policy001_summary <- data %>%
  count(CT.POLICY.001, name = "n") %>%
  mutate(percentage = prop.table(n) * 100)
print(policy001_summary)
## # A tibble: 3 × 3
## CT.POLICY.001 n percentage
## <dbl> <int> <dbl>
## 1 0 436 24.2
## 2 1 1326 73.5
## 3 NA 41 2.27
# CT.POLICY.002.a (used non-parental care for the oldest child under 6):
# respondents per answer, NA included, with percentage of all rows
policy002_summary <- data %>%
  count(CT.POLICY.002.a, name = "n") %>%
  mutate(percentage = prop.table(n) * 100)
print(policy002_summary)
## # A tibble: 3 × 3
## CT.POLICY.002.a n percentage
## <dbl> <int> <dbl>
## 1 0 221 12.3
## 2 1 1093 60.6
## 3 NA 489 27.1
# CT.POLICY.003.a (used non-parental care for the second child under 6):
# respondents per answer, NA included, with percentage of all rows
policy003_summary <- data %>%
  count(CT.POLICY.003.a, name = "n") %>%
  mutate(percentage = prop.table(n) * 100)
print(policy003_summary)
## # A tibble: 3 × 3
## CT.POLICY.003.a n percentage
## <dbl> <int> <dbl>
## 1 0 107 5.93
## 2 1 362 20.1
## 3 NA 1334 74.0
# CT.POLICY.004.a (used non-parental care for the third child under 6):
# respondents per answer, NA included, with percentage of all rows
policy004_summary <- data %>%
  count(CT.POLICY.004.a, name = "n") %>%
  mutate(percentage = prop.table(n) * 100)
print(policy004_summary)
## # A tibble: 3 × 3
## CT.POLICY.004.a n percentage
## <dbl> <int> <dbl>
## 1 0 16 0.887
## 2 1 42 2.33
## 3 NA 1745 96.8
# CT.POLICY.005.a (used non-parental care for the fourth child under 6):
# respondents per answer, NA included, with percentage of all rows
policy005_summary <- data %>%
  count(CT.POLICY.005.a, name = "n") %>%
  mutate(percentage = prop.table(n) * 100)
print(policy005_summary)
## # A tibble: 3 × 3
## CT.POLICY.005.a n percentage
## <dbl> <int> <dbl>
## 1 0 5 0.277
## 2 1 4 0.222
## 3 NA 1794 99.5
# CT.POLICY.006.a (used non-parental care for the fifth child under 6):
# respondents per answer, NA included, with percentage of all rows.
# In this extract the column holds no answers at all (every row is NA).
policy006_summary <- data %>%
  count(CT.POLICY.006.a, name = "n") %>%
  mutate(percentage = prop.table(n) * 100)
print(policy006_summary)
## # A tibble: 1 × 3
## CT.POLICY.006.a n percentage
## <lgl> <int> <dbl>
## 1 NA 1803 100
# Binary flag: is the respondent using non-parental care for ANY child?
#   1  = at least one of CT.POLICY.002.a ... CT.POLICY.006.a equals 1
#   0  = no child is reported in care, but at least one question was answered
#   NA = none of the five questions was answered
#
# Why not `ifelse(a == 1 | b == 1 | ..., 1, 0)`: in R, `FALSE | NA` is NA, and
# CT.POLICY.006.a is NA for every respondent in this extract. That expression
# could therefore only ever return 1 or NA, never 0, which left the flag with
# a single observed level and broke the later cross-tabulation.
# Counting answers row-wise with na.rm = TRUE avoids the NA propagation.
data <- data %>%
  mutate(
    # helper counts, dropped again below so `data` keeps its original columns
    n_care_yes = rowSums(
      select(., CT.POLICY.002.a:CT.POLICY.006.a) == 1,
      na.rm = TRUE
    ),
    n_care_answered = rowSums(
      !is.na(select(., CT.POLICY.002.a:CT.POLICY.006.a))
    ),
    using_non_parental_care = case_when(
      n_care_yes > 0 ~ 1,
      n_care_answered > 0 ~ 0,
      TRUE ~ NA_real_
    )
  ) %>%
  select(-n_care_yes, -n_care_answered)
# Respondent-level usage rate.
# The denominator is every row of `data`; rows whose flag is NA are dropped
# from the numerator only (na.rm = TRUE), so they count as sample members
# but never as users.
total_respondents <- data %>%
  nrow()
respondents_using_care <- data %>%
  pull(using_non_parental_care) %>%
  sum(na.rm = TRUE)
percentage_using_care <- respondents_using_care / total_respondents * 100
# Output the results
# Prints the three respondent-level figures computed just above, one per line.
# cat() separates its arguments with a space, so the percentage is printed
# with a space before the "%" sign.
# (Lines beginning with "##" are the output captured when this was rendered.)
cat("Sample size:", total_respondents, "\n")
## Sample size: 1803
cat("Respondents using non-parental care:", respondents_using_care, "\n")
## Respondents using non-parental care: 1128
cat("Percentage using non-parental care:", percentage_using_care, "%", "\n")
## Percentage using non-parental care: 62.5624 %
# Child-level usage rate.
# Denominator: total number of children under 6 across all respondents
# (sum of CT.DEMO.005, ignoring respondents with a missing count).
total_children <- data %>%
  pull(CT.DEMO.005) %>%
  sum(na.rm = TRUE)

# Per respondent, count how many of the five per-child care questions
# (CT.POLICY.002.a through CT.POLICY.006.a) were answered 1; unanswered
# questions contribute nothing to the count.
data <- data %>%
  mutate(
    children_receiving_care = rowSums(
      select(., CT.POLICY.002.a:CT.POLICY.006.a) == 1,
      na.rm = TRUE
    )
  )

# Numerator: children in non-parental care, summed over respondents
total_children_receiving_care <- data %>%
  pull(children_receiving_care) %>%
  sum(na.rm = TRUE)

# Share of children under 6 who are receiving non-parental care
percentage_children_care <- total_children_receiving_care / total_children * 100
# Output the results
# Prints the three child-level figures computed just above, one per line.
# cat() separates its arguments with a space, so the percentage is printed
# with a space before the "%" sign.
# (Lines beginning with "##" are the output captured when this was rendered.)
cat("Total children under 6:", total_children, "\n")
## Total children under 6: 2539
cat("Children receiving non-parental care:", total_children_receiving_care, "\n")
## Children receiving non-parental care: 1501
cat("Percentage of children receiving non-parental care:", percentage_children_care, "%", "\n")
## Percentage of children receiving non-parental care: 59.11776 %
# Usage rate within each income (SMI) band.
# Respondents with no SMI value are excluded. Within a band the denominator
# is every respondent in that band, and the numerator is the sum of the
# usage flag with missing flags ignored.
smi_care_summary <- data %>%
  drop_na(SMI) %>%
  group_by(SMI) %>%
  summarise(
    respondents_in_group = n(),
    respondents_using_care = sum(using_non_parental_care, na.rm = TRUE),
    percentage_using_care = respondents_using_care / respondents_in_group * 100
  )

# Show the per-band table for reference
print(smi_care_summary)
## # A tibble: 4 × 4
## SMI respondents_in_group respondents_using_care percentage_using_care
## <chr> <int> <dbl> <dbl>
## 1 Above 85% o… 577 408 70.7
## 2 Below 30% o… 367 174 47.4
## 3 Between 30-… 478 320 66.9
## 4 Between 60-… 269 183 68.0
# Bar chart of the share of respondents using non-parental care in each
# income (SMI) band, with the percentage printed above every bar
smi_care_summary %>%
  ggplot(aes(x = SMI, y = percentage_using_care, fill = SMI)) +
  geom_col() +
  geom_text(
    aes(label = sprintf("%.1f%%", percentage_using_care)),
    vjust = -0.5, # nudge the label above the top of the bar
    size = 4,
    colour = "black"
  ) +
  labs(
    title = "Percentage of Respondents Using Non-Parental Care by SMI",
    x = "State Median Income (SMI) Category",
    y = "Percentage Using Non-Parental Care"
  ) +
  theme_minimal() +
  # hide the x-axis labels and ticks; the fill colour is mapped to SMI
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Test whether the share of respondents using non-parental care differs
# across household income (SMI) groups. Both variables are categorical, so a
# chi-square test of independence on the income-by-usage table is appropriate.
#
# The table must have BOTH a "using" and a "not_using" column. Passing
# chisq.test() a table with a single column makes it run a goodness-of-fit
# test instead (are the users spread evenly over the income groups?), which
# mostly reflects group sizes and is not the question asked here.
# Respondents are classified the same way as in the per-group percentages:
# "using" when the flag equals 1, "not_using" otherwise (flag 0 or missing),
# so the test compares exactly the rates reported for each income group.
# NOTE(review): any statistic / p-value shown in previously rendered output
# came from the single-column table and must be regenerated.
used_care <- factor(
  !is.na(data$using_non_parental_care) & data$using_non_parental_care == 1,
  levels = c(FALSE, TRUE),
  labels = c("not_using", "using")
)

# A contingency table of SMI and care usage (rows with missing SMI are dropped)
contingency_table <- table(income_group = data$SMI, care_status = used_care)

# Fail loudly rather than test a degenerate table
stopifnot(
  nrow(contingency_table) >= 2L,
  ncol(contingency_table) == 2L,
  all(colSums(contingency_table) > 0)
)

# Perform the Chi-Square test of independence
chi_square_test <- chisq.test(contingency_table)
# Output the results
# Prints three components of the object returned by chisq.test():
# $statistic (the test statistic), $parameter (degrees of freedom) and
# $p.value.
# (Lines beginning with "##" are the output captured when this was rendered.)
cat("Chi-Square Test Statistic:", chi_square_test$statistic, "\n")
## Chi-Square Test Statistic: 141.282
cat("Degrees of Freedom:", chi_square_test$parameter, "\n")
## Degrees of Freedom: 3
cat("p-value:", chi_square_test$p.value, "\n")
## p-value: 1.999968e-30
# Plain-language verdict: compare the p-value with the 0.05 threshold and
# print the matching sentence
is_significant <- chi_square_test$p.value < 0.05
verdict <- if (is_significant) {
  "The variation among household income groups in the likelihood of using non-parental care is statistically significant.\n"
} else {
  "The variation among household income groups in the likelihood of using non-parental care is not statistically significant.\n"
}
cat(verdict)
## The variation among household income groups in the likelihood of using non-parental care is statistically significant.
The statistical test strongly suggests that the likelihood of using non-parental care differs across household income groups. In practical terms, this means that income level is clearly associated with whether families use non-parental care for their children: families in different income groups are not equally likely to use it. The test establishes an association only; on its own it does not show that income causes the difference.
This report outlines key findings from a survey analyzing the use of non-parental care for young children. The survey collected responses from families with children under the age of six. We examine how income levels and other factors influence the use of non-parental childcare.
1) Out of the 1,803 respondents surveyed, 1,128 respondents were currently using non-parental care. This means that 62.6% of respondents had their children in some form of non-parental care at the time of the survey. A significant proportion of families rely on non-parental care for their children, with over 60% of surveyed families reporting usage of these services.
2) The total number of children under six represented in the survey was 2,539. Among them, 1,501 children were receiving non-parental care. This corresponds to 59.1% of young children receiving non-parental care. Approximately 6 in 10 young children are cared for by someone other than a parent, showing a widespread dependence on non-parental care among families.
3) The likelihood of using non-parental care tends to increase with higher income levels. Families with incomes above 85% of the state median were the most likely to use non-parental care, while families with incomes below 30% of the state median were the least likely to do so. The difference in childcare usage by income group highlights potential economic barriers that lower-income families may face when accessing childcare services.
4) The p-value is very small, far below the common threshold of 0.05. This indicates that the differences in non-parental care usage across income groups are statistically significant. In other words, differences this large would be very unlikely if usage were unrelated to a family’s income; the size of the differences themselves is shown by the group percentages in finding 3. Household income is strongly associated with whether families use non-parental care. Higher-income families are more likely to use non-parental care services, and this difference is unlikely to be due to random chance, pointing to a real underlying relationship between income and childcare use, although the test alone does not establish that income is the cause.