library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readr)
# Load the ECI county table with base R's CSV reader.
# NOTE(review): several count/percent columns come back as character because
# the file contains "*" and "" placeholders and "%" suffixes -- confirm whether
# they should be parsed as numbers before any analysis uses them.
eci_data <- read.csv(file = "eci.csv")
library(readxl)
# Load the post-tax income workbook. The sheet has repeated `NO` and `ME`
# headers, which readxl renames to `NO...2`, `ME...4`, `NO...5`, `ME...7`.
Post_Tax_Income <- read_excel(path = "Post-Tax Income.xlsx")
## New names:
## • `NO` -> `NO...2`
## • `ME` -> `ME...4`
## • `NO` -> `NO...5`
## • `ME` -> `ME...7`
# The variable "post_tax_income" (loaded above as the `Post_Tax_Income` data
# frame) represents the post-tax household income of families with children
# potentially receiving ECI services in Texas. It measures the financial
# capacity of the household after taxes, which could influence the ability to
# access and afford early childhood services.
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following object is masked from 'package:tidyr':
##
## extract
# Print a compact overview of the column types in the ECI table.
str(object = eci_data)
## 'data.frame': 274 obs. of 7 variables:
## $ County: chr "Anderson" "Andrews" "Angelina" "Aransas" ...
## $ B3P : int 2299 1703 5166 1305 400 74 3236 1500 513 824 ...
## $ CSCS : chr "29" "78" "299" "30" ...
## $ CSFA : chr "" "*" "" "" ...
## $ TS : chr "29" "*" "299" "30" ...
## $ PPSC : chr "1%" "5%" "6%" "2%" ...
## $ TPPS : chr "1%" "*" "6%" "2%" ...
# Descriptive statistics (pastecs) for the `NO...2` column.
# NOTE(review): `NO...2` looks like a household-count column rather than an
# income column (its maximum is the "All households" row) -- confirm whether
# TWT1 was the intended variable to summarise here.
stat.desc(Post_Tax_Income[["NO...2"]])
## nbr.val nbr.null nbr.na min max range
## 3.700000e+01 0.000000e+00 7.000000e+00 6.061000e+03 1.312000e+05 1.251390e+05
## sum median mean SE.mean CI.mean.0.95 var
## 1.629129e+06 2.805000e+04 4.403051e+04 6.045896e+03 1.226164e+04 1.352456e+09
## std.dev coef.var
## 3.677575e+04 8.352332e-01
# List the column names of the ECI table.
names(eci_data)
## [1] "County" "B3P" "CSCS" "CSFA" "TS" "PPSC" "TPPS"
# Keep only the rows where TWT1 is present.
post_tax2 <- filter(Post_Tax_Income, !is.na(TWT1))

# Base-graphics histogram of the remaining TWT1 values.
hist(
  post_tax2$TWT1,
  main = "Histogram of Post-Tax Income",
  xlab = "Post-Tax Income",
  col = "blue",
  border = "black"
)

# Apply a square root transformation to TWT2 and store it as `sqrt_income`.
# Rows are kept only when both TWT1 and TWT2 are present: TWT1 so the row set
# stays in line with the TWT1 histogram, and TWT2 because it is the column
# actually being transformed (filtering on TWT1 alone would let a missing TWT2
# through as an NA in `sqrt_income`).
post_tax2 <- Post_Tax_Income %>%
  filter(!is.na(TWT1), !is.na(TWT2)) %>%
  mutate(sqrt_income = sqrt(TWT2))

# Preview the first rows, including the new column.
head(post_tax2)
## # A tibble: 6 × 8
## HOUSEHOLDS NO...2 TWT1 ME...4 NO...5 TWT2 ME...7 sqrt_income
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 All households 131200 70460 628 131400 64240 602 253.
## 2 Family households 84270 90760 752 84330 82210 793 287.
## 3 ...Married-couple 61440 104000 956 62180 94110 911 307.
## 4 ...Female householder, n… 15620 57890 943 15030 51200 915 226.
## 5 ...Male householder, no … 7212 73630 1789 7128 64490 2171 254.
## 6 Nonfamily households 46940 41480 656 47100 39630 697 199.
# Histogram of the square-root-transformed TWT2 values.
# A fixed number of bins is used instead of unit-wide bins: on the square-root
# scale the values sit in the hundreds (e.g. 199 to 307 in the first rows) and
# there are only a few dozen observations, so `binwidth = 1` would leave almost
# every bar at a count of 0 or 1 and hide the shape of the distribution.
ggplot(post_tax2, aes(x = sqrt_income)) +
  geom_histogram(bins = 10, fill = "blue", color = "black", alpha = 0.7) +
  labs(
    title = "Histogram of Square Root Transformed TWT2",
    x = "Square Root of TWT2",
    y = "Frequency"
  ) +
  theme_minimal()
