library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readr)
# Load the ECI table from the working directory with base R's CSV reader,
# which yields a plain data.frame.
eci_data <- utils::read.csv(file = "eci.csv")
library(readxl)
# Read the post-tax income workbook. The sheet repeats the headers `NO` and
# `ME`, so readxl de-duplicates them by appending the column position.
Post_Tax_Income <- readxl::read_excel(path = "Post-Tax Income.xlsx")
## New names:
## • `NO` -> `NO...2`
## • `ME` -> `ME...4`
## • `NO` -> `NO...5`
## • `ME` -> `ME...7`
# The variable "post_tax_income" represents the post-tax household income of
# families with children potentially receiving ECI services in Texas. It
# measures the financial capacity of the household after taxes, which could
# influence the ability to access and afford early childhood services.
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following object is masked from 'package:tidyr':
## 
##     extract
# Inspect column types of the ECI table. Several count/percent columns come in
# as character because they hold "*", "" or "%"-suffixed values.
utils::str(object = eci_data)
## 'data.frame':    274 obs. of  7 variables:
##  $ County: chr  "Anderson" "Andrews" "Angelina" "Aransas" ...
##  $ B3P   : int  2299 1703 5166 1305 400 74 3236 1500 513 824 ...
##  $ CSCS  : chr  "29" "78" "299" "30" ...
##  $ CSFA  : chr  "" "*" "" "" ...
##  $ TS    : chr  "29" "*" "299" "30" ...
##  $ PPSC  : chr  "1%" "5%" "6%" "2%" ...
##  $ TPPS  : chr  "1%" "*" "6%" "2%" ...
# Descriptive statistics (pastecs) for the first `NO` column of the workbook,
# which readxl renamed to `NO...2`.
# NOTE(review): the surrounding notes discuss post-tax income, yet this call
# summarises `NO...2` rather than `TWT1`/`TWT2` -- confirm the intended column.
pastecs::stat.desc(Post_Tax_Income[["NO...2"]])
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 3.700000e+01 0.000000e+00 7.000000e+00 6.061000e+03 1.312000e+05 1.251390e+05 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 1.629129e+06 2.805000e+04 4.403051e+04 6.045896e+03 1.226164e+04 1.352456e+09 
##      std.dev     coef.var 
## 3.677575e+04 8.352332e-01
# List the column names of the ECI data frame
names(eci_data)
## [1] "County" "B3P"    "CSCS"   "CSFA"   "TS"     "PPSC"   "TPPS"
# Keep only the rows whose TWT1 value (used here as the post-tax income
# measure) is present
post_tax2 <- dplyr::filter(Post_Tax_Income, !is.na(TWT1))
# Draw a base-graphics histogram of the retained TWT1 values
hist(
  post_tax2$TWT1,
  main = "Histogram of Post-Tax Income",
  xlab = "Post-Tax Income",
  col = "blue",
  border = "black"
)

# Apply a square-root transformation to TWT2.
# Rows are screened on TWT2 itself (the column being transformed) rather than
# on TWT1: filtering on a different column could let a missing TWT2 through as
# an NA in sqrt_income, or discard a row whose TWT2 is perfectly valid.
post_tax2 <- Post_Tax_Income %>%
  filter(!is.na(TWT2)) %>%
  mutate(sqrt_income = sqrt(TWT2))
# Preview the first rows to confirm the new sqrt_income column was added
head(post_tax2)
## # A tibble: 6 × 8
##   HOUSEHOLDS                NO...2   TWT1 ME...4 NO...5  TWT2 ME...7 sqrt_income
##   <chr>                      <dbl>  <dbl>  <dbl>  <dbl> <dbl>  <dbl>       <dbl>
## 1 All households            131200  70460    628 131400 64240    602        253.
## 2 Family households          84270  90760    752  84330 82210    793        287.
## 3 ...Married-couple          61440 104000    956  62180 94110    911        307.
## 4 ...Female householder, n…  15620  57890    943  15030 51200    915        226.
## 5 ...Male householder, no …   7212  73630   1789   7128 64490   2171        254.
## 6 Nonfamily households       46940  41480    656  47100 39630    697        199.
# Histogram of the square-root-transformed TWT2 values held in sqrt_income.
# NOTE(review): binwidth = 1 is very narrow relative to the sqrt_income values
# shown in the preview (roughly 200-300) -- confirm this is the intended width.
ggplot(data = post_tax2, mapping = aes(x = sqrt_income)) +
  geom_histogram(
    binwidth = 1,
    fill = "blue",
    color = "black",
    alpha = 0.7
  ) +
  ggtitle("Histogram of Square Root Transformed TWT2") +
  xlab("Square Root of TWT2") +
  ylab("Frequency") +
  theme_minimal()