library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# The following data present the number of children served by the Early Childhood Intervention (ECI) program, statewide and by county, in a given fiscal year. They also include the number of children served as a percentage of the birth-to-three population in Texas.

# Load the ECI extract from "eci.csv" (path is relative to the working
# directory). Column types are left to readr's guessing; in the recorded
# run only B3P came back numeric and the other six columns were character.
ecidata <- readr::read_csv(file = "eci.csv")
## Rows: 274 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): County, CSCS, CSFA, TS, PPSC, TPPS
## dbl (1): B3P
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Data Glossary: B3P: Birth-to-3 Population; CSCS: Children Served Comprehensive Services; CSFA: Children Served Follow Along; TS: Total Served; PPSC: Percent of Population Served, Comprehensive Services; TPPS: Total Percent of Population Served

# Print a per-column summary of the imported table: length/class for
# character columns, quantiles and NA count for numeric ones.
summary(object = ecidata)
##     County               B3P               CSCS               CSFA          
##  Length:274         Min.   :     2.0   Length:274         Length:274        
##  Class :character   1st Qu.:   383.0   Class :character   Class :character  
##  Mode  :character   Median :   993.5   Mode  :character   Mode  :character  
##                     Mean   :  6920.7                                        
##                     3rd Qu.:  2587.5                                        
##                     Max.   :316834.0                                        
##                     NA's   :20                                              
##       TS                PPSC               TPPS          
##  Length:274         Length:274         Length:274        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
# Histogram of the birth-to-3 population (B3P) over all rows of the table.
hist(x = ecidata[["B3P"]], main = "Histogram of B3P", xlab = "B3P")

# PPSC and TPPS are read in as text. Strip every "%" and "*" character
# from both columns, then coerce what is left to numeric. One shared
# cleaning function keeps the two columns on exactly the same rule.
ecidata[c("PPSC", "TPPS")] <- lapply(
  ecidata[c("PPSC", "TPPS")],
  function(raw) as.numeric(gsub("%|\\*", "", raw))
)


# Scatter plot of PPSC (y axis) against B3P (x axis). Relies on PPSC
# having been converted from text to numeric earlier in the script.
plot(
  x = ecidata[["B3P"]],
  y = ecidata[["PPSC"]],
  main = "B3P vs PPSC",
  xlab = "B3P",
  ylab = "PPSC"
)

# Correlation between B3P and PPSC using cor()'s default method (Pearson),
# restricted to rows where both values are present ("complete.obs").
# NOTE(review): the file description says the data include a statewide
# figure as well as counties -- confirm whether that row should be
# excluded before correlating.
correlation <- cor(
  x = ecidata[["B3P"]],
  y = ecidata[["PPSC"]],
  use = "complete.obs"
)
print(correlation)
## [1] -0.1197247