library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
excel_sheets("DataSet.xlsx")
## [1] "Sheet4" "FY19"   "FY20"   "FY21"   "FY22"
excel_sheets("DataSet.xlsx") %>% map(~read_xlsx("DataSet.xlsx",.))
## [[1]]
## # A tibble: 77 × 2
##    `ZIP Code` Region               
##         <dbl> <chr>                
##  1      78006 San Antonio Northwest
##  2      78023 San Antonio Northwest
##  3      78238 San Antonio Northwest
##  4      78240 San Antonio Northwest
##  5      78249 San Antonio Northwest
##  6      78250 San Antonio Northwest
##  7      78254 San Antonio Northwest
##  8      78255 San Antonio Northwest
##  9      78256 San Antonio Northwest
## 10      78216 San Antonio North    
## # ℹ 67 more rows
## 
## [[2]]
## # A tibble: 38 × 12
##    enrollee renewal enrolled            diagnosis DOB                   age
##    <chr>    <chr>   <dttm>              <chr>     <dttm>              <dbl>
##  1 Alspach  NO      2019-07-26 00:00:00 ASD       2015-02-07 00:00:00    10
##  2 Arriaga  NO      2019-07-31 00:00:00 ID        2013-05-15 00:00:00    11
##  3 Bourland NO      2019-05-01 00:00:00 ID        2007-12-17 00:00:00    17
##  4 Brown    NO      2019-04-15 00:00:00 ASD       2002-08-16 00:00:00    22
##  5 Cash     NO      2019-04-29 00:00:00 ASD       2014-02-05 00:00:00    11
##  6 Crostley NO      2019-08-28 00:00:00 ASD       2006-12-09 00:00:00    18
##  7 Deya     NO      2019-05-14 00:00:00 ASD       2003-09-15 00:00:00    21
##  8 Dray     NO      2019-05-15 00:00:00 ASD       2011-07-20 00:00:00    13
##  9 Eichman  NO      2019-04-26 00:00:00 ASD       2012-02-15 00:00:00    13
## 10 Ell      NO      2019-08-06 00:00:00 ASD       2001-06-11 00:00:00    23
## # ℹ 28 more rows
## # ℹ 6 more variables: gender <chr>, ethnicity <chr>, ISD <chr>, income <dbl>,
## #   zipcode <dbl>, Region <chr>
## 
## [[3]]
## # A tibble: 122 × 12
##    enrollee   renewal enrolled            diagnosis    DOB                   age
##    <chr>      <chr>   <dttm>              <chr>        <dttm>              <dbl>
##  1 Ackerman   NO      2020-04-24 00:00:00 ASD          2003-06-03 00:00:00    21
##  2 Ackerman   NO      2020-04-24 00:00:00 ASD          2006-12-03 00:00:00    18
##  3 Alaniz     NO      2018-12-14 00:00:00 ASD          2012-10-23 00:00:00    12
##  4 Alfaro     NO      2019-11-08 00:00:00 ASD          2001-03-31 00:00:00    23
##  5 Alspach    NO      2019-07-26 00:00:00 ASD          2015-02-07 00:00:00    10
##  6 Altamirano NO      2020-02-14 00:00:00 ASD          2006-08-17 00:00:00    18
##  7 Alvarez    NO      2019-04-30 00:00:00 CP           2012-06-18 00:00:00    12
##  8 Arriaga    NO      2019-07-31 00:00:00 ID           2013-05-15 00:00:00    11
##  9 Autry      NO      2020-02-24 00:00:00 ASD          2015-06-01 00:00:00     9
## 10 Barker     NO      2019-10-02 00:00:00 Transverse … 2004-03-15 00:00:00    21
## # ℹ 112 more rows
## # ℹ 6 more variables: income <dbl>, gender <chr>, ethnicity <chr>, ISD <chr>,
## #   zipcode <dbl>, Region <chr>
## 
## [[4]]
## # A tibble: 114 × 13
##    enrollee   enrolled renewal diagnosis DOB                   age income gender
##    <chr>      <chr>    <chr>   <chr>     <dttm>              <dbl>  <dbl> <chr> 
##  1 Adamski    44384    NO      ASD       2013-07-22 00:00:00    11 120000 Female
##  2 Adamski    44384    NO      ASD       2013-07-22 00:00:00    11 120000 Male  
##  3 Altamirano 43875    NO      ASD       2006-08-17 00:00:00    18  47000 Male  
##  4 Arriaga    43677    NO      ID        2013-05-15 00:00:00    11  56000 male  
##  5 Autry      43885    NO      ASD       2015-06-01 00:00:00     9  80000 Male  
##  6 Avant      44272    NO      ASD       2015-04-26 00:00:00     9 100000 Female
##  7 Baldwin    44235    NO      OHI       2004-03-30 00:00:00    20 180000 male  
##  8 Bandaru    44223    NO      ASD       2014-07-31 00:00:00    10 155000 Male  
##  9 Barcenez   44127    NO      ASD       2001-08-17 00:00:00    23 102000 Male  
## 10 Beede      43882    NO      Apraxia   2016-05-11 00:00:00     8 130000 Female
## # ℹ 104 more rows
## # ℹ 5 more variables: race <chr>, ethnicity <chr>, ISD <chr>, zipcode <dbl>,
## #   Region <chr>
## 
## [[5]]
## # A tibble: 113 × 13
##    enrollee  enrolled renewal diagnosis DOB                   age income gender
##    <chr>     <chr>    <chr>   <chr>     <dttm>              <dbl>  <dbl> <chr> 
##  1 Adamski   44384    NO      ASD       2013-07-22 00:00:00    11 120000 Female
##  2 Adamski   44384    NO      ASD       2013-07-22 00:00:00    11 120000 Male  
##  3 Alexandr  44704    NO      ASD       2013-08-08 00:00:00    11  24000 Male  
##  4 Alexandr  44704    NO      ASD       2011-11-22 00:00:00    13  24000 Male  
##  5 Almanza   44461    NO      ASD       2016-08-30 00:00:00     8  95000 Male  
##  6 Alvarado  44735    NO      ASD       2011-04-16 00:00:00    13  75000 Male  
##  7 Alvarez   44540    NO      ASD       2007-03-06 00:00:00    18  88000 Male  
##  8 Anderson  44788    NO      GDD       2017-05-25 00:00:00     7  85000 Male  
##  9 Aponte    44533    NO      ASD       2012-12-22 00:00:00    12      0 Male  
## 10 Arredondo 44620    NO      ASD       2018-05-24 00:00:00     6  51000 Male  
## # ℹ 103 more rows
## # ℹ 5 more variables: race <chr>, ethnicity <chr>, ISD <chr>, zipcode <dbl>,
## #   Region <chr>
FY19 <- read_excel("DataSet.xlsx", sheet = "FY19")
FY20 <- read_excel("DataSet.xlsx", sheet = "FY20")
FY21 <- read_excel("DataSet.xlsx", sheet = "FY21")
FY22 <- read_excel("DataSet.xlsx", sheet = "FY22")
Sheet4 <- read_excel("DataSet.xlsx", sheet = "Sheet4")
head(FY19)
## # A tibble: 6 × 12
##   enrollee renewal enrolled            diagnosis DOB                   age
##   <chr>    <chr>   <dttm>              <chr>     <dttm>              <dbl>
## 1 Alspach  NO      2019-07-26 00:00:00 ASD       2015-02-07 00:00:00    10
## 2 Arriaga  NO      2019-07-31 00:00:00 ID        2013-05-15 00:00:00    11
## 3 Bourland NO      2019-05-01 00:00:00 ID        2007-12-17 00:00:00    17
## 4 Brown    NO      2019-04-15 00:00:00 ASD       2002-08-16 00:00:00    22
## 5 Cash     NO      2019-04-29 00:00:00 ASD       2014-02-05 00:00:00    11
## 6 Crostley NO      2019-08-28 00:00:00 ASD       2006-12-09 00:00:00    18
## # ℹ 6 more variables: gender <chr>, ethnicity <chr>, ISD <chr>, income <dbl>,
## #   zipcode <dbl>, Region <chr>
library(lubridate)
library(dplyr)
race_eth_22 <- FY22 %>% mutate(hispanic=ifelse(grepl("Hispanic",ethnicity),1,0))

race_eth_21 <- FY21 %>% mutate(hispanic=ifelse(grepl("Hispanic",ethnicity),1,0))
enroll_22 <- race_eth_22 %>% select(age, income, hispanic) %>% arrange(-income, age, hispanic)
cor(enroll_22$income, enroll_22$hispanic)
## [1] -0.08105679
pairs(~age+income+hispanic,data=enroll_22)

cor.test(enroll_22$hispanic,enroll_22$income,method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  enroll_22$hispanic and enroll_22$income
## t = -0.85681, df = 111, p-value = 0.3934
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2618655  0.1052490
## sample estimates:
##         cor 
## -0.08105679
hist(enroll_22$age,probability = T)
lines(density(enroll_22$age),col="red",lwd=3)

hist(enroll_22$income,probability = T)
lines(density(enroll_22$income),col="red",lwd=3)

cor.test(enroll_22$age,enroll_22$income,method="kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  enroll_22$age and enroll_22$income
## z = 2.1872, p-value = 0.02873
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.1449036

Although weak, there is a positive correlation between age and income, the older the individuals the higher the income.

income <-enroll_22 %>% select(income) %>% mutate(income_sqrt=sqrt(income))

head(income)
## # A tibble: 6 × 2
##   income income_sqrt
##    <dbl>       <dbl>
## 1 500000        707.
## 2 300000        548.
## 3 280000        529.
## 4 265000        515.
## 5 240000        490.
## 6 200000        447.
hist(income$income_sqrt, probability = T)
lines(density(income$income_sqrt),col='red',lwd=2)