library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
excel_sheets("DataSet.xlsx")
## [1] "Sheet4" "FY19" "FY20" "FY21" "FY22"
excel_sheets("DataSet.xlsx") %>% map(~read_xlsx("DataSet.xlsx",.))
## [[1]]
## # A tibble: 77 × 2
## `ZIP Code` Region
## <dbl> <chr>
## 1 78006 San Antonio Northwest
## 2 78023 San Antonio Northwest
## 3 78238 San Antonio Northwest
## 4 78240 San Antonio Northwest
## 5 78249 San Antonio Northwest
## 6 78250 San Antonio Northwest
## 7 78254 San Antonio Northwest
## 8 78255 San Antonio Northwest
## 9 78256 San Antonio Northwest
## 10 78216 San Antonio North
## # ℹ 67 more rows
##
## [[2]]
## # A tibble: 38 × 12
## enrollee renewal enrolled diagnosis DOB age
## <chr> <chr> <dttm> <chr> <dttm> <dbl>
## 1 Alspach NO 2019-07-26 00:00:00 ASD 2015-02-07 00:00:00 10
## 2 Arriaga NO 2019-07-31 00:00:00 ID 2013-05-15 00:00:00 11
## 3 Bourland NO 2019-05-01 00:00:00 ID 2007-12-17 00:00:00 17
## 4 Brown NO 2019-04-15 00:00:00 ASD 2002-08-16 00:00:00 22
## 5 Cash NO 2019-04-29 00:00:00 ASD 2014-02-05 00:00:00 11
## 6 Crostley NO 2019-08-28 00:00:00 ASD 2006-12-09 00:00:00 18
## 7 Deya NO 2019-05-14 00:00:00 ASD 2003-09-15 00:00:00 21
## 8 Dray NO 2019-05-15 00:00:00 ASD 2011-07-20 00:00:00 13
## 9 Eichman NO 2019-04-26 00:00:00 ASD 2012-02-15 00:00:00 13
## 10 Ell NO 2019-08-06 00:00:00 ASD 2001-06-11 00:00:00 23
## # ℹ 28 more rows
## # ℹ 6 more variables: gender <chr>, ethnicity <chr>, ISD <chr>, income <dbl>,
## # zipcode <dbl>, Region <chr>
##
## [[3]]
## # A tibble: 122 × 12
## enrollee renewal enrolled diagnosis DOB age
## <chr> <chr> <dttm> <chr> <dttm> <dbl>
## 1 Ackerman NO 2020-04-24 00:00:00 ASD 2003-06-03 00:00:00 21
## 2 Ackerman NO 2020-04-24 00:00:00 ASD 2006-12-03 00:00:00 18
## 3 Alaniz NO 2018-12-14 00:00:00 ASD 2012-10-23 00:00:00 12
## 4 Alfaro NO 2019-11-08 00:00:00 ASD 2001-03-31 00:00:00 23
## 5 Alspach NO 2019-07-26 00:00:00 ASD 2015-02-07 00:00:00 10
## 6 Altamirano NO 2020-02-14 00:00:00 ASD 2006-08-17 00:00:00 18
## 7 Alvarez NO 2019-04-30 00:00:00 CP 2012-06-18 00:00:00 12
## 8 Arriaga NO 2019-07-31 00:00:00 ID 2013-05-15 00:00:00 11
## 9 Autry NO 2020-02-24 00:00:00 ASD 2015-06-01 00:00:00 9
## 10 Barker NO 2019-10-02 00:00:00 Transverse … 2004-03-15 00:00:00 21
## # ℹ 112 more rows
## # ℹ 6 more variables: income <dbl>, gender <chr>, ethnicity <chr>, ISD <chr>,
## # zipcode <dbl>, Region <chr>
##
## [[4]]
## # A tibble: 114 × 13
## enrollee enrolled renewal diagnosis DOB age income gender
## <chr> <chr> <chr> <chr> <dttm> <dbl> <dbl> <chr>
## 1 Adamski 44384 NO ASD 2013-07-22 00:00:00 11 120000 Female
## 2 Adamski 44384 NO ASD 2013-07-22 00:00:00 11 120000 Male
## 3 Altamirano 43875 NO ASD 2006-08-17 00:00:00 18 47000 Male
## 4 Arriaga 43677 NO ID 2013-05-15 00:00:00 11 56000 male
## 5 Autry 43885 NO ASD 2015-06-01 00:00:00 9 80000 Male
## 6 Avant 44272 NO ASD 2015-04-26 00:00:00 9 100000 Female
## 7 Baldwin 44235 NO OHI 2004-03-30 00:00:00 20 180000 male
## 8 Bandaru 44223 NO ASD 2014-07-31 00:00:00 10 155000 Male
## 9 Barcenez 44127 NO ASD 2001-08-17 00:00:00 23 102000 Male
## 10 Beede 43882 NO Apraxia 2016-05-11 00:00:00 8 130000 Female
## # ℹ 104 more rows
## # ℹ 5 more variables: race <chr>, ethnicity <chr>, ISD <chr>, zipcode <dbl>,
## # Region <chr>
##
## [[5]]
## # A tibble: 113 × 13
## enrollee enrolled renewal diagnosis DOB age income gender
## <chr> <chr> <chr> <chr> <dttm> <dbl> <dbl> <chr>
## 1 Adamski 44384 NO ASD 2013-07-22 00:00:00 11 120000 Female
## 2 Adamski 44384 NO ASD 2013-07-22 00:00:00 11 120000 Male
## 3 Alexandr 44704 NO ASD 2013-08-08 00:00:00 11 24000 Male
## 4 Alexandr 44704 NO ASD 2011-11-22 00:00:00 13 24000 Male
## 5 Almanza 44461 NO ASD 2016-08-30 00:00:00 8 95000 Male
## 6 Alvarado 44735 NO ASD 2011-04-16 00:00:00 13 75000 Male
## 7 Alvarez 44540 NO ASD 2007-03-06 00:00:00 18 88000 Male
## 8 Anderson 44788 NO GDD 2017-05-25 00:00:00 7 85000 Male
## 9 Aponte 44533 NO ASD 2012-12-22 00:00:00 12 0 Male
## 10 Arredondo 44620 NO ASD 2018-05-24 00:00:00 6 51000 Male
## # ℹ 103 more rows
## # ℹ 5 more variables: race <chr>, ethnicity <chr>, ISD <chr>, zipcode <dbl>,
## # Region <chr>
FY19 <- read_excel("DataSet.xlsx", sheet = "FY19")
FY20 <- read_excel("DataSet.xlsx", sheet = "FY20")
FY21 <- read_excel("DataSet.xlsx", sheet = "FY21")
FY22 <- read_excel("DataSet.xlsx", sheet = "FY22")
Sheet4 <- read_excel("DataSet.xlsx", sheet = "Sheet4")
head(FY19)
## # A tibble: 6 × 12
## enrollee renewal enrolled diagnosis DOB age
## <chr> <chr> <dttm> <chr> <dttm> <dbl>
## 1 Alspach NO 2019-07-26 00:00:00 ASD 2015-02-07 00:00:00 10
## 2 Arriaga NO 2019-07-31 00:00:00 ID 2013-05-15 00:00:00 11
## 3 Bourland NO 2019-05-01 00:00:00 ID 2007-12-17 00:00:00 17
## 4 Brown NO 2019-04-15 00:00:00 ASD 2002-08-16 00:00:00 22
## 5 Cash NO 2019-04-29 00:00:00 ASD 2014-02-05 00:00:00 11
## 6 Crostley NO 2019-08-28 00:00:00 ASD 2006-12-09 00:00:00 18
## # ℹ 6 more variables: gender <chr>, ethnicity <chr>, ISD <chr>, income <dbl>,
## # zipcode <dbl>, Region <chr>
library(lubridate)
library(dplyr)
race_eth_22 <- FY22 %>% mutate(hispanic=ifelse(grepl("Hispanic",ethnicity),1,0))
race_eth_21 <- FY21 %>% mutate(hispanic=ifelse(grepl("Hispanic",ethnicity),1,0))
enroll_22 <- race_eth_22 %>% select(age, income, hispanic) %>% arrange(-income, age, hispanic)
cor(enroll_22$income, enroll_22$hispanic)
## [1] -0.08105679
pairs(~age+income+hispanic,data=enroll_22)
cor.test(enroll_22$hispanic,enroll_22$income,method = "pearson")
##
## Pearson's product-moment correlation
##
## data: enroll_22$hispanic and enroll_22$income
## t = -0.85681, df = 111, p-value = 0.3934
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2618655 0.1052490
## sample estimates:
## cor
## -0.08105679
hist(enroll_22$age,probability = T)
lines(density(enroll_22$age),col="red",lwd=3)
hist(enroll_22$income,probability = T)
lines(density(enroll_22$income),col="red",lwd=3)
cor.test(enroll_22$age,enroll_22$income,method="kendall")
##
## Kendall's rank correlation tau
##
## data: enroll_22$age and enroll_22$income
## z = 2.1872, p-value = 0.02873
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.1449036
Although weak, there is a positive correlation between age and income, the older the individuals the higher the income.
income <-enroll_22 %>% select(income) %>% mutate(income_sqrt=sqrt(income))
head(income)
## # A tibble: 6 × 2
## income income_sqrt
## <dbl> <dbl>
## 1 500000 707.
## 2 300000 548.
## 3 280000 529.
## 4 265000 515.
## 5 240000 490.
## 6 200000 447.
hist(income$income_sqrt, probability = T)
lines(density(income$income_sqrt),col='red',lwd=2)