library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.1
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(readr)
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Folder that holds the course data files. The full path is built with
# file.path() so the CSV can be read without calling setwd(), which would
# change the working directory for the rest of the session.
data_dir <- "/Users/tiffanyking/Desktop/Data 110"
kindergarten <- read_csv(file.path(data_dir, "kindergarten_CA.csv"))
## Parsed with column specification:
## cols(
## district = col_character(),
## sch_code = col_double(),
## county = col_character(),
## pub_priv = col_character(),
## school = col_character(),
## enrollment = col_double(),
## complete = col_double(),
## start_year = col_double()
## )
# Inspect the imported table: dimensions, column names, column types and a
# preview of the first values in each column.
str(kindergarten)
## tibble [110,382 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ district : chr [1:110382] "Alameda Unified" "Alameda Unified" "Alameda Unified" "Alameda Unified" ...
## $ sch_code : num [1:110382] 6967434 6110779 6100374 6090013 6090039 ...
## $ county : chr [1:110382] "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ pub_priv : chr [1:110382] "Private" "Public" "Public" "Public" ...
## $ school : chr [1:110382] "ALAMEDA CHRTN" "BAY FARM ELEM" "EARHART (AMELIA) ELEM" "EDISON ELEM" ...
## $ enrollment: num [1:110382] 12 78 77 56 41 75 40 80 61 49 ...
## $ complete : num [1:110382] 11 77 73 53 41 65 34 76 61 43 ...
## $ start_year: num [1:110382] 2001 2001 2001 2001 2001 ...
## - attr(*, "spec")=
## .. cols(
## .. district = col_character(),
## .. sch_code = col_double(),
## .. county = col_character(),
## .. pub_priv = col_character(),
## .. school = col_character(),
## .. enrollment = col_double(),
## .. complete = col_double(),
## .. start_year = col_double()
## .. )
# Working copy of the data that keeps only rows with no missing value in any
# column.
School <- kindergarten %>%
  na.omit()
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(RColorBrewer)
library(plotly)
# Yearly statewide sums of `enrollment` and `complete` for start years
# 2005 through 2015 (one row per start year).
Students <- School %>%
  filter(between(start_year, 2005, 2015)) %>%
  group_by(start_year) %>%
  summarize(
    total = sum(enrollment),
    immune_total = sum(complete)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
Students
## # A tibble: 11 x 3
## start_year total immune_total
## <dbl> <dbl> <dbl>
## 1 2005 507224 471216
## 2 2006 497817 461879
## 3 2007 493626 455245
## 4 2008 496085 455199
## 5 2009 491653 448480
## 6 2010 505088 458409
## 7 2011 524336 477356
## 8 2012 525536 474871
## 9 2013 530530 478701
## 10 2014 531940 481256
## 11 2015 547520 508693
# Column/line chart of the yearly totals in `Students`.
# Both series are student counts, so they share a single y axis and their
# heights can be compared directly. No second (right-hand) axis is declared:
# with both series attached to axis 0 it would render as an empty, titled
# axis.
cols <- c("Red", "black")
highchart() %>%
  hc_yAxis(title = list(text = "Number of Students")) %>%
  hc_add_series(
    data = Students$total,
    name = "Enrollment Rate",
    type = "column"
  ) %>%
  hc_add_series(
    data = Students$immune_total,
    name = "Complete Immunization Rate",
    type = "line"
  ) %>%
  # Years are used as category labels; only every second one is shown.
  hc_xAxis(
    categories = Students$start_year,
    tickInterval = 2
  ) %>%
  hc_colors(cols) %>%
  hc_chart(style = list(
    fontFamily = "Georgia",
    fontWeight = "bold"
  ))
# Yearly enrollment totals (start years 2008-2015) for six selected counties.
# The result has columns county, start_year and total, and stays grouped by
# county: `.groups = "drop_last"` states that grouping explicitly instead of
# relying on summarize()'s default (which emits a message). No trailing
# select() is used, because a grouping column cannot be dropped by select()
# and attempting it only produces an "Adding missing grouping variables"
# message.
Gradeschool <- School %>%
  filter(
    start_year >= 2008 & start_year <= 2015 &
      county %in% c(
        "Los Angeles", "San Diego", "Orange",
        "Riverside", "San Bernardino", "Sacramento"
      )
  ) %>%
  group_by(county, start_year) %>%
  summarize(total = sum(enrollment), .groups = "drop_last")
Gradeschool
## # A tibble: 48 x 3
## # Groups: county [6]
## county start_year total
## <chr> <dbl> <dbl>
## 1 Los Angeles 2008 123892
## 2 Los Angeles 2009 118829
## 3 Los Angeles 2010 122736
## 4 Los Angeles 2011 129954
## 5 Los Angeles 2012 131102
## 6 Los Angeles 2013 130054
## 7 Los Angeles 2014 129494
## 8 Los Angeles 2015 133398
## 9 Orange 2008 41019
## 10 Orange 2009 40541
## # … with 38 more rows
# Line chart with one series per county in `Gradeschool`.
# The palette is sized to the number of counties being plotted so that every
# series gets its own colour; a palette shorter than the number of series is
# cycled, which makes several counties share a colour. "Set1" provides
# between 3 and 9 colours, which covers the counties selected here.
cols <- brewer.pal(length(unique(Gradeschool$county)), "Set1")
highchart() %>%
  hc_add_series(
    data = Gradeschool,
    type = "line",
    hcaes(
      x = start_year,
      y = total,
      group = county
    )
  ) %>%
  hc_colors(cols) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "Student Enrollment Rate"))
# Per-column summary of the cleaned data: length/class for character columns
# and min, quartiles, mean and max for numeric columns.
summary(School)
## district sch_code county pub_priv
## Length:108730 Min. : 1501 Length:108730 Length:108730
## Class :character 1st Qu.:6019863 Class :character Class :character
## Mode :character Median :6048201 Mode :character Mode :character
## Mean :5884901
## 3rd Qu.:6120430
## Max. :9999999
## school enrollment complete start_year
## Length:108730 Min. : 10.00 Min. : 0.00 Min. :2001
## Class :character 1st Qu.: 34.00 1st Qu.: 29.00 1st Qu.:2004
## Mode :character Median : 68.00 Median : 61.00 Median :2008
## Mean : 70.77 Mean : 64.89 Mean :2008
## 3rd Qu.: 98.00 3rd Qu.: 91.00 3rd Qu.:2012
## Max. :981.00 Max. :973.00 Max. :2015
# Yearly enrollment totals (start years 2008-2015) for four selected counties.
# The result has columns county, start_year and total, and stays grouped by
# county: `.groups = "drop_last"` states that grouping explicitly instead of
# relying on summarize()'s default (which emits a message). No trailing
# select() is used, because a grouping column cannot be dropped by select()
# and attempting it only produces an "Adding missing grouping variables"
# message.
Education <- School %>%
  filter(
    start_year >= 2008 & start_year <= 2015 &
      county %in% c("San Diego", "Orange", "Riverside", "San Bernardino")
  ) %>%
  group_by(county, start_year) %>%
  summarize(total = sum(enrollment), .groups = "drop_last")
Education
## # A tibble: 32 x 3
## # Groups: county [4]
## county start_year total
## <chr> <dbl> <dbl>
## 1 Orange 2008 41019
## 2 Orange 2009 40541
## 3 Orange 2010 41113
## 4 Orange 2011 41693
## 5 Orange 2012 42260
## 6 Orange 2013 42781
## 7 Orange 2014 41821
## 8 Orange 2015 41718
## 9 Riverside 2008 31390
## 10 Riverside 2009 31624
## # … with 22 more rows
# Box plot of the yearly enrollment totals in `Education`, one box per county.
# The legend uses the county values themselves as labels. Hand-written labels
# are deliberately not supplied: they are matched to the scale's breaks by
# position, and the breaks are in alphabetical order (Orange, Riverside,
# San Bernardino, San Diego), so a vector in any other order attaches the
# wrong county name to each colour.
p4 <- Education %>%
  ggplot(aes(county, total, fill = county)) +
  geom_boxplot() +
  ggtitle("Enrollment Rate of Kindergarten Students in CA") +
  xlab("Counties") +
  ylab("Frequency") +
  scale_fill_discrete(name = "Counties") +
  theme_linedraw(base_size = 11)
p4
## Prepare for Scatterplot
# Per-year means of `enrollment` and `complete` across public schools for
# start years 2005-2015. The result stays grouped by pub_priv
# (`.groups = "drop_last"` makes that explicit and avoids the regrouping
# message). The grouping column is listed in select() because select() cannot
# drop it; naming it avoids the "Adding missing grouping variables" message
# while keeping the same column order.
my_df <- School %>%
  filter(
    start_year >= 2005 & start_year <= 2015 &
      pub_priv %in% c("Public")
  ) %>%
  group_by(pub_priv, start_year) %>%
  summarize(
    Average = mean(enrollment),
    Average_immune = mean(complete),
    .groups = "drop_last"
  ) %>%
  select(pub_priv, start_year, Average_immune, Average)
my_df
## # A tibble: 11 x 4
## # Groups: pub_priv [1]
## pub_priv start_year Average_immune Average
## <chr> <dbl> <dbl> <dbl>
## 1 Public 2005 78.5 84.1
## 2 Public 2006 76.5 82.1
## 3 Public 2007 75.4 81.5
## 4 Public 2008 75.9 82.5
## 5 Public 2009 75.2 82.2
## 6 Public 2010 76.4 83.9
## 7 Public 2011 78.7 86.2
## 8 Public 2012 78.4 86.4
## 9 Public 2013 79.6 87.9
## 10 Public 2014 79.4 87.5
## 11 Public 2015 82.9 88.9
# Scatter plot of average enrollment per school by start year, with a fitted
# linear trend line.
# The y aesthetic is `Average` (mean enrollment) so that the plotted values
# agree with the title and the y-axis label, which both describe enrollment.
# NOTE(review): if the immunization average was the intended measure, map
# y = Average_immune instead and update the title and axis label to match.
p5 <- ggplot(my_df, aes(x = start_year, y = Average, color = pub_priv)) +
  geom_point() +
  # `subtitle` is spelled out in full rather than relying on partial
  # argument matching.
  ggtitle(
    "Enrollment Rate of Kindergarten Students In CA",
    subtitle = "By School Type"
  ) +
  labs(x = "School Year", y = "Average # of Students Enrolled") +
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
  theme_classic(base_size = 11)
p5
## `geom_smooth()` using formula 'y ~ x'
For this project, I decided to focus on the Kindergarten.CSV file. I chose this project because I thought it would be interesting to explore. This dataset explores the enrollment and immunization rates within California counties and school districts. I do not have a personal connection to it because I do not have any children; however, the dataset contained useful information. Many parents or caretakers may be against immunizations, so I think it would be interesting for them to see this information. The variables range from categorical to numerical. In the data set, the categorical variables are “Pub_Priv, District, School.” The numerical variables are “Enrollment and Complete.” I started the project by loading the libraries and cleaning the data, which I did by using the na.omit function to remove all NA’s in the dataset. As I did more research, I found an article that examined immunization policies within the state of California. In 2016, a bill was passed that banned “personal belief exemptions for childhood vaccinations, immunization rates for children entering kindergarten” (Nyathi et al., 2019), which makes me believe that before then parents had the choice not to vaccinate their children. The bill was passed because there was a major outbreak of measles in 2014 and, interestingly enough, in my plot there was a slight increase in students who got vaccinated. After reviewing the article, I found it interesting that people were so against vaccinations. If I had more time, I would explore the more recent data on immunization, especially after the law was passed in California. In my first plot I focused on finding the sum of enrollment and immunization rate between the years 2005-2015. I used highcharter, and you can see that the immunization rate was slightly lower than enrollment, which is interesting because I thought it was required that students get vaccinated before attending school. With each plot, I tried to use different variables, as in the 2nd plot (which is the line graph).
I wasn’t too surprised that Los Angeles was leading in enrollment rate, but I thought other counties would be closer to Los Angeles. For that plot, I purposely chose the popular California counties. If I had more time, I would compare public and private institutions. I would also like to create a heat map of California comparing public and private school enrollment.
References
Nyathi, Sindiso, et al. “The 2016 California policy to eliminate nonmedical vaccine exemptions and changes in vaccine coverage: An empirical policy analysis.” PLoS Medicine, vol. 16, no. 12, 2019, p. e1002994. Gale In Context: Science, https://link.gale.com/apps/doc/A611297913/SCIC?u=rock77357&sid=SCIC&xid=1f27ad7a. Accessed 17 Nov. 2020.