Load Libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.1
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(readr)
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Set Working Directory & add CSV file

# Read the kindergarten CSV from an explicit path instead of calling setwd():
# changing the working directory is a session-wide side effect that would
# persist after this step.
data_dir <- "/Users/tiffanyking/Desktop/Data 110"
kindergarten <- read_csv(file.path(data_dir, "kindergarten_CA.csv"))
## Parsed with column specification:
## cols(
##   district = col_character(),
##   sch_code = col_double(),
##   county = col_character(),
##   pub_priv = col_character(),
##   school = col_character(),
##   enrollment = col_double(),
##   complete = col_double(),
##   start_year = col_double()
## )

Structure of the data

# Print a compact overview of the imported data: dimensions, column names,
# column types and the first few values of each column.
str(kindergarten)
## tibble [110,382 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ district  : chr [1:110382] "Alameda Unified" "Alameda Unified" "Alameda Unified" "Alameda Unified" ...
##  $ sch_code  : num [1:110382] 6967434 6110779 6100374 6090013 6090039 ...
##  $ county    : chr [1:110382] "Alameda" "Alameda" "Alameda" "Alameda" ...
##  $ pub_priv  : chr [1:110382] "Private" "Public" "Public" "Public" ...
##  $ school    : chr [1:110382] "ALAMEDA CHRTN" "BAY FARM ELEM" "EARHART (AMELIA) ELEM" "EDISON ELEM" ...
##  $ enrollment: num [1:110382] 12 78 77 56 41 75 40 80 61 49 ...
##  $ complete  : num [1:110382] 11 77 73 53 41 65 34 76 61 43 ...
##  $ start_year: num [1:110382] 2001 2001 2001 2001 2001 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   district = col_character(),
##   ..   sch_code = col_double(),
##   ..   county = col_character(),
##   ..   pub_priv = col_character(),
##   ..   school = col_character(),
##   ..   enrollment = col_double(),
##   ..   complete = col_double(),
##   ..   start_year = col_double()
##   .. )

Removing all NA’s

# Drop every row that contains at least one NA. The cleaned data is stored in
# `School`, which the later data-preparation steps start from.
School <- na.omit(kindergarten)
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(RColorBrewer)
library(plotly)

Prepare the data

# Yearly sums for start years 2005 through 2015, one row per start year:
#   total        - sum of `enrollment`
#   immune_total - sum of `complete`
Students <- School %>%
  filter(between(start_year, 2005, 2015)) %>%
  group_by(start_year) %>%
  summarise(
    total = sum(enrollment),
    immune_total = sum(complete)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
Students
## # A tibble: 11 x 3
##    start_year  total immune_total
##         <dbl>  <dbl>        <dbl>
##  1       2005 507224       471216
##  2       2006 497817       461879
##  3       2007 493626       455245
##  4       2008 496085       455199
##  5       2009 491653       448480
##  6       2010 505088       458409
##  7       2011 524336       477356
##  8       2012 525536       474871
##  9       2013 530530       478701
## 10       2014 531940       481256
## 11       2015 547520       508693
# Colours handed to hc_colors() for the first chart, listed in the order the
# two series are added (column series first, then the line series).
cols <- c("Red","black")

Highcharter

# Column + line chart of the yearly sums in `Students`: `total` is drawn as
# columns and `immune_total` as a line, with start years as x categories.
# NOTE(review): two y-axes are declared, but both series use yAxis = 0, so no
# series is attached to the second (opposite) axis - confirm this is intended.
highchart() %>%
  hc_chart(style = list(fontFamily = "Georgia", fontWeight = "bold")) %>%
  hc_colors(cols) %>%
  hc_xAxis(categories = Students$start_year, tickInterval = 2) %>%
  hc_yAxis_multiples(
    list(title = list(text = "Enrollment Rate")),
    list(title = list(text = "Complete Immun. Rate"), opposite = TRUE)
  ) %>%
  hc_add_series(
    data = Students$total,
    name = "Enrollment Rate",
    type = "column",
    yAxis = 0
  ) %>%
  hc_add_series(
    data = Students$immune_total,
    name = "Complete Immunization Rate",
    type = "line",
    yAxis = 0
  )

Prepare the data

# Sum of `enrollment` per county and start year (2008 through 2015) for six
# selected counties.
# The closing select() names only start_year and total; because the result is
# still grouped by county, dplyr re-adds that column (see the message below).
Gradeschool <- School %>%
  filter(
    county %in% c("Los Angeles", "San Diego","Orange", "Riverside", "San Bernardino", "Sacramento"),
    between(start_year, 2008, 2015)
  ) %>%
  group_by(county, start_year) %>%
  summarise(total = sum(enrollment)) %>%
  select(start_year, total)
## `summarise()` regrouping output by 'county' (override with `.groups` argument)
## Adding missing grouping variables: `county`
Gradeschool
## # A tibble: 48 x 3
## # Groups:   county [6]
##    county      start_year  total
##    <chr>            <dbl>  <dbl>
##  1 Los Angeles       2008 123892
##  2 Los Angeles       2009 118829
##  3 Los Angeles       2010 122736
##  4 Los Angeles       2011 129954
##  5 Los Angeles       2012 131102
##  6 Los Angeles       2013 130054
##  7 Los Angeles       2014 129494
##  8 Los Angeles       2015 133398
##  9 Orange            2008  41019
## 10 Orange            2009  40541
## # … with 38 more rows

Another plot to examine

# One colour per county line. `Gradeschool` holds six counties; a four-colour
# palette would make the chart reuse colours for the fifth and sixth series,
# so the palette is sized from the data instead of being hard-coded.
cols <- brewer.pal(length(unique(Gradeschool$county)), "Set1")

# Line chart of `total` by `start_year`, one series per county.
highchart() %>%
  hc_add_series(
    data = Gradeschool,
    type = "line",
    hcaes(x = start_year, y = total, group = county)
  ) %>%
  hc_colors(cols) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "Student Enrollment Rate"))

Overview of the data

# Print per-column summaries of the NA-free data: length/class/mode for the
# character columns, and min/quartiles/mean/max for the numeric columns.
summary(School)
##    district            sch_code          county            pub_priv        
##  Length:108730      Min.   :   1501   Length:108730      Length:108730     
##  Class :character   1st Qu.:6019863   Class :character   Class :character  
##  Mode  :character   Median :6048201   Mode  :character   Mode  :character  
##                     Mean   :5884901                                        
##                     3rd Qu.:6120430                                        
##                     Max.   :9999999                                        
##     school            enrollment        complete        start_year  
##  Length:108730      Min.   : 10.00   Min.   :  0.00   Min.   :2001  
##  Class :character   1st Qu.: 34.00   1st Qu.: 29.00   1st Qu.:2004  
##  Mode  :character   Median : 68.00   Median : 61.00   Median :2008  
##                     Mean   : 70.77   Mean   : 64.89   Mean   :2008  
##                     3rd Qu.: 98.00   3rd Qu.: 91.00   3rd Qu.:2012  
##                     Max.   :981.00   Max.   :973.00   Max.   :2015

Prepare the data for Box Plot

# Sum of `enrollment` per county and start year (2008 through 2015) for four
# selected counties; this is the input of the box plot.
# The closing select() names only start_year and total; because the result is
# still grouped by county, dplyr re-adds that column (see the message below).
Education <- School %>%
  filter(
    county %in% c("San Diego","Orange", "Riverside", "San Bernardino"),
    between(start_year, 2008, 2015)
  ) %>%
  group_by(county, start_year) %>%
  summarise(total = sum(enrollment)) %>%
  select(start_year, total)
## `summarise()` regrouping output by 'county' (override with `.groups` argument)
## Adding missing grouping variables: `county`
Education
## # A tibble: 32 x 3
## # Groups:   county [4]
##    county    start_year total
##    <chr>          <dbl> <dbl>
##  1 Orange          2008 41019
##  2 Orange          2009 40541
##  3 Orange          2010 41113
##  4 Orange          2011 41693
##  5 Orange          2012 42260
##  6 Orange          2013 42781
##  7 Orange          2014 41821
##  8 Orange          2015 41718
##  9 Riverside       2008 31390
## 10 Riverside       2009 31624
## # … with 22 more rows

Box Plot

# Box plot of the yearly enrollment totals in `Education`, one box per county,
# filled by county.
# The legend keeps its default labels (the county values themselves). A manual
# `labels` vector is matched to the fill levels by position, and those levels
# are sorted alphabetically, so a vector in any other order attaches the wrong
# county name to each colour.
p4 <- Education %>%
  ggplot(aes(county, total, fill = county)) +
  geom_boxplot() +
  ggtitle("Enrollment Rate of Kindergarten Students in CA") +
  xlab("Counties") +
  ylab("Frequency") +
  scale_fill_discrete(name = "Counties") +
  theme_linedraw(base_size = 11)
p4

Prepare the data for Scatterplot

# Per-year means for public schools, start years 2005 through 2015:
#   Average        - mean of `enrollment`
#   Average_immune - mean of `complete`
# The closing select() reorders the two mean columns; `pub_priv` is kept
# because the result is still grouped by it (see the message below).
my_df <- School %>%
  filter(pub_priv %in% c("Public"), between(start_year, 2005, 2015)) %>%
  group_by(pub_priv, start_year) %>%
  summarise(
    Average = mean(enrollment),
    Average_immune = mean(complete)
  ) %>%
  select(start_year, Average_immune, Average)
## `summarise()` regrouping output by 'pub_priv' (override with `.groups` argument)
## Adding missing grouping variables: `pub_priv`
my_df
## # A tibble: 11 x 4
## # Groups:   pub_priv [1]
##    pub_priv start_year Average_immune Average
##    <chr>         <dbl>          <dbl>   <dbl>
##  1 Public         2005           78.5    84.1
##  2 Public         2006           76.5    82.1
##  3 Public         2007           75.4    81.5
##  4 Public         2008           75.9    82.5
##  5 Public         2009           75.2    82.2
##  6 Public         2010           76.4    83.9
##  7 Public         2011           78.7    86.2
##  8 Public         2012           78.4    86.4
##  9 Public         2013           79.6    87.9
## 10 Public         2014           79.4    87.5
## 11 Public         2015           82.9    88.9

Scatterplot of Enrollment/Immunization Rates over the Years

# Scatter plot of `Average_immune` against `start_year` from `my_df`, coloured
# by `pub_priv`, with a fitted linear trend line (no confidence band).
# NOTE(review): the title and the y-axis label describe enrollment, while the
# plotted y variable is Average_immune - confirm which one is intended.
p5 <- ggplot(my_df, aes(x = start_year, y = Average_immune, color = pub_priv)) +
  geom_point() +
  # `subtitle` is spelled out rather than relying on partial matching of `sub`.
  ggtitle("Enrollment Rate of Kindergarten Students In CA", subtitle = "By School Type") +
  labs(x = "School Year", y = "Average # of Students Enrolled") +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  theme_classic(base_size = 11)
p5
## `geom_smooth()` using formula 'y ~ x'

Short Essay

For this project, I decided to focus on the Kindergarten CSV file. I chose this dataset because I thought it would be interesting to explore. It describes enrollment and immunization figures across California counties and school districts. I do not have a personal connection to it because I do not have any children; however, the dataset contains useful information. Many parents or caretakers may be against immunizations, so I think it would be interesting for them to see this information. The variables range from categorical to numerical. In the dataset, the categorical variables are “pub_priv,” “district,” and “school.” The numerical variables are “enrollment” and “complete.” I started the project by loading the libraries and cleaning the data, which I did by using the na.omit function to remove all NAs in the dataset. As I did more research, I found an article that examined immunization policies in the state of California. In 2016, a bill was passed that banned “personal belief exemptions for childhood vaccinations, immunization rates for children entering kindergarten” (Nyathi et al., 2019), which makes me believe that parents previously had the choice not to vaccinate their children. The bill came about because there was a major outbreak of measles in 2014 and, interestingly enough, my plot shows a slight increase in the number of students who got vaccinated. After reviewing the article, I found it interesting that people were so against vaccinations. If I had more time, I would explore more recent data on immunization, especially data collected after the law was passed in California. In my first plot, I focused on finding the sums of enrollment and completed immunizations between the years 2005 and 2015. I used highcharter, and you can see that the immunization total was slightly lower than the enrollment total, which is interesting because I thought it was required that students get vaccinated before attending school. With each plot, I tried to use different variables, as in the second plot (the line graph). 
I wasn’t too surprised that Los Angeles was leading in enrollment, but I thought other counties would be closer to Los Angeles. For that plot, I purposely chose the popular California counties. If I had more time, I would compare public and private institutions. I would also like to create a heat map of California comparing public and private school enrollment.

References

Nyathi, Sindiso, et al. “The 2016 California policy to eliminate nonmedical vaccine exemptions and changes in vaccine coverage: An empirical policy analysis.” PLoS Medicine, vol. 16, no. 12, 2019, p. e1002994. Gale In Context: Science, https://link.gale.com/apps/doc/A611297913/SCIC?u=rock77357&sid=SCIC&xid=1f27ad7a. Accessed 17 Nov. 2020.