library(rlang)
## Warning: package 'rlang' was built under R version 3.6.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::%@%() masks rlang::%@%()
## x purrr::as_function() masks rlang::as_function()
## x dplyr::filter() masks stats::filter()
## x purrr::flatten() masks rlang::flatten()
## x purrr::flatten_chr() masks rlang::flatten_chr()
## x purrr::flatten_dbl() masks rlang::flatten_dbl()
## x purrr::flatten_int() masks rlang::flatten_int()
## x purrr::flatten_lgl() masks rlang::flatten_lgl()
## x purrr::flatten_raw() masks rlang::flatten_raw()
## x purrr::invoke() masks rlang::invoke()
## x dplyr::lag() masks stats::lag()
## x purrr::list_along() masks rlang::list_along()
## x purrr::modify() masks rlang::modify()
## x purrr::prepend() masks rlang::prepend()
## x purrr::splice() masks rlang::splice()
# Read the kindergarten CSV into a tibble.
# NOTE(review): this is an absolute path on one specific machine; the script
# will not find the file anywhere else unless the path is adjusted.
Kindergarten_CA <- read_csv(
  file = "C:/Users/Maggie/Downloads/kindergarten_CA.csv"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## district = col_character(),
## sch_code = col_double(),
## county = col_character(),
## pub_priv = col_character(),
## school = col_character(),
## enrollment = col_double(),
## complete = col_double(),
## start_year = col_double()
## )
# Open the raw data in the interactive viewer, then print its structure
# (column names, types and the first few values of each column).
view(Kindergarten_CA)
str(Kindergarten_CA)
## tibble [110,382 x 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ district : chr [1:110382] "Alameda Unified" "Alameda Unified" "Alameda Unified" "Alameda Unified" ...
## $ sch_code : num [1:110382] 6967434 6110779 6100374 6090013 6090039 ...
## $ county : chr [1:110382] "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ pub_priv : chr [1:110382] "Private" "Public" "Public" "Public" ...
## $ school : chr [1:110382] "ALAMEDA CHRTN" "BAY FARM ELEM" "EARHART (AMELIA) ELEM" "EDISON ELEM" ...
## $ enrollment: num [1:110382] 12 78 77 56 41 75 40 80 61 49 ...
## $ complete : num [1:110382] 11 77 73 53 41 65 34 76 61 43 ...
## $ start_year: num [1:110382] 2001 2001 2001 2001 2001 ...
## - attr(*, "spec")=
## .. cols(
## .. district = col_character(),
## .. sch_code = col_double(),
## .. county = col_character(),
## .. pub_priv = col_character(),
## .. school = col_character(),
## .. enrollment = col_double(),
## .. complete = col_double(),
## .. start_year = col_double()
## .. )
# Is there at least one missing cell anywhere in the raw data?
Kindergarten_CA %>%
  is.na() %>%
  any()
## [1] TRUE
# Per-column summaries; numeric columns also list their NA counts.
summary(Kindergarten_CA)
## district sch_code county pub_priv
## Length:110382 Min. : 1501 Length:110382 Length:110382
## Class :character 1st Qu.:6019905 Class :character Class :character
## Mode :character Median :6048706 Mode :character Mode :character
## Mean :5879880
## 3rd Qu.:6134460
## Max. :9999999
##
## school enrollment complete start_year
## Length:110382 Min. : 10.00 Min. : 0.00 Min. :2001
## Class :character 1st Qu.: 34.00 1st Qu.: 29.00 1st Qu.:2004
## Mode :character Median : 68.00 Median : 61.00 Median :2008
## Mean : 70.77 Mean : 64.89 Mean :2008
## 3rd Qu.: 98.00 3rd Qu.: 91.00 3rd Qu.:2012
## Max. :981.00 Max. :973.00 Max. :2015
## NA's :1652 NA's :1652
# Build a copy of the data containing only rows with no missing values;
# the raw tibble is kept as-is for the missing-value analysis.
kindergarten_no_na <- Kindergarten_CA %>% na.omit()
# Confirm that no missing cell is left in the cleaned copy.
kindergarten_no_na %>%
  is.na() %>%
  any()
## [1] FALSE
summary(kindergarten_no_na)
## district sch_code county pub_priv
## Length:108730 Min. : 1501 Length:108730 Length:108730
## Class :character 1st Qu.:6019863 Class :character Class :character
## Mode :character Median :6048201 Mode :character Mode :character
## Mean :5884901
## 3rd Qu.:6120430
## Max. :9999999
## school enrollment complete start_year
## Length:108730 Min. : 10.00 Min. : 0.00 Min. :2001
## Class :character 1st Qu.: 34.00 1st Qu.: 29.00 1st Qu.:2004
## Mode :character Median : 68.00 Median : 61.00 Median :2008
## Mean : 70.77 Mean : 64.89 Mean :2008
## 3rd Qu.: 98.00 3rd Qu.: 91.00 3rd Qu.:2012
## Max. :981.00 Max. :973.00 Max. :2015
# Preview the first six rows of the cleaned data.
head(kindergarten_no_na, n = 6L)
## # A tibble: 6 x 8
## district sch_code county pub_priv school enrollment complete start_year
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alameda Un~ 6967434 Alame~ Private ALAMEDA C~ 12 11 2001
## 2 Alameda Un~ 6110779 Alame~ Public BAY FARM ~ 78 77 2001
## 3 Alameda Un~ 6100374 Alame~ Public EARHART (~ 77 73 2001
## 4 Alameda Un~ 6090013 Alame~ Public EDISON EL~ 56 53 2001
## 5 Alameda Un~ 6090039 Alame~ Public FRANKLIN ~ 41 41 2001
## 6 Alameda Un~ 6090047 Alame~ Public HAIGHT EL~ 75 65 2001
# Add `Percentage`: the `complete` count as a percentage of `enrollment`.
kindergarten_no_na <- kindergarten_no_na %>%
  mutate(Percentage = complete / enrollment * 100)
view(kindergarten_no_na)
# Count how often each distinct percentage occurs, then show the value(s)
# with the highest count, i.e. the mode.
Percent_Mode <- table(kindergarten_no_na$Percentage)
Percent_Mode[Percent_Mode == max(Percent_Mode)]
## 100
## 20173
# Rename the school-type column `pub_priv` to `Public_Private`.
kindergarten_no_na <- kindergarten_no_na %>%
  rename(Public_Private = pub_priv)
head(kindergarten_no_na, n = 6L)
## # A tibble: 6 x 9
## district sch_code county Public_Private school enrollment complete start_year
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alameda~ 6967434 Alame~ Private ALAME~ 12 11 2001
## 2 Alameda~ 6110779 Alame~ Public BAY F~ 78 77 2001
## 3 Alameda~ 6100374 Alame~ Public EARHA~ 77 73 2001
## 4 Alameda~ 6090013 Alame~ Public EDISO~ 56 53 2001
## 5 Alameda~ 6090039 Alame~ Public FRANK~ 41 41 2001
## 6 Alameda~ 6090047 Alame~ Public HAIGH~ 75 65 2001
## # ... with 1 more variable: Percentage <dbl>
# Rows of the raw data whose `complete` count is missing.
only_NAs_Complete <- Kindergarten_CA[is.na(Kindergarten_CA$complete), ]
view(only_NAs_Complete)
# Cross-tabulate those rows: school type (rows) by start year (columns).
NA_Counts_Complete <- table(
  only_NAs_Complete$pub_priv,
  only_NAs_Complete$start_year
)
view(NA_Counts_Complete)
# Grouped bars, one cluster per year. The legend labels are taken from the
# table's own row names so they always match the bars that are drawn, and
# `legend.text` is written out in full rather than relying on partial
# matching of `legend`.
barplot(
  NA_Counts_Complete,
  main = "Number of Public and Private School Who Did Not Report Immunizations By Year",
  xlab = "Year",
  beside = TRUE,
  legend.text = rownames(NA_Counts_Complete),
  col = c("light blue", "yellow")
)
# List each county that appears in the cleaned data, in order of appearance.
unique(kindergarten_no_na[["county"]])
## [1] "Alameda" "Amador" "Butte" "Calaveras"
## [5] "Colusa" "Contra Costa" "Del Norte" "El Dorado"
## [9] "Fresno" "Glenn" "Humboldt" "Imperial"
## [13] "Inyo" "Kern" "Kings" "Lake"
## [17] "Lassen" "Los Angeles" "Alpine" "Madera"
## [21] "Marin" "Mariposa" "Mendocino" "Merced"
## [25] "Modoc" "Mono" "Monterey" "Napa"
## [29] "Nevada" "Orange" "Placer" "Plumas"
## [33] "Riverside" "Sacramento" "San Benito" "San Bernardino"
## [37] "San Diego" "San Francisco" "San Joaquin" "San Luis Obispo"
## [41] "San Mateo" "Santa Barbara" "Santa Clara" "Santa Cruz"
## [45] "Shasta" "Sierra" "Siskiyou" "Solano"
## [49] "Sonoma" "Stanislaus" "Sutter" "Tehama"
## [53] "Trinity" "Tulare" "Tuolumne" "Ventura"
## [57] "Yolo" "Yuba"
# Number of rows per county (rows) and school type (columns).
table(
  kindergarten_no_na[["county"]],
  kindergarten_no_na[["Public_Private"]]
)
##
## Private Public
## Alameda 1308 3175
## Alpine 1 13
## Amador 7 91
## Butte 100 607
## Calaveras 9 156
## Colusa 10 72
## Contra Costa 844 2189
## Del Norte 10 117
## El Dorado 88 457
## Fresno 298 2663
## Glenn 1 112
## Humboldt 34 511
## Imperial 81 523
## Inyo 2 57
## Kern 278 2064
## Kings 77 426
## Lake 14 185
## Lassen 1 148
## Los Angeles 8727 18363
## Madera 30 440
## Marin 328 591
## Mariposa 0 70
## Mendocino 53 250
## Merced 97 745
## Modoc 0 47
## Mono 0 50
## Monterey 163 1053
## Napa 132 361
## Nevada 45 280
## Orange 2899 5812
## Placer 205 988
## Plumas 6 69
## Riverside 1182 4025
## Sacramento 848 3379
## San Benito 26 170
## San Bernardino 1016 4785
## San Diego 2074 6376
## San Francisco 881 1101
## San Joaquin 341 2063
## San Luis Obispo 154 595
## San Mateo 737 1601
## Santa Barbara 292 1096
## Santa Clara 1658 3635
## Santa Cruz 211 610
## Shasta 102 568
## Sierra 0 15
## Siskiyou 1 196
## Solano 222 873
## Sonoma 268 1422
## Stanislaus 188 1480
## Sutter 44 346
## Tehama 20 217
## Trinity 0 69
## Tulare 117 1413
## Tuolumne 30 154
## Ventura 700 1986
## Yolo 129 453
## Yuba 20 308
# Scatter plot of `complete` against `enrollment`, one panel per school type.
ggplot(kindergarten_no_na, aes(x = enrollment, y = complete)) +
  geom_point() +
  facet_wrap(vars(Public_Private)) +
  labs(title = "Total Enrollment vs. # of Children who complete Immunizations")
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive box plot of the completion percentage for each school type.
# The x-axis title font is set through the nested `title$font$family`
# attribute; the previous `font_family` entry is not an axis attribute, so
# the requested Courier New font was never applied.
P <- plot_ly(
  kindergarten_no_na,
  x = ~Public_Private,
  y = ~Percentage,
  type = "box"
) %>%
  layout(
    title = "Boxplot of Complete Immunizations Percentage by School Type",
    xaxis = list(
      title = list(
        text = "Type of School",
        font = list(family = "Courier New")
      )
    )
  )
P
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Box plots of the completion percentage per year, filled by school type,
# with one panel per county.
# `start_year` is numeric, and a continuous x does not split the boxes by
# year; converting it to a factor gives one box per year and school type,
# which is what the title describes.
kindergarten_no_na %>%
  ggplot(aes(factor(start_year), Percentage, fill = Public_Private)) +
  ggtitle("Percentage of Complete Immunizations Enrolled Per Year Between Public and Private School Across Counties") +
  xlab("Year") +
  geom_boxplot() +
  facet_wrap(~county)
# Completion percentage of every row over the years, coloured by county,
# with public and private schools in side-by-side panels.
ggplot(
  kindergarten_no_na,
  aes(x = start_year, y = Percentage, color = county)
) +
  geom_point() +
  facet_grid(cols = vars(Public_Private))
# Per start year, total `enrollment` and total `complete` over all rows.
DF <- summarise(
  group_by(kindergarten_no_na, start_year),
  sumenrollment = sum(enrollment),
  sumcomplete = sum(complete)
)
## `summarise()` ungrouping output (override with `.groups` argument)
DF
## # A tibble: 15 x 3
## start_year sumenrollment sumcomplete
## <dbl> <dbl> <dbl>
## 1 2001 517854 470856
## 2 2002 513560 474445
## 3 2003 507680 470176
## 4 2004 504450 468992
## 5 2005 507224 471216
## 6 2006 497817 461879
## 7 2007 493626 455245
## 8 2008 496085 455199
## 9 2009 491653 448480
## 10 2010 505088 458409
## 11 2011 524336 477356
## 12 2012 525536 474871
## 13 2013 530530 478701
## 14 2014 531940 481256
## 15 2015 547520 508693
# Print the yearly totals with the completion rate in percent appended.
# The result is not stored, and the new column is unnamed, so it is labelled
# with the text of the expression itself.
DF %>% mutate(sumcomplete / sumenrollment * 100)
## # A tibble: 15 x 4
## start_year sumenrollment sumcomplete `sumcomplete/sumenrollment * 100`
## <dbl> <dbl> <dbl> <dbl>
## 1 2001 517854 470856 90.9
## 2 2002 513560 474445 92.4
## 3 2003 507680 470176 92.6
## 4 2004 504450 468992 93.0
## 5 2005 507224 471216 92.9
## 6 2006 497817 461879 92.8
## 7 2007 493626 455245 92.2
## 8 2008 496085 455199 91.8
## 9 2009 491653 448480 91.2
## 10 2010 505088 458409 90.8
## 11 2011 524336 477356 91.0
## 12 2012 525536 474871 90.4
## 13 2013 530530 478701 90.2
## 14 2014 531940 481256 90.5
## 15 2015 547520 508693 92.9
# Completion rate per year, derived from the yearly totals in `DF` instead of
# being retyped by hand. The previous hard-coded vector listed 2005 as
# 90.90097, whereas the totals give 471216 / 507224 * 100 = 92.90097.
PercentagePerYear <- DF$sumcomplete / DF$sumenrollment * 100
# Years are stored as text, as in the original hand-written vector.
Year <- as.character(DF$start_year)
PPY <- data.frame(PercentagePerYear, Year)
PPY
## PercentagePerYear Year
## 1 90.92447 2001
## 2 92.38356 2002
## 3 92.61267 2003
## 4 92.97096 2004
## 5 92.90097 2005
## 6 92.78088 2006
## 7 92.22468 2007
## 8 91.75827 2008
## 9 91.21881 2009
## 10 90.75824 2010
## 11 91.04010 2011
## 12 90.35937 2012
## 13 90.23071 2013
## 14 90.47186 2014
## 15 92.90857 2015
# Line chart of the yearly completion percentage. `group = 1` joins all
# years into a single line even though `Year` is not numeric.
ggplot(PPY, aes(x = Year, y = PercentagePerYear, group = 1)) +
  geom_point() +
  geom_line() +
  labs(
    x = "Year",
    y = "Percentage",
    title = "Percentage of Children with Complete Immunization Enrolled by Year"
  )
library(viridis)
## Warning: package 'viridis' was built under R version 3.6.3
## Loading required package: viridisLite
# Heat map of the completion percentage per county and year, with one panel
# per school type.
# The data has one row per school and year, so drawing tiles directly from it
# stacked many tiles on the same county/year cell and only the last one drawn
# was visible. Each cell is therefore aggregated first, as total `complete`
# over total `enrollment`.
kindergarten_no_na %>%
  group_by(county, start_year, Public_Private) %>%
  summarise(
    percentage = sum(complete) / sum(enrollment) * 100,
    .groups = "drop"
  ) %>%
  ggplot(aes(start_year, county, fill = percentage)) +
  geom_tile(colour = "#FF67A4") +
  scale_fill_viridis() +
  ggtitle("Percentage by County each Year Between Private and Public School") +
  labs(x = "Year", y = "County") +
  facet_grid(. ~ Public_Private)
Essay:
My dataset is from the California Department of Public Health. It documents the number of kindergarten students enrolled and the number of children with complete immunizations in California from 2001 to 2015. The dataset has a total of 8 variables before mutate. The categorical variables are: district (school district), sch_code (unique identifying code for each school), county (the county the school is in), pub_priv (whether the school is private or public), school (name), and start_year (year of entry). The numerical variables are: enrollment (number of children enrolled) and complete (number of children with complete immunizations). The dataset started off with 1,652 rows containing missing values, but because I wanted to compare the number of missing values between private and public schools, I kept the original data and made a new data frame without the missing values by using the na.omit function. I also did not like the variable name “pub_priv,” so I decided to rename it to “Public_Private” using the rename function. I chose this topic and dataset because I was unaware of the requirement to complete immunizations in order to attend school, and I wanted to educate myself about it. According to an article I read, "California schools have to submit vaccine rates, but face no consequences if they don’t," written by Alayna Shulman, vaccination rates are required to be submitted every year, but some schools do not comply and there are no real consequences for them. According to the article, it was private schools, rather than public schools, that tended not to report immunization rates. Public schools depend on state money, whereas private schools have no state funding to lose, so there are really no consequences for private schools. The barplot comparing public and private schools’ missing values by year shows that private schools have a higher number of missing values, even though public schools have many more records in the dataset.
Another thing I noticed is the drastic difference in the percentage of children with complete immunizations between 2014 and 2015. According to another article, "Quick Guide: What schools and parents need to know about California’s vaccination law," written by Jane Meredith Adams and Diana Lambert, the reason for the big difference was the new 2015 vaccination law in California. Some parents had been able to opt out of immunizing their children because of their personal beliefs, but under the new vaccination law those exemptions are no longer allowed.
Adams, J. M., & Lambert, D. (2019, June 20). Quick Guide: What schools and parents need to know about California’s vaccination law. EdSource. https://edsource.org/2019/what-schools-and-parents-need-to-know-about-the-new-vaccination-law/82242
Shulman, A. R. R. S. (2019, May 13). California schools have to submit vaccine rates, but face no consequences if they don’t. Redding Record Searchlight. https://eu.redding.com/story/news/local/2019/05/13/ca-schools-not-reporting-vaccination-rates-face-no-consequences/3576981002/