library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 1.4.0
## Warning: package 'ggplot2' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
Loading the dataset
# Read kindergarten_CA.csv (path relative to the working directory) into a
# tibble. No column types are supplied, so readr guesses them and reports
# the column specification it settled on.
kindergarten_CA <- readr::read_csv(file = "kindergarten_CA.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## district = col_character(),
## sch_code = col_double(),
## county = col_character(),
## pub_priv = col_character(),
## school = col_character(),
## enrollment = col_double(),
## complete = col_double(),
## start_year = col_double()
## )
View, Data Structure, Summary
# View() opens a spreadsheet-style data viewer, which only makes sense when a
# person is driving the session. Guard it so that rendering or running this
# script non-interactively neither pops up a window nor fails for lack of a
# GUI.
if (interactive()) {
  View(kindergarten_CA)
}

# Compact structural overview of the loaded data: dimensions, column names,
# column types and the first few values of each column.
str(kindergarten_CA)
## spec_tbl_df [110,382 x 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ district : chr [1:110382] "Alameda Unified" "Alameda Unified" "Alameda Unified" "Alameda Unified" ...
## $ sch_code : num [1:110382] 6967434 6110779 6100374 6090013 6090039 ...
## $ county : chr [1:110382] "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ pub_priv : chr [1:110382] "Private" "Public" "Public" "Public" ...
## $ school : chr [1:110382] "ALAMEDA CHRTN" "BAY FARM ELEM" "EARHART (AMELIA) ELEM" "EDISON ELEM" ...
## $ enrollment: num [1:110382] 12 78 77 56 41 75 40 80 61 49 ...
## $ complete : num [1:110382] 11 77 73 53 41 65 34 76 61 43 ...
## $ start_year: num [1:110382] 2001 2001 2001 2001 2001 ...
## - attr(*, "spec")=
## .. cols(
## .. district = col_character(),
## .. sch_code = col_double(),
## .. county = col_character(),
## .. pub_priv = col_character(),
## .. school = col_character(),
## .. enrollment = col_double(),
## .. complete = col_double(),
## .. start_year = col_double()
## .. )
# Per-column summary: length/class/mode for character columns, and the
# five-number summary, mean and NA count for numeric columns.
kindergarten_CA %>%
  summary()
## district sch_code county pub_priv
## Length:110382 Min. : 1501 Length:110382 Length:110382
## Class :character 1st Qu.:6019905 Class :character Class :character
## Mode :character Median :6048706 Mode :character Mode :character
## Mean :5879880
## 3rd Qu.:6134460
## Max. :9999999
##
## school enrollment complete start_year
## Length:110382 Min. : 10.00 Min. : 0.00 Min. :2001
## Class :character 1st Qu.: 34.00 1st Qu.: 29.00 1st Qu.:2004
## Mode :character Median : 68.00 Median : 61.00 Median :2008
## Mean : 70.77 Mean : 64.89 Mean :2008
## 3rd Qu.: 98.00 3rd Qu.: 91.00 3rd Qu.:2012
## Max. :981.00 Max. :973.00 Max. :2015
## NA's :1652 NA's :1652
# Keep only the rows that have no missing value in any column, then show the
# cleaned tibble.
kindergarten_CA1 <- kindergarten_CA %>%
  na.omit()
print(kindergarten_CA1)
## # A tibble: 108,730 x 8
## district sch_code county pub_priv school enrollment complete start_year
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alameda U~ 6967434 Alameda Private ALAMEDA ~ 12 11 2001
## 2 Alameda U~ 6110779 Alameda Public BAY FARM~ 78 77 2001
## 3 Alameda U~ 6100374 Alameda Public EARHART ~ 77 73 2001
## 4 Alameda U~ 6090013 Alameda Public EDISON E~ 56 53 2001
## 5 Alameda U~ 6090039 Alameda Public FRANKLIN~ 41 41 2001
## 6 Alameda U~ 6090047 Alameda Public HAIGHT E~ 75 65 2001
## 7 Alameda U~ 6090062 Alameda Public LONGFELL~ 40 34 2001
## 8 Alameda U~ 6090005 Alameda Public LUM (DON~ 80 76 2001
## 9 Alameda U~ 6090088 Alameda Public MILLER (~ 61 61 2001
## 10 Alameda U~ 6090021 Alameda Public OTIS (FR~ 49 43 2001
## # ... with 108,720 more rows
Grab the first 30 rows of data
# Take the first 30 rows of the NA-free data as a small working subset for
# plotting, then show it.
KCA_df <- kindergarten_CA1 %>%
  head(n = 30)
print(KCA_df)
## # A tibble: 30 x 8
## district sch_code county pub_priv school enrollment complete start_year
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alameda U~ 6967434 Alameda Private ALAMEDA ~ 12 11 2001
## 2 Alameda U~ 6110779 Alameda Public BAY FARM~ 78 77 2001
## 3 Alameda U~ 6100374 Alameda Public EARHART ~ 77 73 2001
## 4 Alameda U~ 6090013 Alameda Public EDISON E~ 56 53 2001
## 5 Alameda U~ 6090039 Alameda Public FRANKLIN~ 41 41 2001
## 6 Alameda U~ 6090047 Alameda Public HAIGHT E~ 75 65 2001
## 7 Alameda U~ 6090062 Alameda Public LONGFELL~ 40 34 2001
## 8 Alameda U~ 6090005 Alameda Public LUM (DON~ 80 76 2001
## 9 Alameda U~ 6090088 Alameda Public MILLER (~ 61 61 2001
## 10 Alameda U~ 6090021 Alameda Public OTIS (FR~ 49 43 2001
## # ... with 20 more rows
Plotting Data
# Bar chart of enrollment by school type (public vs private) for the 30-row
# subset. Bar heights come straight from the `enrollment` values, segments
# are filled by school name and outlined in white so they stay
# distinguishable.
KCA_bar <- ggplot(
  data = KCA_df,
  mapping = aes(x = pub_priv, y = enrollment, fill = school)
) +
  geom_col(colour = "white", width = 0.8) +
  labs(
    x = "Public vs Private",
    y = "Enrollment",
    title = "Alameda County Public vs Private School"
  )

# Render the plot.
print(KCA_bar)
The dataset I chose is Kindergarten_CA, which comes from Professor Saidi’s dataset collection. It has a total of 8 variables. The categorical variables are district (school district), sch_code (unique identifying code for each school), county (the school’s county), pub_priv (whether the school is public or private), school (school name), and start_year (year of entry). The numerical variables are enrollment (number of children) and complete (number of children with complete immunizations). I chose this topic and dataset because I was unaware that children are required to complete their immunizations in order to attend school, and I wanted to educate myself about immunizations for kindergarteners. I read an article called “California schools have to submit vaccine rates, but face no consequences if they don’t” by Alya Shuman. The article explains that schools are required to submit their vaccination rates every year, but some schools do not comply and face no real consequences for it. https://eu.redding.com/story/news/local/2019/05/13/ca-schools-not-reporting-vaccination-rates-face-no-consequences/3576981002/