library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0
## Warning: package 'ggplot2' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)

Loading the dataset

# Read the kindergarten CSV (path is relative to the working directory);
# column types are guessed by readr and reported in the message below.
kindergarten_CA <- read_csv(file = "kindergarten_CA.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   district = col_character(),
##   sch_code = col_double(),
##   county = col_character(),
##   pub_priv = col_character(),
##   school = col_character(),
##   enrollment = col_double(),
##   complete = col_double(),
##   start_year = col_double()
## )

View, Data Structure, Summary

# Open the spreadsheet-style viewer only in an interactive session: View()
# needs a GUI and may fail or block when the document is knitted or the
# script is run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(kindergarten_CA)
}

# Compact overview of the column types and the first few values per column.
str(kindergarten_CA)
## spec_tbl_df [110,382 x 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ district  : chr [1:110382] "Alameda Unified" "Alameda Unified" "Alameda Unified" "Alameda Unified" ...
##  $ sch_code  : num [1:110382] 6967434 6110779 6100374 6090013 6090039 ...
##  $ county    : chr [1:110382] "Alameda" "Alameda" "Alameda" "Alameda" ...
##  $ pub_priv  : chr [1:110382] "Private" "Public" "Public" "Public" ...
##  $ school    : chr [1:110382] "ALAMEDA CHRTN" "BAY FARM ELEM" "EARHART (AMELIA) ELEM" "EDISON ELEM" ...
##  $ enrollment: num [1:110382] 12 78 77 56 41 75 40 80 61 49 ...
##  $ complete  : num [1:110382] 11 77 73 53 41 65 34 76 61 43 ...
##  $ start_year: num [1:110382] 2001 2001 2001 2001 2001 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   district = col_character(),
##   ..   sch_code = col_double(),
##   ..   county = col_character(),
##   ..   pub_priv = col_character(),
##   ..   school = col_character(),
##   ..   enrollment = col_double(),
##   ..   complete = col_double(),
##   ..   start_year = col_double()
##   .. )
# Per-column summary: length/class for character columns, quartiles, mean
# and NA counts for numeric columns.
kindergarten_CA %>% summary()
##    district            sch_code          county            pub_priv        
##  Length:110382      Min.   :   1501   Length:110382      Length:110382     
##  Class :character   1st Qu.:6019905   Class :character   Class :character  
##  Mode  :character   Median :6048706   Mode  :character   Mode  :character  
##                     Mean   :5879880                                        
##                     3rd Qu.:6134460                                        
##                     Max.   :9999999                                        
##                                                                            
##     school            enrollment        complete        start_year  
##  Length:110382      Min.   : 10.00   Min.   :  0.00   Min.   :2001  
##  Class :character   1st Qu.: 34.00   1st Qu.: 29.00   1st Qu.:2004  
##  Mode  :character   Median : 68.00   Median : 61.00   Median :2008  
##                     Mean   : 70.77   Mean   : 64.89   Mean   :2008  
##                     3rd Qu.: 98.00   3rd Qu.: 91.00   3rd Qu.:2012  
##                     Max.   :981.00   Max.   :973.00   Max.   :2015  
##                     NA's   :1652     NA's   :1652
# Keep only complete rows: any row with an NA in any column is dropped
# (the summary above reports NAs in enrollment and complete).
kindergarten_CA1 <- kindergarten_CA %>%
  na.omit()

# Show the cleaned tibble.
print(kindergarten_CA1)
## # A tibble: 108,730 x 8
##    district   sch_code county  pub_priv school    enrollment complete start_year
##    <chr>         <dbl> <chr>   <chr>    <chr>          <dbl>    <dbl>      <dbl>
##  1 Alameda U~  6967434 Alameda Private  ALAMEDA ~         12       11       2001
##  2 Alameda U~  6110779 Alameda Public   BAY FARM~         78       77       2001
##  3 Alameda U~  6100374 Alameda Public   EARHART ~         77       73       2001
##  4 Alameda U~  6090013 Alameda Public   EDISON E~         56       53       2001
##  5 Alameda U~  6090039 Alameda Public   FRANKLIN~         41       41       2001
##  6 Alameda U~  6090047 Alameda Public   HAIGHT E~         75       65       2001
##  7 Alameda U~  6090062 Alameda Public   LONGFELL~         40       34       2001
##  8 Alameda U~  6090005 Alameda Public   LUM (DON~         80       76       2001
##  9 Alameda U~  6090088 Alameda Public   MILLER (~         61       61       2001
## 10 Alameda U~  6090021 Alameda Public   OTIS (FR~         49       43       2001
## # ... with 108,720 more rows

Grab the first 30 rows of data

# Take the first 30 rows of the cleaned data as a small subset for plotting.
KCA_df <- kindergarten_CA1 %>%
  head(n = 30)

# Show the subset.
print(KCA_df)
## # A tibble: 30 x 8
##    district   sch_code county  pub_priv school    enrollment complete start_year
##    <chr>         <dbl> <chr>   <chr>    <chr>          <dbl>    <dbl>      <dbl>
##  1 Alameda U~  6967434 Alameda Private  ALAMEDA ~         12       11       2001
##  2 Alameda U~  6110779 Alameda Public   BAY FARM~         78       77       2001
##  3 Alameda U~  6100374 Alameda Public   EARHART ~         77       73       2001
##  4 Alameda U~  6090013 Alameda Public   EDISON E~         56       53       2001
##  5 Alameda U~  6090039 Alameda Public   FRANKLIN~         41       41       2001
##  6 Alameda U~  6090047 Alameda Public   HAIGHT E~         75       65       2001
##  7 Alameda U~  6090062 Alameda Public   LONGFELL~         40       34       2001
##  8 Alameda U~  6090005 Alameda Public   LUM (DON~         80       76       2001
##  9 Alameda U~  6090088 Alameda Public   MILLER (~         61       61       2001
## 10 Alameda U~  6090021 Alameda Public   OTIS (FR~         49       43       2001
## # ... with 20 more rows

Plotting Data

# Stacked bar chart of kindergarten enrollment by school type: one bar per
# pub_priv value, with each school's enrollment drawn as its own
# white-outlined segment (fill colour identifies the school).
KCA_bar <- ggplot(
  data = KCA_df,
  mapping = aes(x = pub_priv, y = enrollment, fill = school)
) +
  geom_col(color = "white", width = 0.8) +
  labs(
    x = "Public vs Private",
    y = "Enrollment",
    title = "Alameda County Public vs Private School"
  )

# Render the plot.
print(KCA_bar)

The dataset I chose is kindergarten_CA, which comes from Professor Saidi’s dataset collection. It has a total of 8 variables, including both categorical and numerical variables. The categorical variables are district (school district), sch_code (unique identifying code for each school), county (county of the school), pub_priv (whether the school is public or private), school (school name), and start_year (year of entry). The numerical variables are enrollment (number of children enrolled) and complete (number of children with complete immunizations). I chose this topic and dataset because I was unaware that children are required to complete their immunizations in order to attend school, and I wanted to educate myself on immunizations for kindergarteners. I also read an article called “California schools have to submit vaccine rates, but face no consequences if they don’t” by Alya Shuman. The article explains that schools are required to submit their vaccination rates every year, but some schools do not comply, and there are no real consequences for failing to do so. https://eu.redding.com/story/news/local/2019/05/13/ca-schools-not-reporting-vaccination-rates-face-no-consequences/3576981002/