library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v stringr 1.4.0
## v tidyr   1.2.0     v forcats 0.5.1
## v readr   2.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)

Loading the dataset

# Read the kindergarten CSV from the working directory into a tibble.
kindergarten_CA <- readr::read_csv(file = "kindergarten_CA.csv")
## Rows: 110382 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): district, county, pub_priv, school
## dbl (4): sch_code, enrollment, complete, start_year
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

View, Data Structure, Summary

# View() opens a GUI data viewer, which is only meaningful (and only reliably
# available) in an interactive session; skip it when the script is knitted or
# run in batch mode so the rest of the analysis still executes.
if (interactive()) {
  View(kindergarten_CA)
}

# Compact listing of every column with its type and first few values.
str(kindergarten_CA)
## spec_tbl_df [110,382 x 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ district  : chr [1:110382] "Alameda Unified" "Alameda Unified" "Alameda Unified" "Alameda Unified" ...
##  $ sch_code  : num [1:110382] 6967434 6110779 6100374 6090013 6090039 ...
##  $ county    : chr [1:110382] "Alameda" "Alameda" "Alameda" "Alameda" ...
##  $ pub_priv  : chr [1:110382] "Private" "Public" "Public" "Public" ...
##  $ school    : chr [1:110382] "ALAMEDA CHRTN" "BAY FARM ELEM" "EARHART (AMELIA) ELEM" "EDISON ELEM" ...
##  $ enrollment: num [1:110382] 12 78 77 56 41 75 40 80 61 49 ...
##  $ complete  : num [1:110382] 11 77 73 53 41 65 34 76 61 43 ...
##  $ start_year: num [1:110382] 2001 2001 2001 2001 2001 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   district = col_character(),
##   ..   sch_code = col_double(),
##   ..   county = col_character(),
##   ..   pub_priv = col_character(),
##   ..   school = col_character(),
##   ..   enrollment = col_double(),
##   ..   complete = col_double(),
##   ..   start_year = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary statistics (including NA counts for numeric columns).
kindergarten_CA %>% summary()
##    district            sch_code          county            pub_priv        
##  Length:110382      Min.   :   1501   Length:110382      Length:110382     
##  Class :character   1st Qu.:6019905   Class :character   Class :character  
##  Mode  :character   Median :6048706   Mode  :character   Mode  :character  
##                     Mean   :5879880                                        
##                     3rd Qu.:6134460                                        
##                     Max.   :9999999                                        
##                                                                            
##     school            enrollment        complete        start_year  
##  Length:110382      Min.   : 10.00   Min.   :  0.00   Min.   :2001  
##  Class :character   1st Qu.: 34.00   1st Qu.: 29.00   1st Qu.:2004  
##  Mode  :character   Median : 68.00   Median : 61.00   Median :2008  
##                     Mean   : 70.77   Mean   : 64.89   Mean   :2008  
##                     3rd Qu.: 98.00   3rd Qu.: 91.00   3rd Qu.:2012  
##                     Max.   :981.00   Max.   :973.00   Max.   :2015  
##                     NA's   :1652     NA's   :1652
# Drop every row that contains a missing value in any column.
kindergarten_CA1 <- kindergarten_CA %>%
  na.omit()

# Echo the cleaned tibble.
kindergarten_CA1
## # A tibble: 108,730 x 8
##    district       sch_code county pub_priv school enrollment complete start_year
##    <chr>             <dbl> <chr>  <chr>    <chr>       <dbl>    <dbl>      <dbl>
##  1 Alameda Unifi~  6967434 Alame~ Private  ALAME~         12       11       2001
##  2 Alameda Unifi~  6110779 Alame~ Public   BAY F~         78       77       2001
##  3 Alameda Unifi~  6100374 Alame~ Public   EARHA~         77       73       2001
##  4 Alameda Unifi~  6090013 Alame~ Public   EDISO~         56       53       2001
##  5 Alameda Unifi~  6090039 Alame~ Public   FRANK~         41       41       2001
##  6 Alameda Unifi~  6090047 Alame~ Public   HAIGH~         75       65       2001
##  7 Alameda Unifi~  6090062 Alame~ Public   LONGF~         40       34       2001
##  8 Alameda Unifi~  6090005 Alame~ Public   LUM (~         80       76       2001
##  9 Alameda Unifi~  6090088 Alame~ Public   MILLE~         61       61       2001
## 10 Alameda Unifi~  6090021 Alame~ Public   OTIS ~         49       43       2001
## # ... with 108,720 more rows

Grab the first 30 rows of data

# Keep only the leading 30 rows of the cleaned data for plotting.
KCA_df <- kindergarten_CA1 %>%
  head(n = 30)

# Echo the subset.
KCA_df
## # A tibble: 30 x 8
##    district       sch_code county pub_priv school enrollment complete start_year
##    <chr>             <dbl> <chr>  <chr>    <chr>       <dbl>    <dbl>      <dbl>
##  1 Alameda Unifi~  6967434 Alame~ Private  ALAME~         12       11       2001
##  2 Alameda Unifi~  6110779 Alame~ Public   BAY F~         78       77       2001
##  3 Alameda Unifi~  6100374 Alame~ Public   EARHA~         77       73       2001
##  4 Alameda Unifi~  6090013 Alame~ Public   EDISO~         56       53       2001
##  5 Alameda Unifi~  6090039 Alame~ Public   FRANK~         41       41       2001
##  6 Alameda Unifi~  6090047 Alame~ Public   HAIGH~         75       65       2001
##  7 Alameda Unifi~  6090062 Alame~ Public   LONGF~         40       34       2001
##  8 Alameda Unifi~  6090005 Alame~ Public   LUM (~         80       76       2001
##  9 Alameda Unifi~  6090088 Alame~ Public   MILLE~         61       61       2001
## 10 Alameda Unifi~  6090021 Alame~ Public   OTIS ~         49       43       2001
## # ... with 20 more rows

Plotting Data

# Stacked bar chart of enrollment by school type (public vs private), with one
# coloured segment per school.
# geom_col() is the idiomatic way to draw bars whose heights come straight from
# a y aesthetic (equivalent to geom_bar(stat = "identity")).
# NOTE(review): KCA_df is simply the first 30 rows of the cleaned data, not an
# explicit filter on county -- confirm the "Alameda County" title is accurate.
KCA_bar <- KCA_df %>%
  ggplot(aes(x = pub_priv, y = enrollment, fill = school)) +
  geom_col(colour = "white", width = 0.8) +
  labs(
    x = "Public vs Private",
    y = "Enrollment",
    title = "Alameda County Public vs Private School"
  )

# Render the plot.
KCA_bar