library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v stringr 1.4.0
## v tidyr 1.2.0 v forcats 0.5.1
## v readr 2.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
Loading the Dataset
# Read the kindergarten CSV from the working directory into a tibble.
# Column types are left for readr to guess.
kindergarten_CA <- readr::read_csv(file = "kindergarten_CA.csv")
## Rows: 110382 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): district, county, pub_priv, school
## dbl (4): sch_code, enrollment, complete, start_year
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
View the Data, Its Structure, and a Summary
# View() opens a spreadsheet-style data viewer, which only makes sense when a
# person is driving the session. Guard it so that batch runs (Rscript, knitting)
# neither fail on a headless machine nor pop up a stray window.
if (interactive()) {
  View(kindergarten_CA)
}
# Compact overview of every column: its type and first few values.
str(kindergarten_CA)
## spec_tbl_df [110,382 x 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ district : chr [1:110382] "Alameda Unified" "Alameda Unified" "Alameda Unified" "Alameda Unified" ...
## $ sch_code : num [1:110382] 6967434 6110779 6100374 6090013 6090039 ...
## $ county : chr [1:110382] "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ pub_priv : chr [1:110382] "Private" "Public" "Public" "Public" ...
## $ school : chr [1:110382] "ALAMEDA CHRTN" "BAY FARM ELEM" "EARHART (AMELIA) ELEM" "EDISON ELEM" ...
## $ enrollment: num [1:110382] 12 78 77 56 41 75 40 80 61 49 ...
## $ complete : num [1:110382] 11 77 73 53 41 65 34 76 61 43 ...
## $ start_year: num [1:110382] 2001 2001 2001 2001 2001 ...
## - attr(*, "spec")=
## .. cols(
## .. district = col_character(),
## .. sch_code = col_double(),
## .. county = col_character(),
## .. pub_priv = col_character(),
## .. school = col_character(),
## .. enrollment = col_double(),
## .. complete = col_double(),
## .. start_year = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary: quantiles and NA counts for numeric columns,
# length/class/mode for character columns.
kindergarten_CA %>%
  summary()
## district sch_code county pub_priv
## Length:110382 Min. : 1501 Length:110382 Length:110382
## Class :character 1st Qu.:6019905 Class :character Class :character
## Mode :character Median :6048706 Mode :character Mode :character
## Mean :5879880
## 3rd Qu.:6134460
## Max. :9999999
##
## school enrollment complete start_year
## Length:110382 Min. : 10.00 Min. : 0.00 Min. :2001
## Class :character 1st Qu.: 34.00 1st Qu.: 29.00 1st Qu.:2004
## Mode :character Median : 68.00 Median : 61.00 Median :2008
## Mean : 70.77 Mean : 64.89 Mean :2008
## 3rd Qu.: 98.00 3rd Qu.: 91.00 3rd Qu.:2012
## Max. :981.00 Max. :973.00 Max. :2015
## NA's :1652 NA's :1652
# Discard every row that has a missing value in any column, then display
# the cleaned tibble.
kindergarten_CA1 <- kindergarten_CA %>%
  na.omit()
print(kindergarten_CA1)
## # A tibble: 108,730 x 8
## district sch_code county pub_priv school enrollment complete start_year
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alameda Unifi~ 6967434 Alame~ Private ALAME~ 12 11 2001
## 2 Alameda Unifi~ 6110779 Alame~ Public BAY F~ 78 77 2001
## 3 Alameda Unifi~ 6100374 Alame~ Public EARHA~ 77 73 2001
## 4 Alameda Unifi~ 6090013 Alame~ Public EDISO~ 56 53 2001
## 5 Alameda Unifi~ 6090039 Alame~ Public FRANK~ 41 41 2001
## 6 Alameda Unifi~ 6090047 Alame~ Public HAIGH~ 75 65 2001
## 7 Alameda Unifi~ 6090062 Alame~ Public LONGF~ 40 34 2001
## 8 Alameda Unifi~ 6090005 Alame~ Public LUM (~ 80 76 2001
## 9 Alameda Unifi~ 6090088 Alame~ Public MILLE~ 61 61 2001
## 10 Alameda Unifi~ 6090021 Alame~ Public OTIS ~ 49 43 2001
## # ... with 108,720 more rows
Grab the First 30 Rows of Data
# Keep only the leading 30 rows of the NA-free data as a small working
# sample, then display it.
KCA_df <- kindergarten_CA1 %>%
  head(n = 30)
print(KCA_df)
## # A tibble: 30 x 8
## district sch_code county pub_priv school enrollment complete start_year
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alameda Unifi~ 6967434 Alame~ Private ALAME~ 12 11 2001
## 2 Alameda Unifi~ 6110779 Alame~ Public BAY F~ 78 77 2001
## 3 Alameda Unifi~ 6100374 Alame~ Public EARHA~ 77 73 2001
## 4 Alameda Unifi~ 6090013 Alame~ Public EDISO~ 56 53 2001
## 5 Alameda Unifi~ 6090039 Alame~ Public FRANK~ 41 41 2001
## 6 Alameda Unifi~ 6090047 Alame~ Public HAIGH~ 75 65 2001
## 7 Alameda Unifi~ 6090062 Alame~ Public LONGF~ 40 34 2001
## 8 Alameda Unifi~ 6090005 Alame~ Public LUM (~ 80 76 2001
## 9 Alameda Unifi~ 6090088 Alame~ Public MILLE~ 61 61 2001
## 10 Alameda Unifi~ 6090021 Alame~ Public OTIS ~ 49 43 2001
## # ... with 20 more rows
Plotting Data
# Stacked bar chart of enrollment for public vs. private schools, with one
# coloured segment per school.
# geom_col() is the idiomatic form of geom_bar(stat = "identity"): bar heights
# come straight from the `enrollment` values rather than from row counts.
# NOTE(review): the title says "Alameda County", but KCA_df is simply the first
# 30 rows of the data -- confirm that all of them belong to Alameda County.
KCA_bar <- KCA_df %>%
  ggplot(aes(x = pub_priv, y = enrollment, fill = school)) +
  geom_col(colour = "white", width = 0.8) +
  labs(
    x = "Public vs Private",
    y = "Enrollment",
    title = "Alameda County Public vs Private School"
  )
KCA_bar