Let’s explore some of the data sets that exist in the package.

Cohorts Dataset:

library(rCAEDDATA)
data(cohorts)
head(cohorts)
##              CDS              Name AggLevel DFC Subgroup Subgrouptype
## 1 30664643030574 Aliso Niguel High        S   N      All            4
## 2 30664643030574 Aliso Niguel High        S   N      All            5
## 3 30664643030574 Aliso Niguel High        S   N      All            1
## 4 30664643030574 Aliso Niguel High        S   N      All            3
## 5 30664643030574 Aliso Niguel High        S   N      All            9
## 6 30664643030574 Aliso Niguel High        S   N      All            7
##   NumCohort NumGraduates Cohort Graduation Rate NumDropouts
## 1        23           22                  95.65           *
## 2        83           83                 100.00           *
## 3         *            *                 100.00           *
## 4         *            *                 100.00           *
## 5        25           25                 100.00           *
## 6       531          520                  97.93           *
##   Cohort Dropout Rate NumSpecialEducation Special Ed Completers Rate
## 1                 0.0                   *                        4.3
## 2                 0.0                   *                        0.0
## 3                 0.0                   *                        0.0
## 4                 0.0                   *                        0.0
## 5                 0.0                   *                        0.0
## 6                 1.1                   *                        0.8
##   NumStillEnrolled Still Enrolled Rate NumGED GED Rate Year
## 1                *                 0.0      *        0 0910
## 2                *                 0.0      *        0 0910
## 3                *                 0.0      *        0 0910
## 4                *                 0.0      *        0 0910
## 5                *                 0.0      *        0 0910
## 6                *                 0.2      *        0 0910
colnames(cohorts)
##  [1] "CDS"                        "Name"                      
##  [3] "AggLevel"                   "DFC"                       
##  [5] "Subgroup"                   "Subgrouptype"              
##  [7] "NumCohort"                  "NumGraduates"              
##  [9] "Cohort Graduation Rate"     "NumDropouts"               
## [11] "Cohort Dropout Rate"        "NumSpecialEducation"       
## [13] "Special Ed Completers Rate" "NumStillEnrolled"          
## [15] "Still Enrolled Rate"        "NumGED"                    
## [17] "GED Rate"                   "Year"
ncol(cohorts)
## [1] 18
nrow(cohorts)
## [1] 512812
# Let's find out how many unique cases there are in the Cohorts dataset. 
length(unique(cohorts$CDS)) 
## [1] 3759
# Let's find out the min and max number of cases. First, we need to compute the total number
# of frequency of each case in the dataset. 
cohorts1 <- data.frame(table(cohorts$Name))
summary(cohorts1$Freq) # this way, you can see the max and min. 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0    74.0   137.0   144.8   197.0 12842.0
# One way to find out which program has the max and min. 
cohorts1 <- cohorts1[order(cohorts1$Freq),]
# After sorting, you can use the following to see which program has the lowest # of cases
# and which one has the highest # of cases.
head(cohorts1) 
##                                            Var1 Freq
## 291                            Bear Valley High    4
## 806  Del Norte County Community Day (Secondary)    4
## 1264       Gridley Unified Community Day (7-12)    4
## 2426                  Pioneer/Quincy Elementary    4
## 2870                   SBC - Pacific Technology    4
## 58                  Alabama Hills Community Day    5
tail(cohorts1)
##                                   Var1  Freq
## 1686                      Liberty High   839
## 1422                 Independence High   920
## 1495              John F. Kennedy High  1036
## 860                    District Office  1085
## 1112                     Foothill High  1176
## 2142 Non-public, non-sectarian schools 12842
# Another way to find out which program has the max and min after knowing the max and min values. 
# Max = 12842
# Min = 4

cohorts1[which.max(cohorts1$Freq),]
##                                   Var1  Freq
## 2142 Non-public, non-sectarian schools 12842
cohorts1[which.min(cohorts1$Freq),]
##                 Var1 Freq
## 291 Bear Valley High    4
# These two commands would not give you all the rows that have max and min frequency. 
# They only give us the first and last max and min frequencies. 
cohorts1[cohorts1$Freq==min(cohorts1$Freq),]
##                                            Var1 Freq
## 291                            Bear Valley High    4
## 806  Del Norte County Community Day (Secondary)    4
## 1264       Gridley Unified Community Day (7-12)    4
## 2426                  Pioneer/Quincy Elementary    4
## 2870                   SBC - Pacific Technology    4
cohorts1[cohorts1$Freq==max(cohorts1$Freq),]
##                                   Var1  Freq
## 2142 Non-public, non-sectarian schools 12842
## cohorts' variables.

unique(cohorts$AggLevel)
## [1] "S" "O" "T" "D"
unique(cohorts$DFC)
## [1] "N" "Y"
unique(cohorts$Subgroup)
## [1] "All" "SE"  "MIG" "EL"  "SD"  "FEM" "MAL" "FOS"
unique(cohorts$Subgrouptype)
##  [1] "4"   "5"   "1"   "3"   "9"   "7"   "2"   "6"   "0"   "All"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Average graduation rate by each program
cohorts %>% group_by(Name) %>% summarize(GradRateAvg=round(mean(`Cohort Graduation Rate`))) %>% arrange(desc(GradRateAvg))
## # A tibble: 3,542 x 2
##                                                    Name GradRateAvg
##                                                   <chr>       <dbl>
##  1                          Alabama Hills Community Day         100
##  2          Aspire Vanguard College Preparatory Academy         100
##  3                         Baypoint Preparatory Academy         100
##  4                                     Bear Valley High         100
##  5                                      Big Sur Unified         100
##  6 Burbank Unified Independent Learning Academy (BUILA)         100
##  7                California Virtual Academy @ Maricopa         100
##  8                            Casa Blanca Community Day         100
##  9                          Central Coast New Tech High         100
## 10                         Charles J. Carver Elementary         100
## # ... with 3,532 more rows
# Average "still enrolled" rate by each  program
cohorts %>% group_by(Name) %>% summarize(StillEnrolledRateAvg=round(mean(`Still Enrolled Rate`))) %>% arrange(desc(StillEnrolledRateAvg))
## # A tibble: 3,542 x 2
##                                         Name StillEnrolledRateAvg
##                                        <chr>                <dbl>
##  1 Alfonso B. Perez Special Education Center                  100
##  2                          Brier Elementary                  100
##  3                   Career & Technical High                  100
##  4   Dunsmuir Joint Union High Community Day                  100
##  5                             Elite Academy                  100
##  6                                   Everest                  100
##  7                                  Highland                  100
##  8                                Loma Vista                  100
##  9                         Los Alisos Middle                  100
## 10       Mt. Burney Special Education Center                  100
## # ... with 3,532 more rows
# Average "special ed completion" rate by each program
cohorts %>% group_by(Name) %>% summarize(SpecialEdRateAvg=round(mean(`Special Ed Completers Rate`))) %>% arrange(desc(SpecialEdRateAvg))
## # A tibble: 3,542 x 2
##                                         Name SpecialEdRateAvg
##                                        <chr>            <dbl>
##  1                        La Contenta Middle              100
##  2           Colusa County Special Education               85
##  3  East County Elementary Special Education               77
##  4             West Sonoma County Consortium               71
##  5                           Monarch Academy               70
##  6                          Floyd I. Marchus               67
##  7                          Florence E. Rata               49
##  8 Heritage CCCOE Special Education Programs               41
##  9          Irvine Adult Transition Programs               40
## 10                           Sequoia Academy               40
## # ... with 3,532 more rows
# Average drop out rate by each program
cohorts %>% group_by(Name) %>% summarize(DropOutRateAvg=round(mean(`Cohort Dropout Rate`))) %>% arrange(desc(DropOutRateAvg))
## # A tibble: 3,542 x 2
##                                          Name DropOutRateAvg
##                                         <chr>          <dbl>
##  1                   Adelanto Charter Academy            100
##  2                        Adelanto Elementary            100
##  3                       Advent Community Day            100
##  4                 BUSD School of Opportunity            100
##  5                    Carr Lake Community Day            100
##  6                       Clearlake Creativity            100
##  7                           Corcoran Academy            100
##  8                 Cutler-Orosi Community Day            100
##  9 Del Norte County Community Day (Secondary)            100
## 10                        Dixon Community Day            100
## # ... with 3,532 more rows
## What if we want to combine all four rates into one table. 
fouravgcohorts <- cohorts %>% group_by(Name) %>% summarise_each(funs(mean(.,na.rm=TRUE)),c(`Cohort Graduation Rate`,`Still Enrolled Rate`,`Special Ed Completers Rate`,`Cohort Dropout Rate`))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`
head(fouravgcohorts)
## # A tibble: 6 x 5
##                              Name `Cohort Graduation Rate`
##                             <chr>                    <dbl>
## 1 21st Century Learning Institute                 67.38432
## 2     ABC Secondary (Alternative)                 41.55227
## 3                     ABC Unified                 91.51634
## 4     Abraham Lincoln Alternative                 43.43574
## 5    Abraham Lincoln Continuation                 61.79528
## 6            Abraham Lincoln High                 92.59166
## # ... with 3 more variables: `Still Enrolled Rate` <dbl>, `Special Ed
## #   Completers Rate` <dbl>, `Cohort Dropout Rate` <dbl>

Dropouts Dataset:

head(dropouts)
## # A tibble: 6 x 20
##         CDS_CODE ETHNIC GENDER    E7    E8    E9   E10   E11   E12   EUS
##            <chr>  <int>  <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 01100170130401      1      M     0     0     1     0     0     0     0
## 2 01100170130401      2      M     0     0     3     3     3     2     0
## 3 01100170130401      3      M     0     0     0     1     1     2     0
## 4 01100170130401      4      M     0     0     0     2     0     1     0
## 5 01100170130401      5      M     0     6     4    13    12    13     0
## 6 01100170130401      5      F     0     1     1     0     3     0     0
## # ... with 10 more variables: ETOT <int>, D7 <int>, D8 <int>, D9 <int>,
## #   D10 <int>, D11 <int>, D12 <int>, DUS <int>, DTOT <int>, YEAR <fctr>
data.frame(table(dropouts$GENDER))
##   Var1   Freq
## 1    F 556012
## 2    M 576533
# Ethinic = 0: Not reported
# Ethinic = 1: American Indian
# Ethinic = 2: Asian
# Ethinic = 3: Pacific Islander
# Ethinic = 4: Filipino
# Ethinic = 5: Hispanic
# Ethinic = 6: African-American
# Ethinic = 7: White
# Ethinic = 8|9: Two or More

## Let's graph proportion of student droupouts by genders grade 7-12
library(reshape2)
library(dplyr)
dropouts1 <- dropouts %>% 
  group_by(GENDER, add=TRUE) %>% 
  summarise_at(.vars=colnames(dropouts[12:17]),.funs=c(sum="sum")) %>%  
  mutate(D7 = D7_sum/sum(D7_sum),
                  D8 = D8_sum/sum(D8_sum),
                  D9 = D9_sum/sum(D9_sum),
                  D10 = D10_sum/sum(D10_sum),
                  D11 = D11_sum/sum(D11_sum),
                  D12 = D12_sum/sum(D12_sum)) %>% as.data.frame()

dropouts2 <- melt(dropouts1,id.vars="GENDER",
                  measure.vars=c("D7","D8","D9","D10","D11","D12"))
library(ggplot2)
dropouts2 %>% ggplot(aes(variable,value,fill=GENDER)) + 
  geom_bar(stat="identity",position="fill") + 
  labs(x="Grade",
       y="Percentage",
       title="Proportion of Student Dropouts by Gender, Grades 7-12",
       fill= "Gender")

Dropouts Dataset: A different approach to achieve the same plot as above:

library(tidyr) # have to load this library for "gather" function to work.
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
dropouts %>% 
  select(GENDER, matches("D[0-9]")) %>% 
  gather(GRADE, DROPOUTS, -GENDER) %>% 
  mutate(GRADE = as.numeric(stringr::str_replace(GRADE, "D", ""))) %>% 
  group_by(GENDER, GRADE) %>% 
  summarize(DROPOUTS = sum(DROPOUTS)) %>%
  ggplot(aes(GRADE, DROPOUTS, fill = GENDER)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_x_continuous(breaks = c(7:12)) +
  labs(x = "Grade",
       y = "",
       title = "Proportion of Student Dropouts by Gender, Grades 7-12",
       fill = "Gender") +
  theme_minimal()

## Let's graph proportion of student enrollments by genders grade 7-12
dropouts %>%
  select(GENDER,matches("E[0-9]")) %>%
  gather(GRADE, ENROLLMENTS, -GENDER) %>%
  group_by(GENDER,GRADE) %>%
  summarize(ENROLLMENTS = sum(ENROLLMENTS)) %>%
  ggplot(aes(GRADE,ENROLLMENTS,fill=GENDER)) +
           geom_bar(stat="identity",position="fill")+
           labs(x="Grade",
                y="Percentage",
                title = "Proportion of Student Enrollments by Gender, Grades 7-12",
                fill="Gender")

## Let's graph proportion of student enrollments by ethinicity grade 7-12
library(scales)
dropouts %>%
  select(ETHNIC, matches("E[0-9]")) %>%
  gather(GRADE, ENROLLMENTS,-ETHNIC) %>%
  mutate(GRADE = as.numeric(stringr::str_replace(GRADE,"E",""))) %>%
  group_by(ETHNIC,GRADE) %>%
  summarize(ENROLLMENTS=sum(ENROLLMENTS)) %>%
  ggplot(aes(GRADE,ENROLLMENTS,fill=ETHNIC)) +
  geom_bar(stat="identity",position="fill")+
  scale_x_continuous(breaks=c(7:12))+
  scale_y_continuous(labels=percent_format())+
  scale_fill_gradient(high="blue",low="green") +
  #scale_color_discrete()+
  #scale_color_gradientn(colours = terrain.colors(10))+
  labs(x="Grade",
       y="Percentage",
       title = "Proportion of Student Enrollments by Ethinicity, Grades 7-12",
       fill="Ethnicity")

## English Learner Dataset:

data(english_learners)
unique(english_learners$LANGUAGE)
##  [1] "Vietnamese"                                 
##  [2] "Cantonese"                                  
##  [3] "Japanese"                                   
##  [4] "All other non-English lan"                  
##  [5] "Russian"                                    
##  [6] "Spanish"                                    
##  [7] "Portuguese"                                 
##  [8] "German"                                     
##  [9] "Turkish"                                    
## [10] "Arabic"                                     
## [11] "Farsi (Persian)"                            
## [12] "Hindi"                                      
## [13] "Urdu"                                       
## [14] "Pashto"                                     
## [15] "Bengali"                                    
## [16] "Gujarati"                                   
## [17] "Korean"                                     
## [18] "Punjabi"                                    
## [19] "Albanian"                                   
## [20] "Khmer (Cambodian)"                          
## [21] "Mandarin (Putonghua)"                       
## [22] "Italian"                                    
## [23] "Tigrinya"                                   
## [24] "Filipino (Pilipino or Tag"                  
## [25] "Taiwanese"                                  
## [26] "Thai"                                       
## [27] "Tongan"                                     
## [28] "Cebuano (Visayan)"                          
## [29] "Ilocano"                                    
## [30] "Samoan"                                     
## [31] "Mien (Yao)"                                 
## [32] "Serbo-Croatian (Bosnian,"                   
## [33] "Indonesian"                                 
## [34] "French"                                     
## [35] "Hungarian"                                  
## [36] "Hmong"                                      
## [37] "Lao"                                        
## [38] "Chaozhou (Chaochow)"                        
## [39] "Dutch"                                      
## [40] "Greek"                                      
## [41] "Chamorro (Guamanian)"                       
## [42] "Hebrew"                                     
## [43] "Burmese"                                    
## [44] "Rumanian"                                   
## [45] "Polish"                                     
## [46] "Armenian"                                   
## [47] "Kurdish (Kurdi, Kurmanji)"                  
## [48] "Ukrainian"                                  
## [49] "Toishanese"                                 
## [50] "Assyrian"                                   
## [51] "Marshallese"                                
## [52] "Somali"                                     
## [53] "Lahu"                                       
## [54] "Khmu"                                       
## [55] "Mixteco"                                    
## [56] "Chaldean"                                   
## [57] "Other non-English languages"                
## [58] "Filipino (Pilipino or Tagalog)"             
## [59] "Serbo-Croatian (Bosnian, Croatian, Serbian)"
## [60] "Chaozhou (Chiuchow)"                        
## [61] "Unknown"                                    
## [62] "Tamil"                                      
## [63] "Telugu"                                     
## [64] "Kannada"                                    
## [65] "Marathi"                                    
## [66] "Swedish"                                    
## [67] "Amharic"                                    
## [68] "Bulgarian"                                  
## [69] "Zapoteco"                                   
## [70] "Uzbek"                                      
## [71] "Kashmiri"                                   
## [72] "Kikuyu (Gikuyu)"
# which schools that have Khmer speakers? 

english_learners %>% select(COUNTY,DISTRICT,YEAR,SCHOOL,LANGUAGE,TOTAL_EL, matches("Gr_[0-9]")) %>% 
  filter(LANGUAGE=="Khmer (Cambodian)") %>% top_n(10)
## Selecting by GR_12
## # A tibble: 10 x 18
##         COUNTY           DISTRICT    YEAR                SCHOOL
##          <chr>              <chr>   <chr>                 <chr>
##  1 Los Angeles Long Beach Unified 2007-08      Polytechnic High
##  2 San Joaquin       Lodi Unified 2007-08 Ronald E. McNair High
##  3 Los Angeles Long Beach Unified 2008-09      Polytechnic High
##  4 San Joaquin       Lodi Unified 2008-09 Ronald E. McNair High
##  5 Los Angeles Long Beach Unified 2009-10      Polytechnic High
##  6 San Joaquin       Lodi Unified 2009-10 Ronald E. McNair High
##  7 Los Angeles Long Beach Unified 2010-11      Polytechnic High
##  8 Los Angeles Long Beach Unified 2011-12      Polytechnic High
##  9 Los Angeles Long Beach Unified 2012-13      Polytechnic High
## 10 Los Angeles Long Beach Unified 2013-14      Polytechnic High
## # ... with 14 more variables: LANGUAGE <chr>, TOTAL_EL <int>, GR_1 <int>,
## #   GR_2 <int>, GR_3 <int>, GR_4 <int>, GR_5 <int>, GR_6 <int>,
## #   GR_7 <int>, GR_8 <int>, GR_9 <int>, GR_10 <int>, GR_11 <int>,
## #   GR_12 <int>
# which "language" school that has the highest number of english learners.
maxlearner = max(english_learners$TOTAL_EL)
english_learners %>% select(COUNTY,SCHOOL,LANGUAGE,TOTAL_EL) %>%
  filter(TOTAL_EL==maxlearner)
## # A tibble: 1 x 4
##   COUNTY                  SCHOOL LANGUAGE TOTAL_EL
##    <chr>                   <chr>    <chr>    <int>
## 1 Orange Access County Community  Spanish     2131
# Find top and bottom 10 schools ranked by the number of english learners.

english_learners %>% group_by(SCHOOL) %>% tally(TOTAL_EL) %>% top_n(10)
## Selecting by n
## # A tibble: 10 x 2
##                     SCHOOL     n
##                      <chr> <int>
##  1 Cesar Chavez Elementary 29353
##  2     Franklin Elementary 30307
##  3      Fremont Elementary 31732
##  4     Garfield Elementary 29058
##  5    Jefferson Elementary 61990
##  6      Lincoln Elementary 66301
##  7     McKinley Elementary 36650
##  8    Roosevelt Elementary 51068
##  9   Washington Elementary 60127
## 10       Wilson Elementary 34473
english_learners %>% group_by(SCHOOL) %>% tally(TOTAL_EL) %>% top_n(-10)
## Selecting by n
## # A tibble: 35 x 2
##                                                        SCHOOL     n
##                                                         <chr> <int>
##  1                           Alameda County Special Education     0
##  2                                            Archway Academy     0
##  3                                       Bass Lake Elementary     0
##  4                                       Benton Community Day     0
##  5                       Berkeley Special Education Preschool     0
##  6                                     Bridgeville Elementary     0
##  7                                         Cambrian Community     0
##  8 Charter Transitional Reporting Educational Center (TREC I)     0
##  9                                          Chrysalis Charter     0
## 10                                              Credence High     0
## # ... with 25 more rows

Enrollment Dataset:

# Let's graph diversity in FIVE districts of LA, 2007-2017
data(enrollments)
head(enrollments)
## # A tibble: 6 x 24
##         CDS_CODE ETHNIC GENDER  KDGN  GR_1  GR_2  GR_3  GR_4  GR_5  GR_6
##            <chr>  <int>  <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 01100170130401      1      M     0     0     0     0     0     0     0
## 2 01100170130401      2      F     0     0     0     0     0     0     0
## 3 01100170130401      2      M     0     0     0     0     0     0     0
## 4 01100170130401      3      M     0     0     0     0     0     0     0
## 5 01100170130401      4      M     0     0     0     0     0     0     0
## 6 01100170130401      5      F     0     0     0     0     0     0     0
## # ... with 14 more variables: GR_7 <int>, GR_8 <int>, UNGR_ELM <int>,
## #   GR_9 <int>, GR_10 <int>, GR_11 <int>, GR_12 <int>, UNGR_SEC <int>,
## #   ENR_TOTAL <int>, ADULT <int>, YEAR <fctr>, COUNTY <chr>,
## #   DISTRICT <chr>, SCHOOL <chr>
# Let's randomly pick the first five different districts in LA. 
enrollments %>% select(DISTRICT,COUNTY) %>% distinct(DISTRICT,COUNTY) %>% filter(COUNTY=="Los Angeles")
## # A tibble: 104 x 2
##                         DISTRICT      COUNTY
##                            <chr>       <chr>
##  1           Los Angeles Unified Los Angeles
##  2                  Lowell Joint Los Angeles
##  3               Lynwood Unified Los Angeles
##  4                    Los Nietos Los Angeles
##  5   Little Lake City Elementary Los Angeles
##  6            Long Beach Unified Los Angeles
##  7 Hermosa Beach City Elementary Los Angeles
##  8             Inglewood Unified Los Angeles
##  9          Las Virgenes Unified Los Angeles
## 10           Lawndale Elementary Los Angeles
## # ... with 94 more rows
# Los Angeles Unified
# Lowell Joint
# Lynwood Unified
# Los Nietos
# Long Beach Unified

# Ethinic = 0: Not reported
# Ethinic = 1: American Indian
# Ethinic = 2: Asian
# Ethinic = 3: Pacific Islander
# Ethinic = 4: Filipino
# Ethinic = 5: Hispanic
# Ethinic = 6: African-American
# Ethinic = 7: White
# Ethinic = 8|9: Two or More

enrollments %>%
  mutate(ETHNIC= case_when(ETHNIC==0 ~"Not Reported",
         ETHNIC==1 ~"American Indian",
         ETHNIC==2 ~"Asian",
         ETHNIC==3 ~"Pacific Islander",
         ETHNIC==4 ~"Filipino",
         ETHNIC==5 ~"Hispanic",
         ETHNIC==6 ~"African-American",
         ETHNIC==7 ~"White",
         ETHNIC==8 | ETHNIC==9 ~"Tow or More")) %>%
  filter(DISTRICT==c("Los Angeles Unified","Lowell Joint",
                     "Lynwood Unified","Los Nietos",
                     "Long Beach Unified")) %>%
  select(ETHNIC,DISTRICT,ENR_TOTAL,YEAR) %>%
  group_by(ETHNIC,DISTRICT,YEAR) %>% summarise(`Total Enrollments` = sum(ENR_TOTAL)) %>%
  ggplot(aes(YEAR,`Total Enrollments`,fill=ETHNIC)) +
    geom_bar(stat="identity",position="fill") +
    facet_wrap(~DISTRICT, nrow = 5) +
    labs(title="Enrollment Diversity in 5 Districts of Los Angeles, 2007-2017",fill="Ethinicity")

## Suspensions Dataset:

data(suspensions)
head(suspensions)
## # A tibble: 6 x 15
##   AGGEGATELEVEL            CDS    NAME DISCIPLINETYPE ETHNICITY WEAPONS
##           <chr>          <chr>   <chr>          <chr>     <int>   <int>
## 1             O 01000000000000 Alameda              E         2       1
## 2             O 01000000000000 Alameda              E         6       5
## 3             O 01000000000000 Alameda              E         4       0
## 4             O 01000000000000 Alameda              E         5      12
## 5             O 01000000000000 Alameda              E         1       1
## 6             O 01000000000000 Alameda              E         3       2
## # ... with 9 more variables: DRUGS <int>, VIOLENCEWITHINJURY <int>,
## #   VIOLENCEWITHOUTINJURY <int>, OTHERNONDEFIANCE <int>,
## #   OTHERDEFIANCE <int>, TOTAL <int>, YEAR <fctr>, DATECREATED <chr>,
## #   DATEUPDATED <chr>
library(maps)
library(ggmap)
library(mapdata)
states <- map_data("state")
ca_df <- subset(states, region == "california")
counties <- map_data("county")
ca_county <- subset(counties, region == "california")

drugtotal <- suspensions %>% select(YEAR,DRUGS,TOTAL,AGGEGATELEVEL,NAME) %>% 
  group_by(NAME,YEAR) %>% 
  filter(YEAR=="2014-15",AGGEGATELEVEL=="O") %>%
  summarise(DRUG=sum(DRUGS,na.rm=TRUE),TOTAL=sum(TOTAL,na.rm=TRUE), Drug_prop = round(DRUG/TOTAL,2)) %>%
  arrange(desc(Drug_prop))
drugmapdata <- left_join(ca_county, 
                         drugtotal %>%  mutate(subregion = stringr::str_to_lower(NAME)),
                         by="subregion")
ggplot(data = ca_df, mapping = aes(x = long, y = lat, group = group)) + 
  coord_fixed(1.3) + 
  geom_polygon(color = "black", fill = "gray") +
  geom_polygon(data = drugmapdata, aes(fill = Drug_prop), color = "white") +
  geom_polygon(color = "black", fill = NA) +
  labs(title = "Proportion of Drugs-Related Suspensions by County, 2014-2015",
       fill = "Proportion") +
  theme_void() +
  viridis::scale_fill_viridis()

## Acknowledgement: #https://github.com/daranzolin/rCAEDDATA