Load the libraries and load the data
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
setwd("/Users/shoshigilg/Desktop/DATA 110/BOE")
votes <- read_csv("BOE_-_Precinct_Voter_Counts2.csv")
## Rows: 4470 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Year/Month
## dbl (10): Precinct, TOTAL, DEM, REP, GRN, LIB, WCP, BAR, UNA, OTHERS
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
votes$Precinct <- as.factor(votes$Precinct)
districts <- read_csv("District_Precinct.csv")
## Rows: 274 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): district, precinct
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
districts$district <- as.factor(districts$district)
districts$precinct <- as.factor(districts$precinct)
str(votes)
## spec_tbl_df [4,470 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Year/Month: chr [1:4470] "22-Jan" "22-Jan" "22-Jan" "22-Jan" ...
## $ Precinct : Factor w/ 270 levels "1001","1002",..: 95 90 147 228 172 179 42 154 35 78 ...
## $ TOTAL : num [1:4470] 1900 4363 2258 2277 3708 ...
## $ DEM : num [1:4470] 1074 2293 1294 1612 2289 ...
## $ REP : num [1:4470] 318 726 302 239 456 47 713 133 335 489 ...
## $ GRN : num [1:4470] 1 5 3 3 9 2 7 3 3 4 ...
## $ LIB : num [1:4470] 3 20 14 6 4 1 11 5 6 8 ...
## $ WCP : num [1:4470] 0 1 5 1 1 0 1 0 4 3 ...
## $ BAR : num [1:4470] 0 0 0 0 0 0 0 0 0 0 ...
## $ UNA : num [1:4470] 488 1288 613 405 899 ...
## $ OTHERS : num [1:4470] 16 30 27 11 50 6 44 18 16 33 ...
## - attr(*, "spec")=
## .. cols(
## .. `Year/Month` = col_character(),
## .. Precinct = col_double(),
## .. TOTAL = col_double(),
## .. DEM = col_double(),
## .. REP = col_double(),
## .. GRN = col_double(),
## .. LIB = col_double(),
## .. WCP = col_double(),
## .. BAR = col_double(),
## .. UNA = col_double(),
## .. OTHERS = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(districts)
## spec_tbl_df [274 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ district: Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ precinct: Factor w/ 272 levels "1001","1002",..: 1 2 3 4 5 6 7 8 9 11 ...
## - attr(*, "spec")=
## .. cols(
## .. district = col_double(),
## .. precinct = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Clean the data and select only certain variables
names(votes) <- tolower(names(votes))
names(votes) <- gsub("/", "_", names(votes))
votes2 <- votes %>%
select("year_month", "precinct", "total", "dem", "rep")
head(votes2)
## # A tibble: 6 × 5
## year_month precinct total dem rep
## <chr> <fct> <dbl> <dbl> <dbl>
## 1 22-Jan 6013 1900 1074 318
## 2 22-Jan 6007 4363 2293 726
## 3 22-Jan 9003 2258 1294 302
## 4 22-Jan 13024 2277 1612 239
## 5 22-Jan 9031 3708 2289 456
## 6 22-Jan 9038 446 309 47
Adding ‘other’ as a category of vote
votes2$other <- votes2$total - votes2$dem - votes2$rep
votes2
## # A tibble: 4,470 × 6
## year_month precinct total dem rep other
## <chr> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 22-Jan 6013 1900 1074 318 508
## 2 22-Jan 6007 4363 2293 726 1344
## 3 22-Jan 9003 2258 1294 302 662
## 4 22-Jan 13024 2277 1612 239 426
## 5 22-Jan 9031 3708 2289 456 963
## 6 22-Jan 9038 446 309 47 90
## 7 22-Jan 4020 4068 2236 713 1119
## 8 22-Jan 9010 864 491 133 240
## 9 22-Jan 4013 2186 1352 335 499
## 10 22-Jan 5017 3719 2431 489 799
## # … with 4,460 more rows
Remove ‘total’ from votes2 data frame
votes2 <- votes2 %>%
select("year_month", "precinct", "dem", "rep", "other")
head(votes2)
## # A tibble: 6 × 5
## year_month precinct dem rep other
## <chr> <fct> <dbl> <dbl> <dbl>
## 1 22-Jan 6013 1074 318 508
## 2 22-Jan 6007 2293 726 1344
## 3 22-Jan 9003 1294 302 662
## 4 22-Jan 13024 1612 239 426
## 5 22-Jan 9031 2289 456 963
## 6 22-Jan 9038 309 47 90
Fix the dates
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
votes3 <- votes2 %>%
mutate(date = as.Date(as.yearmon(year_month, format = "%b-%y")))
head(votes3)
## # A tibble: 6 × 6
## year_month precinct dem rep other date
## <chr> <fct> <dbl> <dbl> <dbl> <date>
## 1 22-Jan 6013 1074 318 508 NA
## 2 22-Jan 6007 2293 726 1344 NA
## 3 22-Jan 9003 1294 302 662 NA
## 4 22-Jan 13024 1612 239 426 NA
## 5 22-Jan 9031 2289 456 963 NA
## 6 22-Jan 9038 309 47 90 NA
Convert from wide to long
votes_long <- votes3 %>%
pivot_longer(
cols = 3:5,
names_to = c("party"),
values_to = "total_votes"
)
head(votes_long)
## # A tibble: 6 × 5
## year_month precinct date party total_votes
## <chr> <fct> <date> <chr> <dbl>
## 1 22-Jan 6013 NA dem 1074
## 2 22-Jan 6013 NA rep 318
## 3 22-Jan 6013 NA other 508
## 4 22-Jan 6007 NA dem 2293
## 5 22-Jan 6007 NA rep 726
## 6 22-Jan 6007 NA other 1344
Link precincts with districts
votes4 <- left_join(votes_long, districts, by = "precinct")
calculate total votes by district
votes_sum <- votes4 %>%
group_by(district, party) %>%
summarise(votes = sum(total_votes)) %>%
arrange(desc(votes)) ## votes_sum is a new variable created
## `summarise()` has grouped output by 'district'. You can override using the
## `.groups` argument.
votes_sum
## # A tibble: 15 × 3
## # Groups: district [5]
## district party votes
## <fct> <chr> <dbl>
## 1 3 dem 1583279
## 2 5 dem 1436627
## 3 4 dem 1270021
## 4 2 dem 1247568
## 5 1 dem 1168441
## 6 2 other 593177
## 7 3 other 566873
## 8 1 other 510283
## 9 5 other 498672
## 10 1 rep 421914
## 11 3 rep 402717
## 12 5 rep 395452
## 13 2 rep 373526
## 14 4 other 360770
## 15 4 rep 197545
# Barplot
p <- ggplot(data=votes_sum, aes(x=district, y=votes, fill=party)) +
geom_bar(stat="identity", position=position_dodge())
print(p + scale_fill_manual(values=c("blue", "purple", "red")) +
ggtitle("All Votes Cast in Montgomery County Board of Education Elections \nBy District and Party \nDecember 2020 to May 2022") +
labs(y = "Votes Cast", x = "District"))

The Essay
For Project 1, I choose Montgomery Counties Board of Education (BOE) Precinct Voter Counts. This dataset is from data.montgomerycountymd.gov and contains political parties updated monthly, with the most recent update being June 14, 2022. Variables included the precincts with Districts 1-5, total vote counts, and eight different political parties. I had to create an additional CSV file to correlate precincts with districts and altered the original BOE dataset to remove the leading zeroes on the precincts. I organized the precinct votes into districts using command str. Naming the votes, I cleaned up the data by only selecting and piping certain variables such as year_month, precinct, total, democrat, and republican. Thus, creating votes2. Then I created an additional variable category for other parties. I removed the ‘total’ from the votes2 data frame and organized the year_month, precinct, dem, rep, and other. Fixing the dates with zoo and lubridate packages, mutating the date = as.Date(as.yearmon(year_month, format = %b-%y" to format the dates. Creating the new dataset votes3 consolidates dem, rep, and other into a column called ‘party,’ which contains the party values—converted from wide to long, organizing each party’s total votes named votes_long. Using left_join votes4 dataset brought votes_long, districts organized by precinct. Summarizing the votes and calculating total votes by district, votes_sum was a new variable created.
I chose a grouped bar chart to represent all three parties dem for democrat, other for neither, and rep for republican in each of five Montgomery County BOE districts. Democrats are represented with blue, the other parties are purple, and the red is republican. The most interesting pattern is throughout all five districts, democrats are the majority, and Republicans are the minority, didn’t exceed the third parties or write-in. District 3, Bethesda and Potomac, has the highest vote count of all districts.
I could not figure out the legend title and labels to update. I wanted to replace abbreviations for parties with their full names ex. Democrat. When I tried to do so, R took away any other formatting. I had some technical issues that I could resolve, but overall, it was much cleaning to make the graph I wanted.