Colorado Exploratory Data Analysis

set up

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

This didn’t work, and I’m not sure why, but I’d like to figure it out.

# Why the loop below fails: the left-hand side of `<-` has to be a variable
# name (or an indexing/replacement expression on one), not a call that returns
# a string, so `paste(...) <- value` is an error instead of creating CO.1,
# CO.2, ...
# To create a variable whose name is held in a string, use assign(), e.g.:
#   for (i in 1:7) {
#     assign(paste0("CO.", i),
#            read.csv(paste0("Registered_Voters_List_ Part", i, ".txt")))
#   }
# or keep the parts together in a list built with lapply().
# NOTE(review): the loop covers parts 1-7, but eight part files are listed
# further down in this script -- confirm the intended range.
# for (i in 1:7){
# paste('CO', i, sep = ".") <- read.csv(paste('Registered_Voters_List_ Part', i,  '.txt', sep = ""))
# }

Instead, reading each file with its own call was necessary, though it is less efficient.

# Read part 1 of the registered-voter list into CO.1.
CO.1 <- read.csv(file = "Registered_Voters_List_ Part1.txt")
# Parts 2-8 are left disabled; uncomment a line to read that part as well.
# CO.2 <- read.csv('Registered_Voters_List_ Part2.txt')
# CO.3 <- read.csv('Registered_Voters_List_ Part3.txt')
# CO.4 <- read.csv('Registered_Voters_List_ Part4.txt')
# CO.5 <- read.csv('Registered_Voters_List_ Part5.txt')
# CO.6 <- read.csv('Registered_Voters_List_ Part6.txt')
# CO.7 <- read.csv('Registered_Voters_List_ Part7.txt')
# CO.8 <- read.csv('Registered_Voters_List_ Part8.txt')

For the purposes of this markdown document, only one .txt file (Part1) was loaded.

This isn’t applicable right now, but will be with all the data loaded in

# Stack every CO.<i> part that is currently loaded into one data frame, CO.all.
# Parts that have not been read in are skipped, so this runs with a single
# file loaded as well as with all eight.
part.names <- paste0("CO.", 1:8)
is.loaded <- vapply(part.names, function(nm) exists(nm), logical(1))
if (!any(is.loaded)) {
  stop("No CO.<i> data frames are loaded; read at least one part first.",
       call. = FALSE)
}
CO.parts <- mget(part.names[is.loaded])

# Require identical column names, in identical order, for every loaded part
# (not only the first and the last) before combining them.
same.cols <- vapply(
  CO.parts,
  function(part) identical(colnames(part), colnames(CO.parts[[1]])),
  logical(1)
)
if (!all(same.cols)) {
  stop("Column names differ from the first loaded part in: ",
       paste(names(CO.parts)[!same.cols], collapse = ", "),
       call. = FALSE)
}

# unname() keeps the list names from being prefixed onto the row names, so the
# result matches a plain rbind(CO.1, CO.2, ...) call.
CO.all <- do.call(rbind, unname(CO.parts))

Examine what the column names are

# List the column names of the loaded voter data.
names(CO.1)
##  [1] "VOTER_ID"                "COUNTY_CODE"            
##  [3] "COUNTY"                  "LAST_NAME"              
##  [5] "FIRST_NAME"              "MIDDLE_NAME"            
##  [7] "NAME_SUFFIX"             "VOTER_NAME"             
##  [9] "STATUS_CODE"             "PRECINCT_NAME"          
## [11] "ADDRESS_LIBRARY_ID"      "HOUSE_NUM"              
## [13] "HOUSE_SUFFIX"            "PRE_DIR"                
## [15] "STREET_NAME"             "STREET_TYPE"            
## [17] "POST_DIR"                "UNIT_TYPE"              
## [19] "UNIT_NUM"                "RESIDENTIAL_ADDRESS"    
## [21] "RESIDENTIAL_CITY"        "RESIDENTIAL_STATE"      
## [23] "RESIDENTIAL_ZIP_CODE"    "RESIDENTIAL_ZIP_PLUS"   
## [25] "EFFECTIVE_DATE"          "REGISTRATION_DATE"      
## [27] "STATUS"                  "STATUS_REASON"          
## [29] "BIRTH_YEAR"              "GENDER"                 
## [31] "PRECINCT"                "SPLIT"                  
## [33] "VOTER_STATUS_ID"         "PARTY"                  
## [35] "PREFERENCE"              "PARTY_AFFILIATION_DATE" 
## [37] "PHONE_NUM"               "MAIL_ADDR1"             
## [39] "MAIL_ADDR2"              "MAIL_ADDR3"             
## [41] "MAILING_CITY"            "MAILING_STATE"          
## [43] "MAILING_ZIP_CODE"        "MAILING_ZIP_PLUS"       
## [45] "MAILING_COUNTRY"         "SPL_ID"                 
## [47] "PERMANENT_MAIL_IN_VOTER" "CONGRESSIONAL"          
## [49] "STATE_SENATE"            "STATE_HOUSE"            
## [51] "ID_REQUIRED"

create age variable

# Derive each voter's age as of the reference year.
# BIRTH_YEAR goes through as.character() first: if read.csv() stored the
# column as a factor, as.numeric() alone would return the internal level codes
# rather than the years. For a column that is already numeric the result is
# unchanged.
reference.year <- 2019
CO.1 <- mutate(
  CO.1,
  AGE = reference.year - as.numeric(as.character(BIRTH_YEAR))
)

Look at how many voters of each age are in each party

# Count voters at each age within each party.
age.party <- count(group_by(CO.1, PARTY), AGE)

# Keep only the rows for each of the two parties of interest.
DEM <- age.party %>% filter(PARTY == "DEM")
REP <- age.party %>% filter(PARTY == "REP")

# Voter count by age: DEM.
ggplot(data = DEM, mapping = aes(x = AGE, y = n)) +
  geom_col()

# Voter count by age: REP.
ggplot(data = REP, mapping = aes(x = AGE, y = n)) +
  geom_col()

look at spread of ages

# Plot the per-age counts from every party on a single chart.
ggplot(data = age.party, mapping = aes(x = AGE, y = n)) +
  geom_col()

Look at voter status, and at status by age

# Tabulate how many voters fall under each STATUS value.
table(CO.1[["STATUS"]])
## 
##   Active Inactive 
##   446999    53000
# Count voters at each age within each registration status.
age.status <- count(group_by(CO.1, STATUS), AGE)

# Keep only the rows for each status value.
Active <- age.status %>% filter(STATUS == "Active")
Inactive <- age.status %>% filter(STATUS == "Inactive")

# Voter count by age: Active.
ggplot(data = Active, mapping = aes(x = AGE, y = n)) +
  geom_col()

# Voter count by age: Inactive.
ggplot(data = Inactive, mapping = aes(x = AGE, y = n)) +
  geom_col()