I start by removing some un-needed columns in the data before trying to reshape the file

elec_result_PC_all <- select(elec_result_PC_all, -startsWith("DivisionID"))
## Error in select(elec_result_PC_all, -startsWith("DivisionID")): could not find function "select"

I realised I hadn’t reloaded the tidyverse library since my computer battery died

library(tidyverse)
library(tidyselect)
library(tidyr)
library(dplyr)
elec_result_PC_all <- read_csv("/Users/Bec/R Projects/AT2/cleaned_data/elec_results_PC_all.csv")
glimpse(elec_result_PC_all)
## Observations: 3,035
## Variables: 24
## $ Postcode             <chr> "0800", "0810", "0812", "0820", "0822", "...
## $ `Electoral division` <chr> "Solomon", "Solomon", "Solomon", "Solomon...
## $ `Per cent`           <dbl> 100.0, 100.0, 100.0, 100.0, 96.1, 3.9, 10...
## $ DivisionID           <int> 307, 307, 307, 307, 306, 307, 307, 307, 3...
## $ StateAb.x            <chr> "NT", "NT", "NT", "NT", "NT", "NT", "NT",...
## $ CandidateID          <int> 28737, 28737, 28737, 28737, 28735, 28737,...
## $ GivenNm              <chr> "Luke", "Luke", "Luke", "Luke", "Warren",...
## $ Surname              <chr> "GOSLING", "GOSLING", "GOSLING", "GOSLING...
## $ PartyNm              <chr> "Australian Labor Party", "Australian Lab...
## $ PartyAb              <chr> "ALP", "ALP", "ALP", "ALP", "ALP", "ALP",...
## $ DivisionID2010       <int> 307, 307, 307, 307, 306, 307, 307, 307, 3...
## $ StateAb.y            <chr> "NT", "NT", "NT", "NT", "NT", "NT", "NT",...
## $ CandidateID2010      <int> 21477, 21477, 21477, 21477, 21513, 21477,...
## $ GivenNm2010          <chr> "Natasha", "Natasha", "Natasha", "Natasha...
## $ Surname2010          <chr> "GRIGGS", "GRIGGS", "GRIGGS", "GRIGGS", "...
## $ PartyNm2010          <chr> "CLP - The Territory Party", "CLP - The T...
## $ PartyAb2010          <chr> "CLP", "CLP", "CLP", "CLP", "ALP", "CLP",...
## $ DivisionID2013       <int> 307, 307, 307, 307, 306, 307, 307, 307, 3...
## $ StateAb              <chr> "NT", "NT", "NT", "NT", "NT", "NT", "NT",...
## $ CandidateID2013      <int> 23502, 23502, 23502, 23502, 23522, 23502,...
## $ GivenNm2013          <chr> "Natasha", "Natasha", "Natasha", "Natasha...
## $ Surname2013          <chr> "GRIGGS", "GRIGGS", "GRIGGS", "GRIGGS", "...
## $ PartyNm2013          <chr> "Country Liberals (NT)", "Country Liberal...
## $ PartyAb2013          <chr> "CLP", "CLP", "CLP", "CLP", "ALP", "CLP",...
elec_result_PC_all <- select_(elec_result_PC_all, -startsWith("DivisionID"))
## Error in startsWith("DivisionID"): argument "prefix" is missing, with no default

ok let’s try that again, I’m going to test this on ‘DivisionID’ - it looks like starts_with needed the underscore

elec_result_PC_all <- select(elec_result_PC_all, -starts_with("DivisionID"))

Thats worked, ok now for more columns

elec_result_PC_all <- select(elec_result_PC_all, -starts_with("DivisionID", "Surname", "GivenNm", "PartyAB", "CandidateID"))
## Error in starts_with("DivisionID", "Surname", "GivenNm", "PartyAB", "CandidateID"): unused arguments ("PartyAB", "CandidateID")

ok it looks like I can’t do multiple at once this way, I’m going to try one-of instead.

elec_result_PC_all <- select(elec_result_PC_all, -one_of(c("DivisionID", "CandidateID")))

That worked, but seems a bit tedious naming every single column seperately, there must be a better way, back to starts_with, at least that can remove column ‘groups’ with the same start.

elec_result_PC_all <- select(elec_result_PC_all, -starts_with("DivisionID", "Surname", "GivenNm", "PartyAb", "CandidateId"))
## Error in starts_with("DivisionID", "Surname", "GivenNm", "PartyAb", "CandidateId"): unused arguments ("PartyAb", "CandidateId")

No luck, how about contains

elec_result_PC_all <- select(elec_result_PC_all, -contains("DivisionID", "Surname", "GivenNm", "PartyAb", "CandidateId"))
## Error in contains("DivisionID", "Surname", "GivenNm", "PartyAb", "CandidateId"): unused arguments ("PartyAb", "CandidateId")

Back to starts_with but why does this only seem to work one by one (?), this will take a while

elec_result_PC_all <- select(elec_result_PC_all, -starts_with("Surname"))
elec_result_PC_all <- select(elec_result_PC_all, -starts_with("GivenNm"))
elec_result_PC_all <- select(elec_result_PC_all, -starts_with("PartyAb"))

How about a few at once

elec_result_PC_all <- select(elec_result_PC_all, -starts_with("Surname", "GivenNm"))
## Error in if (ignore.case) match <- tolower(match): argument is not interpretable as logical

Didn’t like that. The last column to clean up is state, but I want to leave one of them behind, the other 2 can go (we don’t need 3 state columns by year seeing as they don’t change each election)

rename(elec_result_PC_all, "State" = "StateAb.x")

This looks promising, let’s take a look

head(elec_result_PC_all)
## # A tibble: 6 x 11
##   Postcode `Electoral divi… `Per cent` StateAb.x PartyNm StateAb.y
##   <chr>    <chr>                 <dbl> <chr>     <chr>   <chr>    
## 1 0800     Solomon               100   NT        Austra… NT       
## 2 0810     Solomon               100   NT        Austra… NT       
## 3 0812     Solomon               100   NT        Austra… NT       
## 4 0820     Solomon               100   NT        Austra… NT       
## 5 0822     Lingiari               96.1 NT        Austra… NT       
## 6 0822     Solomon                 3.9 NT        Austra… NT       
## # ... with 5 more variables: CandidateID2010 <int>, PartyNm2010 <chr>,
## #   StateAb <chr>, CandidateID2013 <int>, PartyNm2013 <chr>

Now to remove those extra state by year columns.

elec_result_PC_all <- select(elec_result_PC_all, -matches("StateAb.y"))
elec_result_PC_all <- select(elec_result_PC_all, -matches("StateAb"))

And the final piece which is the CandidateID columns

elec_result_PC_all <- select(elec_result_PC_all, -starts_with("CandidateID"))

Now that I have a cleaner data file in wide format, I’d like to convert it to long format. Before I do this, I ned to make sure the postcode column is a factor.

elec_result_PC_all$Postcode <- factor(elec_result_PC_all$Postcode)

Now use the gather function to combine ‘PartyNm2016’, ‘PartyNm2013’, and PartyNm2010’ into the column ‘Party_Year’

elec_result_PC_all_long <- gather(elec_result_PC_all, Party_Year, Party_Name, PartyNm2016:PartyNm2013, factor_key = TRUE)
head(elec_result_PC_all_long)

Rename factor names from “PartyNm2016”, “PartyNm2010” and “PartyNm2013” to “2016”, “2010” and “2013”

levels(elec_result_PC_all_long$Party_Year)[levels(elec_result_PC_all_long$Party_Year)=="PartyNm2016"] <- "2016"
levels(elec_result_PC_all_long$Party_Year)[levels(elec_result_PC_all_long$Party_Year)=="PartyNm2013"] <- "2013"
levels(elec_result_PC_all_long$Party_Year)[levels(elec_result_PC_all_long$Party_Year)=="PartyNm2010"] <- "2010"

Done - ready for merging with the other datasets in long format & with unecessary columns removed.