The data was taken from Australian Marriage Law Postal Survey, 2017.
Australian Marriage Law Postal Survey, 2017 was conducted by the Australian Bureau of Statistics. This survey tries to find out people’s views on legalizing same-sex marriage in Australia.
Two excel sheets are being used in the coming tasks:
Australian Marriage Law Postal Survey 2017 - Participation contains information on participants, Distributed over different age bands. A further division by (State/federal division) and gender(male/female)are provided in different sheets.
In this particular script sheet 5 and sheet 6 are going to be used:
sheet 5 contains male participants distributed over different age bands (RHS) and federal division(LHS).
sheet 6 contains female participants distributed over different age bands and federal divisions.
The image is the screenshot of sheet 5. Sheet 5 and 6 are identical in the arrangement.
Australian Marriage Law Postal Survey 2017 - Response contains information on responses of various participants of the study.
In this analysis sheet, 3 of the data will be used from the Australian Marriage Law Postal Survey 2017 - Response the sheet contains responses of eligible participants and also contains info on non-responding participant and unclear responses over different federal divisions
The data is not in tidy format. Reason: In Australian Marriage Law Postal Survey 2017 - Participation, the clumms age brackets are distributed over columns that directly violate tidy data rules. To make data a tidy df, a gather operation is to be performed to create a column with age brackets as different levels of columns.
You can understand better about tidy data rules here.
To tidy these data a good portion of the gamut of tidyverse techniques will be used.
XLConnecttidyrdplyrjanitorstringr and rebus suppressMessages(library(XLConnect)) # dloading data
suppressMessages(library(stringr)) # string manipulation
suppressMessages(library(dplyr)) # data manipulation
suppressMessages(library(janitor)) # data manipulation
suppressMessages(library(tidyr)) # data manipulation
suppressMessages(library(rebus)) # string manipulation
files <- dir("Dirty_data", full = T)
if(length(files)==0 ){
warning("FIRST YOU MUST CREATE A NEW R PROJEC THEN COPY THE FILES DOWNLOADED FROM GIT HUB
AND PASTE ALL INTO THE NEW PROJECT FOLDER") }
my_book <- loadWorkbook(files[1])
book <- loadWorkbook(files[2])
readingFiles <- function(sex, col_num, col_nam, sheet_num){
colnames <- readWorksheet(my_book, sheet=5, endRow =16)
colnames <- unlist( colnames[5,] )
names(colnames) <- NULL
no <- readWorksheet(my_book, sheet=4, startRow =6 , startCol = 18, endCol = 18)
gen <- readWorksheet(my_book, sheet=sheet_num, startRow =6)
if(sex == "male"){gen <- cbind(gen, no)}
colnames[col_num] <- col_nam
names(gen) <- colnames
return(gen)
}
transfom <- function(bit, div){
}
firstStep <- function(bit, div, type){
if(type == "suv"){
bit <- bit %>%
select( - names(bit[,18, drop = F]))%>%
filter(!is.na(label))%>%
fill(state)%>%
mutate(division = as.factor(div))
}else if(type == "resp"){
bit <- bit %>%
mutate(division = as.factor(div),
region = str_remove(region, char_class("(")%R%WRD%R%char_class(")")))%>%
select(division, everything() )}
return(bit)
}
secondStep <- function(data, type, num, place){
start <- str_which(data[,place], fixed("Divisions"))
Division <- str_remove(data[start,place], " Divisions")
New_South_Wales <- firstStep( data[(start[1]+1) : (start[2]-num[1]),] , Division[1],type)
Victoria <- firstStep( data[(start[2]+1) : (start[3]-num[1]),] , Division[2],type)
Queensland <- firstStep( data[(start[3]+1) : (start[4]-num[1]),] , Division[3],type)
South <- firstStep( data[(start[4]+1) : (start[5]-num[1]),] , Division[4],type)
Western <- firstStep( data[(start[5]+1) : (start[6]-num[1]),] , Division[5],type)
Tasmania <- firstStep( data[(start[6]+1) : (start[7]-num[1]),] , Division[6],type)
Northern <- firstStep( data[(start[7]+1) : (start[8]-num[1]),] , Division[7],type)
Capital <- firstStep( data[(start[8]+1) : num[2],] , Division[8],type)
full <- rbind(New_South_Wales, Victoria, Queensland, South, Western, Tasmania, Northern,
Capital)
return(full)
}
thirdStep <- function(full,sex,fitu){
full <- full %>%
gather("years", "participants",- fitu)%>%
select(c(fitu, division, years, participants))%>%
mutate(years = as.factor(str_trim(str_remove(years, fixed("years")))),
years = recode(years, "85 and over" = "85-over"),
state = str_remove(state, char_class("(")%R%WRD%R%char_class(")")))%>%
clean_names()
if(sex == "male"){
tidy <- full %>%
select( - no_gender) %>%
mutate(sex = factor(sex)) %>%
rename(measure = label,
count = participants)%>%
clean_names()
no_gender <- full %>%
select( - participants) %>%
filter(years == "18-19")%>%
mutate(years = NA, sex = NA)%>%
rename(measure = label,
count = no_gender)%>%
clean_names()
tidy <- bind_rows(tidy,no_gender)
}else{
tidy <- full %>%
mutate(sex = factor(sex)) %>%
rename(measure = label,
count = participants)%>%
clean_names()
}
tidy <- tidy %>%
mutate (measure = ifelse(measure == "Participation rate (%)",
"rate percent", measure))%>%
mutate_if(is.character,as.factor)%>%
select (division, state, years, sex, measure,count ) %>%
arrange(years)
return(tidy)
}
data <- readingFiles("male",c(1,2,19),c("state", "label","no_gender"),5)
full <- secondStep(data,"suv",c(6,636),"state")
tidy_male <- thirdStep(full,"male",c("state", "label", "no_gender", "division"))
data <- readingFiles("female",c(1,2,18),c("state", "label","Total Females(b)"),6)
full <- secondStep(data,"suv",c(6,636),"state")
tidy_female <- thirdStep(full,"female",c("state", "label", "division"))
Participation_final <- rbind(tidy_male,tidy_female)
str(Participation_final)
## 'data.frame': 13950 obs. of 6 variables:
## $ division: Factor w/ 8 levels "New South Wales",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ state : Factor w/ 150 levels "Adelaide","Aston",..: 4 4 4 6 6 6 10 10 10 11 ...
## $ years : Factor w/ 15 levels "18-19","20-24",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "male","female": 1 1 1 1 1 1 1 1 1 1 ...
## $ measure : Factor w/ 3 levels "Eligible participants",..: 3 1 2 3 1 2 3 1 2 3 ...
## $ count : num 1102 1431 77 977 1278 ...
head(Participation_final,10)
## division state years sex measure count
## 1 New South Wales Banks 18-19 male Total participants 1102.0
## 2 New South Wales Banks 18-19 male Eligible participants 1431.0
## 3 New South Wales Banks 18-19 male rate percent 77.0
## 4 New South Wales Barton 18-19 male Total participants 977.0
## 5 New South Wales Barton 18-19 male Eligible participants 1278.0
## 6 New South Wales Barton 18-19 male rate percent 76.4
## 7 New South Wales Bennelong 18-19 male Total participants 1177.0
## 8 New South Wales Bennelong 18-19 male Eligible participants 1488.0
## 9 New South Wales Bennelong 18-19 male rate percent 79.1
## 10 New South Wales Berowra 18-19 male Total participants 1523.0
tail(Participation_final,10)
## division state years sex
## 13941 Northern Territory Lingiari 85-over female
## 13942 Northern Territory Solomon 85-over female
## 13943 Northern Territory Solomon 85-over female
## 13944 Northern Territory Solomon 85-over female
## 13945 Australian Capital Territory Canberra 85-over female
## 13946 Australian Capital Territory Canberra 85-over female
## 13947 Australian Capital Territory Canberra 85-over female
## 13948 Australian Capital Territory Fenner 85-over female
## 13949 Australian Capital Territory Fenner 85-over female
## 13950 Australian Capital Territory Fenner 85-over female
## measure count
## 13941 rate percent 49.4
## 13942 Total participants 211.0
## 13943 Eligible participants 283.0
## 13944 rate percent 74.6
## 13945 Total participants 1679.0
## 13946 Eligible participants 2162.0
## 13947 rate percent 77.7
## 13948 Total participants 1082.0
## 13949 Eligible participants 1387.0
## 13950 rate percent 78.0
bind <- function(sheet_num,col_num, col_name){
response <- readWorksheet(book,sheet=3,startRow =7,startCol=sheet_num[1],endCol =sheet_num[2])
response[,col_num]<- NULL
names(response) <- col_name
return(response)
}
response_clear <- bind(c(1,6),c(3,5,6),c("region", "yes", "no"))
response_not_clear <- bind(c(11,13),2 ,c("not_clear", "non_responding"))
all_data <- cbind(response_clear,response_not_clear)
full <- secondStep(all_data,"resp",c(3,172),"region")
response_final <- full %>%
gather(vote, count ,-division,-region)%>%
mutate_if(is.character,as.factor)%>%
arrange(region)
str(response_final)
## 'data.frame': 600 obs. of 4 variables:
## $ division: Factor w/ 8 levels "New South Wales",..: 4 4 4 4 2 2 2 2 2 2 ...
## $ region : Factor w/ 150 levels "Adelaide","Aston",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ vote : Factor w/ 4 levels "no","non_responding",..: 4 1 3 2 4 1 3 2 4 1 ...
## $ count : num 62769 26771 217 20477 48455 ...
head(response_final,10)
## division region vote count
## 1 South Australia Adelaide yes 62769
## 2 South Australia Adelaide no 26771
## 3 South Australia Adelaide not_clear 217
## 4 South Australia Adelaide non_responding 20477
## 5 Victoria Aston yes 48455
## 6 Victoria Aston no 29730
## 7 Victoria Aston not_clear 234
## 8 Victoria Aston non_responding 17664
## 9 Victoria Ballarat yes 65613
## 10 Victoria Ballarat no 27405
tail(response_final,10)
## division region vote count
## 591 Queensland Wide Bay not_clear 319
## 592 Queensland Wide Bay non_responding 21645
## 593 Victoria Wills yes 68450
## 594 Victoria Wills no 29399
## 595 Victoria Wills not_clear 250
## 596 Victoria Wills non_responding 20169
## 597 Queensland Wright yes 47109
## 598 Queensland Wright no 35812
## 599 Queensland Wright not_clear 280
## 600 Queensland Wright non_responding 22144
You can find the code used in this article in the Github Repository. Right-click on the link and choose (open in a New Window).