http://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/6523.02015-16?OpenDocument
In particular I want to look at three files from the above site
Household Income and Income Distribution, Victoria
Household Income and Income Distribution, Tasmania
Household Income and Income Distribution, Western Australia
a look at three states household incomes for the years 1995 - 2016
please note : NOT all years were recorded for each state, however the years shown were recorded for each state
# Load packages
library(readr)
library(tidyr) # Restructuring original datasets (from wide to long format)
library(dplyr) # Data manipulation (factor levels, labels etc.)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse) # the whole shebeng)
## -- Attaching packages ---------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v stringr 1.3.1
## v ggplot2 3.0.0 v forcats 0.3.0
## -- Conflicts ------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(readxl)
TasHouse .xlsx and
VicHouse .xlsx
WestHouse .xlxs
Because they had the SAME name, and importing them caused probs
as one would simply overwrite the other
#library(readxl)
# TasHouse <- read_excel("C:/Users/dan/Desktop/Pre Pro 3/TasHouse.xlsx")
# #View(TasHouse)
# VicHouse <- read_excel("C:/Users/dan/Desktop/Pre Pro 3/VicHouse.xlsx")
# #View(VivHouse)
# WestHouse <- read_excel("C:/Users/dan/Desktop/Pre Pro 3/WestHouse.xlsx")
# #View(WestHouse)
#library(readxl)
TasHouse <- read_excel("TasHouse.xlsx")
#View(TasHouse)
#library(readxl)
VicHouse <- read_excel("VicHouse.xlsx")
#View(VicHouse)
#library(readxl)
WestHouse <- read_excel("WestHouse.xlsx")
#View(WestHouse)
Section 1 ######## Preparing Data for Victoria
#need rows
vic94a <- VicHouse[,1:16]
vic94b <- VicHouse[ 7:13 , 1:16]
vic94c <- VicHouse[ 16:24 , 1:16]
vic <- rbind(as.data.frame(vic94a, vic94b, vic94c))
vic1 <- vic[-c(25:117),]
#View(vic1)
vic1 <- vic1[-c(1:3),]
#View(vic1)
vic1 <- vic1[-c(2),]
#View(vic1)
vic1 <- vic1[-c(10),]
#View(vic1)
rownames(vic1) <- NULL ## Renumber rows
#View(vic1)
#head(vic1)
##-------------I'm using the row names in this section to
##-------------Identify which rows are what later in the coding
vic1[1, 1] <- "Years" ## Change the name of the rows
vic1[1, 2] <- "Dollars"
rownames(vic1) <- NULL## Now remove row 2
vic2 <- vic1[-c(2),] ############## <<<<<=========== Change to vic2 here
rownames(vic2) <- NULL## Now remove row 2 ## because this makes it easier to
#View(vic2) ## know which works from what dosent
vic2 <- vic2[-c(9),]
rownames(vic2) <- NULL## Now remove row 2
#View(vic2)
## NOTE rows 1 - 8 are Wages , and Rows 9 - 17 are Percentiles <<===== For the Victorian data ##########
vic2 <- vic2[-c(2:6),]
rownames(vic2) <- NULL## Now remove row 2
#View(vic2)
vic2 <- vic2[-c(3:12),] ###
rownames(vic2) <- NULL## Now remove row
vic2[,2]<- NULL ## Remove column 2 <<<<<========== Reduce to 2 columns here
#str(vic2)
## Rename columns
names(vic2) <- c("Years", "1995", "1996", "1997", "1998",
"2000", "2001", "2003", "2004",
"2006", "2008", "2010", "2012",
"2014", "2016")
vic2 <- vic2[-1,] # c(3:12)
rownames(vic2) <- NULL## Now remove row 2
#str(vic2)
#View(vic2)
## Used gather here to Transform wide to long <<<=============
vic2 <- gather(vic2) ## used gather here <<==============
#str(vic2)
colnames(vic2)[1] <- 'Years'
colnames(vic2)[2] <- 'VicHouseholds'
## Now remove row 1, then change all to numeric
vic2 <- vic2[-1,]
## Now renumber rows
rownames(vic2) <- NULL## Now remove row 2
vic2 <- as.data.frame(vic2) #, dimnames = NULL
vic2$Years <- as.numeric(vic2$Years)
vic2$VicHouseholds <- as.numeric(vic2$VicHouseholds)
str(vic2) ## Doing a structure check here check here <<===============
## 'data.frame': 14 obs. of 2 variables:
## $ Years : num 1995 1996 1997 1998 2000 ...
## $ VicHouseholds: num 1340 1297 1342 1400 1534 ...
## Checks and Winsoring <<========== Also imputational if needed
##---------------------------------------------------- KEEP -------Check functions
## This looks for NaN, Inf, -Inf , and then tells me if they are
## present in this particular column
is.special <- function(vic2){
if (is.numeric(vic2)) !is.finite(vic2) else is.na(vic2)
}
sum (sapply(vic2, is.special)) ## <<=== check here
## [1] 0
# Use Imputational method below ## Not needed , so no variable changed <<============
#surveys_combined$hindfoot_length <- impute(surveys_combined$hindfoot_length, fun = mean)
#View(vic2)
Section 2 ######## Preparing Data for Tasmania
#need rows
t94a <- TasHouse[,1:16]
t94b <- TasHouse[ 7:13 , 1:16]
t94c <- TasHouse[ 16:24 , 1:16]
tas <- rbind(as.data.frame(t94a, t94b, t94c))
tas1 <- tas[-c(25:117),]
#View(vic1)
tas1 <- tas1[-c(1:3),]
#View(vic1)
tas1 <- tas1[-c(2),]
#View(vic1)
tas1 <- tas1[-c(10),]
#View(vic1)
rownames(vic1) <- NULL ## Renumber rows
#View(vic1)
#head(vic1)
##-------------I'm using the row names in this section to
##-------------Identify which rows are what later in the coding
tas1[1, 1] <- "Years" ## Change the name of the rows
tas1[1, 2] <- "Dollars"
rownames(tas1) <- NULL## Now remove row 2
## NOTE rows 1 - 8 are Wages , and Rows 9 - 17 are Percentiles <<===== For the Victorian data ##########
tas2 <- tas1[-c(2),] ############## <<<<<=========== Change to vic2 here
rownames(tas2) <- NULL## Now remove row 2 ## because this makes it easier to
#View(vic2) ## know which works from what dosent
tas2 <- tas2[-c(9),]
rownames(tas2) <- NULL## Now remove row 2
View(tas2)
tas2 <- tas2[-c(2:6),]
rownames(tas2) <- NULL## Now remove row 2
#View(vic2)
tas2 <- tas2[-c(3:12),] ###
rownames(tas2) <- NULL## Now remove row
tas2[,2]<- NULL ## Remove column 2 <<<<<========== Reduce to 2 columns here
#str(vic2)
## Rename columns
names(tas2) <- c("Years", "1995", "1996", "1997", "1998",
"2000", "2001", "2003", "2004",
"2006", "2008", "2010", "2012",
"2014", "2016")
tas2 <- tas2[-1,] # c(3:12)
rownames(tas2) <- NULL## Now remove row 2
#str(vic2)
#View(vic2)
## Used gather here to Transform wide to long <<<=============
tas2 <- gather(tas2)
#str(vic2)
colnames(tas2)[1] <- 'Years'
colnames(tas2)[2] <- 'TasHouseholds'
## Now remove row 1, then change all to numeric
tas2 <- tas2[-1,]
## Now renumber rows
rownames(tas) <- NULL## Now remove row 2
## Change column 2 of this data frame to numeric for analysis and NA check
tas2 <- as.data.frame(tas2) #, dimnames = NULL
tas2$Years <- as.numeric(tas2$Years)
tas2$TasHouseholds <- as.numeric(tas2$TasHouseholds)
str(tas2) ## Doing a structure check here check here <<===============
## 'data.frame': 14 obs. of 2 variables:
## $ Years : num 1995 1996 1997 1998 2000 ...
## $ TasHouseholds: num 1146 1138 1122 1107 1134 ...
## Checks and Winsoring <<========== Also imputational if needed
##---------------------------------------------------- KEEP -------Check functions
## This looks for NaN, Inf, -Inf , and then tells me if they are
## present in this particular column
is.special <- function(tas2){
if (is.numeric(tas2)) !is.finite(tas2) else is.na(tas2)
}
sum (sapply(tas2, is.special)) ## <<=== check here
## [1] 0
# Use Imputational method below ## Not needed , so no variable changed <<============
#surveys_combined$hindfoot_length <- impute(surveys_combined$hindfoot_length, fun = mean)
#View(tas2)
Section 3 ######## Preparing Data for Western Australia
#need rows
w94a <- WestHouse[,1:16]
w94b <- WestHouse[ 7:13 , 1:16]
w94c <- WestHouse[ 16:24 , 1:16]
wes <- rbind(as.data.frame(w94a, w94b, w94c))
wes1 <- wes[-c(25:117),]
#View(vic1)
wes2 <- wes1[-c(1:3),]
#View(vic1)
wes2 <- wes2[-c(2),]
#View(vic1)
wes2 <- wes2[-c(10),]
#View(vic1)
rownames(wes2) <- NULL ## Renumber rows
#View(vic1)
#head(vic1)
##-------------I'm using the row names in this section to
##-------------Identify which rows are what later in the coding
wes2[1, 1] <- "Years" ## Change the name of the rows
wes2[1, 2] <- "Dollars"
rownames(wes2) <- NULL## Now remove row 2
wes2 <- wes2[-c(2),] ############## <<<<<=========== Change to vic2 here
rownames(wes2) <- NULL## Now remove row 2 ## because this makes it easier to
#View(vic2) ## know which works from what dosent
## NOTE rows 1 - 8 are Wages , and Rows 9 - 17 are Percentiles <<===== For the Victorian data ##########
wes2 <- wes2[-c(9),]
rownames(wes2) <- NULL## Now remove row 2
View(wes2)
wes2 <- wes2[-c(2:6),]
rownames(tas2) <- NULL## Now remove row 2
#View(vic2)
wes2 <- wes2[-c(3:12),] ###
rownames(wes2) <- NULL## Now remove row
wes2[,2]<- NULL ## Remove column 2 <<<<<========== Reduce to 2 columns here
#str(vic2)
## Rename columns
names(wes2) <- c("Years", "1995", "1996", "1997", "1998",
"2000", "2001", "2003", "2004",
"2006", "2008", "2010", "2012",
"2014", "2016")
wes2 <- wes2[-1,] # c(3:12)
rownames(wes2) <- NULL## Now remove row 2
#str(vic2)
#View(vic2)
## Used gather here to Transform wide to long <<<=============
wes2<- gather(wes2)
#str(vic2)
colnames(wes2)[1] <- 'Years'
colnames(wes2)[2] <- 'WesHouseholds'
## Now remove row 1, then change all to numeric
wes2 <- wes2[-1,]
## Now renumber rows
rownames(wes2) <- NULL## Now remove row 2
## Change column 2 of this data frame to numeric for analysis and NA check
wes2 <- as.data.frame(wes2) #, dimnames = NULL
wes2$Years <- as.numeric(wes2$Years)
wes2$WesHouseholds <- as.numeric(wes2$WesHouseholds)
str(wes2) ## Doing a structure check here check here <<===============
## 'data.frame': 14 obs. of 2 variables:
## $ Years : num 1995 1996 1997 1998 2000 ...
## $ WesHouseholds: num 1339 1341 1396 1394 1449 ...
## Checks and Winsoring <<========== Also imputational if needed
##---------------------------------------------------- KEEP -------Check functions
## This looks for NaN, Inf, -Inf , and then tells me if they are
## present in this particular column
is.special <- function(wes2){
if (is.numeric(wes2)) !is.finite(wes2) else is.na(wes2)
}
sum (sapply(vic2, is.special)) ## <<=== check here
## [1] 0
#View(wes2)
# Use Imputational method below ## Not needed , so no variable changed <<============
#surveys_combined$hindfoot_length <- impute(surveys_combined$hindfoot_length, fun = mean)
Combined <- left_join(vic2, tas2 ) ## This is the first join
## Joining, by = "Years"
Combined <- inner_join(Combined, wes2) ## This is the second join
## Joining, by = "Years"
## All joined by years <<=========
#View(Combined)
#head(Combined)
#str(Combined)
#class(Combined)
Combined <- mutate(Combined, VicTasDif = TasHouseholds / VicHouseholds )
Combined$VicTasDif = round(Combined$VicTasDif, 3)
Combined <- mutate(Combined, VicWesDif = WesHouseholds / VicHouseholds )
Combined$VicWesDif = round(Combined$VicWesDif, 3)
Combined <- mutate(Combined, TasWesDif = TasHouseholds / WesHouseholds)
Combined$TasWesDif = round(Combined$TasWesDif, 3)
#str(Combined)
#View(Combined)
#head(Combined)
ggplot(Combined, aes(Combined$Years)) +
geom_line(aes(y = Combined$VicHouseholds, colour = "red")) +
geom_line(aes(y = Combined$TasHouseholds, colour = "blue")) +
geom_line(aes(y = Combined$WesHouseholds, colour = "green")) +
xlab("Years") + ylab("Household Incomes per State ($)") +
scale_colour_discrete(name="States", labels=c("Tas","WA","Vic")) +
theme(panel.border = element_blank())
## Plot of WA & Tas showing % difference in household to Victoria
ggplot(Combined, aes(Combined$Years)) +
geom_line(aes(y = Combined$VicTasDif, colour = "red")) +
geom_line(aes(y = Combined$VicWesDif, colour = "blue")) +
#geom_line(aes(y = Combined$TasWesDif, colour = "green")) +
xlab("Years") + ylab("As % to Victoria Household Income") +
scale_colour_discrete(name="States", labels=c("WA","Tas","Vic")) +
theme(panel.border = element_blank())
write.csv(Combined, file = "Combined.csv") ## Create new file and save back to your folder