http://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/6523.02015-16?OpenDocument
In particular I want to look at three files from the above site
Household Income and Income Distribution, Victoria
Household Income and Income Distribution, Tasmania
Household Income and Income Distribution, Western Australia
http://rpubs.com/fast_Eddie/431170
In this Study where we wish to compare Household Incomes between three Australian States.
We will look at household incomes for the years 1995 - 2016
please note : NOT all years were recorded for each state, however the years shown were recorded for each state
TasHouse .xlsx and
VicHouse .xlsx
WestHouse .xlxs
Because they had the SAME name, and importing them caused probs
as one would simply overwrite the other.
I also had to select which page from each file to Import
#library(readxl)
TasHouse <- read_excel("C:/Users/dan/Desktop/Pre Pro 3/TasHouse.xlsx")
#View(TasHouse)
VicHouse <- read_excel("C:/Users/dan/Desktop/Pre Pro 3/VicHouse.xlsx")
#View(VivHouse)
WestHouse <- read_excel("C:/Users/dan/Desktop/Pre Pro 3/WestHouse.xlsx")
#View(WestHouse)
# str(VicHouse)
# str(TasHouse)
# str(WestHouse)
Section 1 ######## Preparing Data for Victoria
#need rows
vic94a <- VicHouse[,1:16]
vic94b <- VicHouse[ 7:13 , 1:16]
vic94c <- VicHouse[ 16:24 , 1:16]
vic <- rbind(as.data.frame(vic94a, vic94b, vic94c))
vic1 <- vic[-c(25:117),]
#View(vic1)
vic1 <- vic1[-c(1:3),]
#View(vic1)
vic1 <- vic1[-c(2),]
#View(vic1)
vic1 <- vic1[-c(10),]
#View(vic1)
rownames(vic1) <- NULL ## Renumber rows
#View(vic1)
#head(vic1)
##-------------I'm using the row names in this section to
##-------------Identify which rows are what later in the coding
vic1[1, 1] <- "Years" ## Change the name of the rows
vic1[1, 2] <- "Dollars"
rownames(vic1) <- NULL## Now remove row 2
vic2 <- vic1[-c(2),] ############## <<<<<=========== Change to vic2 here
rownames(vic2) <- NULL## Now remove row 2 ## because this makes it easier to
#View(vic2) ## know which works from what dosent
vic2 <- vic2[-c(9),]
rownames(vic2) <- NULL## Now remove row 2
#View(vic2)
## NOTE rows 1 - 8 are Wages , and Rows 9 - 17 are Percentiles <<===== For the Victorian data ##########
vic2 <- vic2[-c(2:6),]
rownames(vic2) <- NULL## Now remove row 2
#View(vic2)
vic2 <- vic2[-c(3:12),] ###
rownames(vic2) <- NULL## Now remove row
vic2[,2]<- NULL ## Remove column 2 <<<<<========== Reduce to 2 columns here
#str(vic2)
## Rename columns
names(vic2) <- c("Years", "1995", "1996", "1997", "1998",
"2000", "2001", "2003", "2004",
"2006", "2008", "2010", "2012",
"2014", "2016")
vic2 <- vic2[-1,] # c(3:12)
rownames(vic2) <- NULL## Now remove row 2
#str(vic2)
#View(vic2)
## Used gather here to Transform wide to long <<<=============
vic2 <- gather(vic2)
#str(vic2)
colnames(vic2)[1] <- 'Years'
colnames(vic2)[2] <- 'VicHouseholds'
## Now remove row 1, then change all to numeric
vic2 <- vic2[-1,]
## Now renumber rows
rownames(vic2) <- NULL## Now remove row 2
vic2 <- as.data.frame(vic2) #, dimnames = NULL
vic2$Years <- as.numeric(vic2$Years)
vic2$VicHouseholds <- as.numeric(vic2$VicHouseholds)
#str(vic2) ## Doing a structure check here check here <<===============
#nrow(vic2)
## Checks and Winsoring <<========== Also imputational if needed
##---------------------------------------------------- KEEP -------Check functions
## This looks for NaN, Inf, -Inf , and then tells me if they are
## present in this particular column
is.special <- function(vic2){
if (is.numeric(vic2)) !is.finite(vic2) else is.na(vic2)
}
sum (sapply(vic2, is.special)) ## <<=== check here
## [1] 0
# Use Imputational method below ## Not needed , so no variable changed <<============
#surveys_combined$hindfoot_length <- impute(surveys_combined$hindfoot_length, fun = mean)
#View(vic2)
#str(vic2)
Section 2 ######## Preparing Data for Tasmania
#need rows
t94a <- TasHouse[,1:16]
t94b <- TasHouse[ 7:13 , 1:16]
t94c <- TasHouse[ 16:24 , 1:16]
tas <- rbind(as.data.frame(t94a, t94b, t94c))
tas1 <- tas[-c(25:117),]
#View(vic1)
tas1 <- tas1[-c(1:3),]
#View(vic1)
tas1 <- tas1[-c(2),]
#View(vic1)
tas1 <- tas1[-c(10),]
#View(vic1)
rownames(vic1) <- NULL ## Renumber rows
#View(vic1)
#head(vic1)
##-------------I'm using the row names in this section to
##-------------Identify which rows are what later in the coding
tas1[1, 1] <- "Years" ## Change the name of the rows
tas1[1, 2] <- "Dollars"
rownames(tas1) <- NULL## Now remove row 2
## NOTE rows 1 - 8 are Wages , and Rows 9 - 17 are Percentiles <<===== For the Victorian data ##########
tas2 <- tas1[-c(2),] ############## <<<<<=========== Change to vic2 here
rownames(tas2) <- NULL## Now remove row 2 ## because this makes it easier to
#View(vic2) ## know which works from what dosent
tas2 <- tas2[-c(9),]
rownames(tas2) <- NULL## Now remove row 2
View(tas2)
tas2 <- tas2[-c(2:6),]
rownames(tas2) <- NULL## Now remove row 2
#View(vic2)
tas2 <- tas2[-c(3:12),] ###
rownames(tas2) <- NULL## Now remove row
tas2[,2]<- NULL ## Remove column 2 <<<<<========== Reduce to 2 columns here
#str(vic2)
## Rename columns
names(tas2) <- c("Years", "1995", "1996", "1997", "1998",
"2000", "2001", "2003", "2004",
"2006", "2008", "2010", "2012",
"2014", "2016")
tas2 <- tas2[-1,] # c(3:12)
rownames(tas2) <- NULL## Now remove row 2
#str(vic2)
#View(vic2)
## Used gather here to Transform wide to long <<<=============
tas2 <- gather(tas2)
#str(vic2)
colnames(tas2)[1] <- 'Years'
colnames(tas2)[2] <- 'TasHouseholds'
## Now remove row 1, then change all to numeric
tas2 <- tas2[-1,]
## Now renumber rows
rownames(tas) <- NULL## Now remove row 2
## Change column 2 of this data frame to numeric for analysis and NA check
tas2 <- as.data.frame(tas2) #, dimnames = NULL
tas2$Years <- as.numeric(tas2$Years)
tas2$TasHouseholds <- as.numeric(tas2$TasHouseholds)
#str(tas2) ## Doing a structure check here check here <<===============
#nrow(tas2)
## Checks and Winsoring <<========== Also imputational if needed
##---------------------------------------------------- KEEP -------Check functions
## This looks for NaN, Inf, -Inf , and then tells me if they are
## present in this particular column
is.special <- function(tas2){
if (is.numeric(tas2)) !is.finite(tas2) else is.na(tas2)
}
sum (sapply(tas2, is.special)) ## <<=== check here
## [1] 0
# Use Imputational method below ## Not needed , so no variable changed <<============
#surveys_combined$hindfoot_length <- impute(surveys_combined$hindfoot_length, fun = mean)
#View(tas2)
Section 3 ######## Preparing Data for Western Australia
#need rows
w94a <- WestHouse[,1:16]
w94b <- WestHouse[ 7:13 , 1:16]
w94c <- WestHouse[ 16:24 , 1:16]
wes <- rbind(as.data.frame(w94a, w94b, w94c))
wes1 <- wes[-c(25:117),]
#View(vic1)
wes2 <- wes1[-c(1:3),]
#View(vic1)
wes2 <- wes2[-c(2),]
#View(vic1)
wes2 <- wes2[-c(10),]
#View(vic1)
rownames(wes2) <- NULL ## Renumber rows
#View(vic1)
#head(vic1)
##-------------I'm using the row names in this section to
##-------------Identify which rows are what later in the coding
wes2[1, 1] <- "Years" ## Change the name of the rows
wes2[1, 2] <- "Dollars"
rownames(wes2) <- NULL## Now remove row 2
wes2 <- wes2[-c(2),] ############## <<<<<=========== Change to vic2 here
rownames(wes2) <- NULL## Now remove row 2 ## because this makes it easier to
#View(vic2) ## know which works from what dosent
## NOTE rows 1 - 8 are Wages , and Rows 9 - 17 are Percentiles <<===== For the Victorian data ##########
wes2 <- wes2[-c(9),]
rownames(wes2) <- NULL## Now remove row 2
View(wes2)
wes2 <- wes2[-c(2:6),]
rownames(tas2) <- NULL## Now remove row 2
#View(vic2)
wes2 <- wes2[-c(3:12),] ###
rownames(wes2) <- NULL## Now remove row
wes2[,2]<- NULL ## Remove column 2 <<<<<========== Reduce to 2 columns here
#str(vic2)
## Rename columns
names(wes2) <- c("Years", "1995", "1996", "1997", "1998",
"2000", "2001", "2003", "2004",
"2006", "2008", "2010", "2012",
"2014", "2016")
wes2 <- wes2[-1,] # c(3:12)
rownames(wes2) <- NULL## Now remove row 2
#str(vic2)
#View(vic2)
## Used gather here to Transform wide to long <<<=============
wes2<- gather(wes2)
#str(vic2)
colnames(wes2)[1] <- 'Years'
colnames(wes2)[2] <- 'WesHouseholds'
## Now remove row 1, then change all to numeric
wes2 <- wes2[-1,]
## Now renumber rows
rownames(wes2) <- NULL## Now remove row 2
## Change column 2 of this data frame to numeric for analysis and NA check
wes2 <- as.data.frame(wes2) #, dimnames = NULL
wes2$Years <- as.numeric(wes2$Years)
wes2$WesHouseholds <- as.numeric(wes2$WesHouseholds)
#str(wes2) ## Doing a structure check here check here <<===============
#nrow(wes2)
## Checks and Winsoring <<========== Also imputational if needed
##---------------------------------------------------- KEEP -------Check functions
## This looks for NaN, Inf, -Inf , and then tells me if they are
## present in this particular column
is.special <- function(wes2){
if (is.numeric(wes2)) !is.finite(wes2) else is.na(wes2)
}
sum (sapply(vic2, is.special)) ## <<=== check here
## [1] 0
#View(wes2)
# Use Imputational method below ## Not needed , so no variable changed <<============
#surveys_combined$hindfoot_length <- impute(surveys_combined$hindfoot_length, fun = mean)
Section 4 ######## Check that data elements are identical
Using base r
identical(vic2[1], tas2[1]) ## looking at Years to see if they're identical befor we combine
## [1] TRUE
identical(vic2[1], wes2[1]) ## into one data frame
## [1] TRUE
Section 5 ######## Add data columns together to forma a new data frame
Combined <- left_join(vic2, tas2 ) ## This is the first join
## Joining, by = "Years"
Combined <- inner_join(Combined, wes2) ## This is the second join
## Joining, by = "Years"
## All joined by years <<=========
#View(Combined)
#head(Combined)
str(Combined)
## 'data.frame': 14 obs. of 4 variables:
## $ Years : num 1995 1996 1997 1998 2000 ...
## $ VicHouseholds: num 1340 1297 1342 1400 1534 ...
## $ TasHouseholds: num 1146 1138 1122 1107 1134 ...
## $ WesHouseholds: num 1339 1341 1396 1394 1449 ...
#class(Combined)
Combined <- mutate(Combined, VicTasDif = TasHouseholds / VicHouseholds )
Combined$VicTasDif = round(Combined$VicTasDif, 3)
Combined <- mutate(Combined, VicWesDif = WesHouseholds / VicHouseholds )
Combined$VicWesDif = round(Combined$VicWesDif, 3)
Combined <- mutate(Combined, TasWesDif = TasHouseholds / WesHouseholds)
#Combined$TasWesDif = (as.factor(Combined$TasWesDif))
Combined$TasWesDif <- as.numeric(as.character(Combined$TasWesDif))
Combined$TasWesDif <- log10(Combined$TasWesDif)
Combined$TasWesDif <- round(Combined$TasWesDif, digits=3)
Combined$TasWesDif = (as.factor(Combined$TasWesDif))
str(Combined)
## 'data.frame': 14 obs. of 7 variables:
## $ Years : num 1995 1996 1997 1998 2000 ...
## $ VicHouseholds: num 1340 1297 1342 1400 1534 ...
## $ TasHouseholds: num 1146 1138 1122 1107 1134 ...
## $ WesHouseholds: num 1339 1341 1396 1394 1449 ...
## $ VicTasDif : num 0.855 0.877 0.836 0.791 0.739 0.73 0.752 0.784 0.792 0.749 ...
## $ VicWesDif : num 0.999 1.034 1.04 0.996 0.945 ...
## $ TasWesDif : Factor w/ 12 levels "-0.225","-0.171",..: 12 11 9 8 6 5 7 10 6 4 ...
#View(Combined)
head(Combined, 3)
## Years VicHouseholds TasHouseholds WesHouseholds VicTasDif VicWesDif
## 1 1995 1340 1146 1339 0.855 0.999
## 2 1996 1297 1138 1341 0.877 1.034
## 3 1997 1342 1122 1396 0.836 1.040
## TasWesDif
## 1 -0.068
## 2 -0.071
## 3 -0.095
ggplot(Combined, aes(Combined$Years)) +
geom_line(aes(y = Combined$VicHouseholds, colour = "red")) +
geom_line(aes(y = Combined$TasHouseholds, colour = "blue")) +
geom_line(aes(y = Combined$WesHouseholds, colour = "green")) +
xlab("Years") + ylab("Household Incomes per State ($)") +
scale_colour_discrete(name="States", labels=c("Tas","WA","Vic")) +
theme(panel.border = element_blank())
## Plot of WA & Tas showing % difference in household to Victoria
ggplot(Combined, aes(Combined$Years)) +
geom_line(aes(y = Combined$VicTasDif, colour = "red")) +
geom_line(aes(y = Combined$VicWesDif, colour = "blue")) +
#geom_line(aes(y = Combined$TasWesDif, colour = "green")) +
xlab("Years") + ylab("As % to Victoria Household Income") +
scale_colour_discrete(name="States", labels=c("WA","Tas","Vic")) +
theme(panel.border = element_blank())
write.csv(Combined, file = "Combined.csv") ## Create new file and save back to your folder
As a study we found this to quite interesting, notably in the variance between WA and Victoria. In the years 2005 to 2016 WA household income has had a distinctive increase compared to Victoria and for that matter when also compared to Tasmania. Whereas Tasmania appears to have tracked Victoria while consistently remaining at about 80% of the Victorain household income for the years 1996 to 2016. Its possible to conclude from this study that the WA mining boom over the years 2005 to 2016 has had an impact on the WA household income.
Please NOTE all information in relation to this study was accessed from the sites as noted above