Required packages

#Load packages
library(readxl)
## Warning: package 'readxl' was built under R version 3.5.3
library(foreign)
library(gdata)
## Warning: package 'gdata' was built under R version 3.5.3
## gdata: Unable to locate valid perl interpreter
## gdata: 
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata: 
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
## 
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
## 
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
## 
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
library(rvest)
## Warning: package 'rvest' was built under R version 3.5.3
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:gdata':
## 
##     combine, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(deductive)
## Warning: package 'deductive' was built under R version 3.5.3
library(deducorrect)
## Warning: package 'deducorrect' was built under R version 3.5.3
## Loading required package: editrules
## Warning: package 'editrules' was built under R version 3.5.3
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 3.5.3
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## 
## Attaching package: 'editrules'
## The following objects are masked from 'package:igraph':
## 
##     blocks, normalize
## The following objects are masked from 'package:tidyr':
## 
##     contains, separate
## The following object is masked from 'package:dplyr':
## 
##     contains
library(editrules)
library(validate)
## Warning: package 'validate' was built under R version 3.5.3
## 
## Attaching package: 'validate'
## The following object is masked from 'package:igraph':
## 
##     compare
## The following object is masked from 'package:dplyr':
## 
##     expr
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:validate':
## 
##     expr
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:validate':
## 
##     label, label<-
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:rvest':
## 
##     html
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(forecast)
## Warning: package 'forecast' was built under R version 3.5.3
library(stringr)
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.5.3
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:igraph':
## 
##     %--%
## The following object is masked from 'package:base':
## 
##     date
library(car)
## Warning: package 'car' was built under R version 3.5.3
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(outliers)
library(MVN)
## Warning: package 'MVN' was built under R version 3.5.3
## sROC 0.1-2 loaded
library(infotheo)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(mlr)
## Warning: package 'mlr' was built under R version 3.5.3
## Loading required package: ParamHelpers
## Warning: package 'ParamHelpers' was built under R version 3.5.3
## 
## Attaching package: 'ParamHelpers'
## The following object is masked from 'package:editrules':
## 
##     isFeasible
## 
## Attaching package: 'mlr'
## The following object is masked from 'package:caret':
## 
##     train
## The following object is masked from 'package:Hmisc':
## 
##     impute
## The following object is masked from 'package:gdata':
## 
##     resample
library(ggplot2)
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.3

Executive Summary

This report provides clean and tidy data on the employment status and highest level of education attained by Australians.

Required packages were loaded and two datasets were imported.

Once imported, the following pre processing steps were undertaken: * Some observations were converted to variable names * Some columns and rows were dropped * New variables were added * Variables were reshaped from wide to long format (and vice versa) * Variables were renamed.

Afterwards, the two datasets were joined. The dimensions and structure of the joined dataset were checked. This led to 3 character variables being converted to factors, and the levels for one of those factors being relabelled.

The data was still untidy - one column had mixed data types: dates (in the form of a year) and characters. The data was tidied by separating the column into two.

The data was then checked for missing, infinite and NaN values, as well as errors and inconsistencies. None were identified.

Next, the data was inspected for univariate outliers. Tukey’s method of outlier detection was used. A boxplot showed 4 univariate outliers. The capping approach was used to remove outliers. After applying capping, a revised boxplot and summary statistics of the capped data were checked, which indicated that outliers had been removed.

Finally, the data was transformed. A histogram showed the data is right (or positively) skewed. Given the right skewed distribution, the data was transformed using mathematical operations in order to achieve a normal distribution. This approach was taken given the preference in statistical inference for normal distribution as opposed to right or left skewed data. Log, natural log, square root and reciprocal transformations were undertaken. The natural log transformation best reduces the skewness and converts the distribution to a normal distribution.

Data

Description of data, sources and variable descriptions

Two seperate datasets were sourced:

  1. Highest educational attainment: By state or territory of usual residence and sex, Persons aged 15-74 years. This dataset was sourced from the Australian Bureau of Statistics (ABS) and is available at https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/6227.0May%202018?OpenDocument

  2. Non-school qualifications in 2010-11 and 2015: Selected characteristics, Persons aged 15-64 years. This dataset was also sourced from the ABS and is available at https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/4235.02015?OpenDocument

Dataset 1 shows the highest level of education attained by Australian persons, broken up by Australian state/territory and gender. The variables include level of education achieved and state/territory. Those variables are overlayed by gender. A number of preprocessing steps will need to be performed before this dataset can be joined with dataset 2.

Dataset 2 shows the proportion of Australians employed, unemployed or not in the labour force according to level of non-school qualification as well as a range of other characteristics. The data is then broken up by gender. The raw dataset imported into R contained characteristic (for example non school qualification), and then 2010-11 data on labour force status followed by 2015 data on labour force status. A number of preprocessing steps will need to be performed before this dataset can be joined with dataset 2.

Steps taken to import, preprocess and join the data:

  1. Files were downloaded and saved as excel files on my computer.

  2. The working directory was set to the location of the files.

  3. Dataset 1 (highet education attainment) was imported using the readxl package.

  4. Dataset 1 was checked using the head() function to identify any data preprocessing required. Some issues were identified and which were subsequently addressed:

    • Some variables appeared as observations i.e. Australian state/territory appeared in row 1. Further, what should have been a variable name for column one (‘Level of highest educational attainment’) appeared as an obseration, not as a column heading. To rectify these issues, I reimported the data, but skipped rows 1 and 2, and renamed columns. The first column was renamed “Highest education level attained”. Subsequent columns were named using a consistent naming convention: state/territory_gender, for addressed in a later step. The re-imported dataset was checked using the head() function. This showed the dataset was imported correctly, with correct column names.

    • Some columns and rows were dropped to help the reader/other analysts understand the data content. The total row was dropped (row 10). Further, only three columns were kept: those showing highest education level, as well as male and female totals. This was done for a similar reason: to help the reader/other analyststo best understand the data content. The revised dataset was checked using the head() function.

    • The format of some data is inconsistent with the format of relevant data in dataset 2 (which is yet to be imported). The data in columns 2 and 3 were actual numbers. The second dataset (which will be imported in the following steps) shows % of people in certain categories (as opposed to raw numbers). Therefore, to ensure like for like data once the datasets are joined, I decided to undertake an additional preprocessing step here - by creating new columns which show % of each gender that make up a certain education qualification. Once these columns were created, the figures were converted to percentages and rounded. Once this was done, the columns with raw numbers were dropped. The revised dataset was checked throughout this process.

    • Some other preprocessing was conducted - with a view to having consistent data once the two datasets were merged. These included converting the data from wide into long format, renaming the columns and adding a further column titled Employment or education (with all observations for this dataset being set to “Education”). The final dataset was checked and considered ok.

  5. Dataset 2 (non-school qualifications) was imported.

  6. Dataset 2 was checked to identify any data preprocessing required. Issues identified and subsequently addressed:

    • Some variables appeared as observations i.e. labour force status appeared in row 2. To rectify this, the data was reimported but rows 1-4 were skipped, and columns were renamed.

    • Some data was dropped. I only kept rows showing total male and female figures. In terms of columns, I dropped the total columns, as I only wanted to show the figures for the respective labour force status categories, not the total of all those categories. This is consistent with the first dataset imported.

    • Steps were taken to convert the data into tidy format and to be in a consistent format to the first dataset. Education qualification variables were converted into long format.

    • The male and female rows were converted into wide format. This was so that another column could be created (‘Employment or Education’ - with all observations being recorded as ‘Employment’). Column names were all renamed to enable joining on common names.

    • The gender was then converted back to long format, consistent with the first dataset.

  7. The two datasets were joined using a full join, with all columns from both datasets kept. The joined dataset was checked using the head() function and it appears ok.

#Set working directory
setwd("~/RMIT/Data Preprocessing/Assignment 3")

## Import and data preprocess education dataset

#Import dataset on highest education level attained
Education <- read_excel("Education.xlsx")
## New names:
## * `` -> ...1
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ... and 21 more problems
#View education dataset
head(Education)
#View shows some issues. View row 1 to see what column names should be given
head(Education[1,])
#Reimport dataset, but skip rows 1 and 2, and rename columns
Education <- read_excel("Education.xlsx", skip = 2, col_names = c("Highest_education_level_attained","NSW_m", "VIC_m", "QLD_m", "SA_m", "WA_m", "TAS_m", "NT_m", "ACT_m", "Total males","NSW_f", "VIC_f", "QLD_f", "SA_f", "WA_f", "TAS_f", "NT_f", "ACT_f", "Total females","blank column","NSW_total", "VIC_total", "QLD_total", "SA_total", "WA_total", "TAS_total", "NT_total", "ACT_total", "Total"))

#View revised dataset
head(Education)
#Keep first column and then columns on total for each gender, remove total row
Education <- Education [-10,c(1,10,19)]

#View revised dataset
head(Education)
#Calculate % of males and females with the various levels of education qualifications
Education <- Education %>% mutate(Total_males_percent = `Total males`/sum(`Total males`), Total_females_percent = `Total females`/sum(`Total females`))

#Convert to % and round to one decimal place
Education$Total_males_percent <- as.numeric(sub("%", "", Education$Total_males_percent, fixed = TRUE))*100
Education$Total_females_percent <- as.numeric(sub("%", "", Education$Total_females_percent, fixed = TRUE))*100
Education$Total_males_percent <- round(Education$Total_males_percent,1)
Education$Total_females_percent <- round(Education$Total_females_percent,1)

#Drop columns 2 and 3 that have actual no's - only want to keep percentages
Education <- Education [,c(-2,-3)]

#check revised dataset
head(Education)
#Rename columns
colnames(Education) <- c("Education or employment status", "Male", "Female")

#Gather Education dataset to make data into long format
Education <- Education %>% gather(`Male`, `Female`, key = "Gender", value = "% of gender")

#Mutate to add employment or eduation column
Education <- Education %>% mutate(Employment_or_education = "Education")

head(Education)
## Import and data preprocess labour_force dataset

#Import Labour Force dataset
Labour_force <- read_excel("Labour force.xlsx")
## New names:
## * `` -> ...1
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...7
## * ... and 2 more problems
#View imported dataset
head(Labour_force)
#Reimport dataset, but skip rows 1-4, and rename columns
Labour_force <- read_excel("Labour force.xlsx", skip = 4, col_names = c("Gender","Employed (%)_2010-11","Unemployed (%)_2010-11","Not in labour force (%)_2010-11","Total persons (%)_2010-11","Employed (%)_2015","Unemployed (%)_2015","Not in labour force (%)_2015","Total persons (%)_2015"))

#Keep data only from rows with overall data on males and females (rows 35 and 70) and remove total columns
Labour_force <- Labour_force [c(35,70),c(-5,-9)]

#Gather Labour_force dataset to make data into long format
Labour_force <- Labour_force %>% gather(`Employed (%)_2010-11`, `Unemployed (%)_2010-11`, `Not in labour force (%)_2010-11`, `Employed (%)_2015`, `Unemployed (%)_2015`, `Not in labour force (%)_2015`, key = "Characteristic", value = "value")

#Spread Labour_force dataset to make gender column into wide format
Labour_force <- Labour_force %>% spread(key = "Gender", value = "value")

#Rename male and female columns in labour_force dataset
colnames(Labour_force) <- c("Education or employment status","Male", "Female")

#Mutate to add employment or eduation column
Labour_force <- Labour_force %>% mutate(Employment_or_education = "Employment")

#Gather Labour_force dataset to make data into long format
Labour_force <- Labour_force %>% gather(Male, Female, key = "Gender", value = "% of gender")

#Check revised dataset
head(Labour_force)
#Join the two datasets and keep all observations
Joined_dataset <- full_join(Education, Labour_force)
## Joining, by = c("Education or employment status", "Gender", "% of gender", "Employment_or_education")
#Show output
head(Joined_dataset,30)

Understand

#Check dimensions of joined dataset
dim(Joined_dataset)
## [1] 30  4
#Check structure of joined dataset
str(Joined_dataset) 
## Classes 'tbl_df', 'tbl' and 'data.frame':    30 obs. of  4 variables:
##  $ Education or employment status: chr  "Postgraduate Degree" "Graduate Diploma/Graduate Certificate" "Bachelor Degree" "Advanced Diploma/Diploma" ...
##  $ Gender                        : chr  "Male" "Male" "Male" "Male" ...
##  $ % of gender                   : num  6.4 2 16.6 8.8 23.8 19 5.3 10.4 7.6 6.6 ...
##  $ Employment_or_education       : chr  "Education" "Education" "Education" "Education" ...
#Check type of numeric variabe
typeof(Joined_dataset$`% of gender`)
## [1] "double"
#Apply data conversions to change character variables to factors
Joined_dataset$Gender <- as.factor(Joined_dataset$Gender)
Joined_dataset$Employment_or_education <- as.factor(Joined_dataset$Employment_or_education)
Joined_dataset$`Education or employment status` <- as.factor(Joined_dataset$`Education or employment status`)

#Recheck strucuture and data types
str(Joined_dataset) #confirms change of 3 variables from characters to factors
## Classes 'tbl_df', 'tbl' and 'data.frame':    30 obs. of  4 variables:
##  $ Education or employment status: Factor w/ 15 levels "Advanced Diploma/Diploma",..: 10 7 2 1 4 15 14 13 3 10 ...
##  $ Gender                        : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 1 ...
##  $ % of gender                   : num  6.4 2 16.6 8.8 23.8 19 5.3 10.4 7.6 6.6 ...
##  $ Employment_or_education       : Factor w/ 2 levels "Education","Employment": 1 1 1 1 1 1 1 1 1 1 ...
#Check levels of factors
levels(Joined_dataset$`Education or employment status`)
##  [1] "Advanced Diploma/Diploma"             
##  [2] "Bachelor Degree"                      
##  [3] "Below Year 10"                        
##  [4] "Certificate III/IV"                   
##  [5] "Employed (%)_2010-11"                 
##  [6] "Employed (%)_2015"                    
##  [7] "Graduate Diploma/Graduate Certificate"
##  [8] "Not in labour force (%)_2010-11"      
##  [9] "Not in labour force (%)_2015"         
## [10] "Postgraduate Degree"                  
## [11] "Unemployed (%)_2010-11"               
## [12] "Unemployed (%)_2015"                  
## [13] "Year 10"                              
## [14] "Year 11"                              
## [15] "Year 12 or equivalent"
levels(Joined_dataset$Gender)
## [1] "Female" "Male"
levels(Joined_dataset$Employment_or_education)
## [1] "Education"  "Employment"
#Relabel Education or employment status variable
levels(Joined_dataset$`Education or employment status`) <- c("Diploma_2018","Bachelor degree_2018","Below year 10_2018", "Certificate 3 or 4_2018", "Employed_2011","Employed_2015", "Graduate Diploma or Graduate Certificate_2018", "Not in labour force_2011", "Not in labour force_2015", "Masters_2018", "Unemployed_2011", "Unemployed_2015", "Year 10_2018", "Year 11_2018", "Year 12_2018")

#Recheck levels of factor variable
levels(Joined_dataset$`Education or employment status`)
##  [1] "Diploma_2018"                                 
##  [2] "Bachelor degree_2018"                         
##  [3] "Below year 10_2018"                           
##  [4] "Certificate 3 or 4_2018"                      
##  [5] "Employed_2011"                                
##  [6] "Employed_2015"                                
##  [7] "Graduate Diploma or Graduate Certificate_2018"
##  [8] "Not in labour force_2011"                     
##  [9] "Not in labour force_2015"                     
## [10] "Masters_2018"                                 
## [11] "Unemployed_2011"                              
## [12] "Unemployed_2015"                              
## [13] "Year 10_2018"                                 
## [14] "Year 11_2018"                                 
## [15] "Year 12_2018"
#Show output
head(Joined_dataset)

Tidy & Manipulate Data I

#Spread Labour_force dataset to make gender column into wide format
Joined_dataset <- Joined_dataset %>% tidyr::separate(`Education or employment status`, into = c("Education or employment status", "Year"), sep = "_")

#Show output
head(Joined_dataset)

Tidy & Manipulate Data II

#Refer to Data section at top

Scan I

#Create function that will allow us to check for infinite, NaN or NA values
is.specialorNA <- function(x){if (is.numeric(x)) (is.infinite(x) | is.nan(x) | is.na(x))} 

#Apply function to the joined_dataset
sapply(Joined_dataset, is.specialorNA) #no infinite, NaN or NA values in the numerical column (% of gender)
## $`Education or employment status`
## NULL
## 
## $Year
## NULL
## 
## $Gender
## NULL
## 
## $`% of gender`
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 
## $Employment_or_education
## NULL
#Check for obvious inconsistencies or errors
#Load rules from txt file
Rules <- editfile("rules.txt")
Rules
## 
## Data model:
## dat1 : Education or employment status %in% c('Bachelor degree', 'Below year 10', 'Certificate 3 or 4', 'Diploma', 'Employed', 'Graduate Diploma or Graduate Certificate', 'Masters', 'Not in labour force', 'Unemployed', 'Year 10', 'Year 11', 'Year 12')
## dat2 : Employment_or_education %in% c('Education', 'Employment')
## dat3 : Gender %in% c('Female', 'Male')
## dat4 : Year %in% c('2011', '2015', '2018') 
## 
## Edit set:
## NULL :
#Check and show summary of violated rules
ve <- violatedEdits(Rules, Joined_dataset)
summary(ve)
## No violations detected, 0 checks evaluated to NA
## NULL

Scan II

#Produce boxplot to test for outliers
Joined_dataset$`% of gender` %>%  boxplot(main="Box Plot of % of gender variable", ylab="Percentage", col = "blue") #4 apparent outliers identified

#Create a function to cap the values outside the limits
cap <- function(x){
    quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
    x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
    x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
    x
}

#Take numeric variable and save as an object
Joined_dataset2 <- Joined_dataset %>%  dplyr::select(`% of gender`)

#Show summary statistics of new dataframe
summary(Joined_dataset2)
##   % of gender   
##  Min.   : 2.00  
##  1st Qu.: 5.95  
##  Median :11.70  
##  Mean   :20.00  
##  3rd Qu.:20.35  
##  Max.   :81.40
#Apply function "cap" to new dataframe
Joined_dataset2_capped <- sapply(Joined_dataset2, FUN = cap)

#Check summary statistics again
summary(Joined_dataset2_capped)
##   % of gender   
##  Min.   : 2.00  
##  1st Qu.: 5.95  
##  Median :11.70  
##  Mean   :20.01  
##  3rd Qu.:20.35  
##  Max.   :74.45
#Check outliers removed by producing another boxplot, this time of the capped dataframe
Joined_dataset2_capped %>%  boxplot(main="Box Plot of % of gender variable to check capping of outliers", ylab="Percentage", col = "blue") #shows 1 outlier

Transform

#View histogram of '% of gender variable'
hist(Joined_dataset$`% of gender`)

#Apply log transformation and view histogram of log 10 transformed data
log_joined_dataset <- log10(Joined_dataset$`% of gender`)
hist(log_joined_dataset, main = "Histogram of log10 transformed data")

#Apply natural log transformation and view histogram of natural log transformed data
ln_joined_dataset <- log(Joined_dataset$`% of gender`)
hist(ln_joined_dataset, main = "Histogram of natural log transformed data")

#Apply square root transformation and view histogram of square root transformed data
sqrt_joined_dataset <- sqrt(Joined_dataset$`% of gender`)
hist(sqrt_joined_dataset, main = "Histogram of square root transformed data")

#Apply reciprocal transformation and view histogram of transformed data
joined_dataset_recip <- 1/Joined_dataset$`% of gender`
hist(joined_dataset_recip, main = "Histogram of reciprocal transformed data")