MATH2349 Semester 2, 2019

Required packages

#Load packages
library(readxl)

## Warning: package 'readxl' was built under R version 3.5.3

library(foreign)
library(gdata)

## Warning: package 'gdata' was built under R version 3.5.3

## gdata: Unable to locate valid perl interpreter
## gdata: 
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata: 
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)

## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.

##

## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.

##

## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.

## 
## Attaching package: 'gdata'

## The following object is masked from 'package:stats':
## 
##     nobs

## The following object is masked from 'package:utils':
## 
##     object.size

## The following object is masked from 'package:base':
## 
##     startsWith

library(rvest)

## Warning: package 'rvest' was built under R version 3.5.3

## Loading required package: xml2

## Warning: package 'xml2' was built under R version 3.5.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:gdata':
## 
##     combine, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.5.3

library(deductive)

## Warning: package 'deductive' was built under R version 3.5.3

library(deducorrect)

## Warning: package 'deducorrect' was built under R version 3.5.3

## Loading required package: editrules

## Warning: package 'editrules' was built under R version 3.5.3

## Loading required package: igraph

## Warning: package 'igraph' was built under R version 3.5.3

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

## 
## Attaching package: 'editrules'

## The following objects are masked from 'package:igraph':
## 
##     blocks, normalize

## The following objects are masked from 'package:tidyr':
## 
##     contains, separate

## The following object is masked from 'package:dplyr':
## 
##     contains

library(editrules)
library(validate)

## Warning: package 'validate' was built under R version 3.5.3

## 
## Attaching package: 'validate'

## The following object is masked from 'package:igraph':
## 
##     compare

## The following object is masked from 'package:dplyr':
## 
##     expr

library(Hmisc)

## Warning: package 'Hmisc' was built under R version 3.5.3

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:validate':
## 
##     expr

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:validate':
## 
##     label, label<-

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following object is masked from 'package:rvest':
## 
##     html

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(forecast)

## Warning: package 'forecast' was built under R version 3.5.3

library(stringr)
library(lubridate)

## Warning: package 'lubridate' was built under R version 3.5.3

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:igraph':
## 
##     %--%

## The following object is masked from 'package:base':
## 
##     date

library(car)

## Warning: package 'car' was built under R version 3.5.3

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(outliers)
library(MVN)

## Warning: package 'MVN' was built under R version 3.5.3

## sROC 0.1-2 loaded

library(infotheo)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(caret)

## Warning: package 'caret' was built under R version 3.5.3

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

library(mlr)

## Warning: package 'mlr' was built under R version 3.5.3

## Loading required package: ParamHelpers

## Warning: package 'ParamHelpers' was built under R version 3.5.3

## 
## Attaching package: 'ParamHelpers'

## The following object is masked from 'package:editrules':
## 
##     isFeasible

## 
## Attaching package: 'mlr'

## The following object is masked from 'package:caret':
## 
##     train

## The following object is masked from 'package:Hmisc':
## 
##     impute

## The following object is masked from 'package:gdata':
## 
##     resample

library(ggplot2)
library(knitr)

## Warning: package 'knitr' was built under R version 3.5.3

Executive Summary

This report provides clean and tidy data on the employment status and highest level of education attained by Australians.

Required packages were loaded and two datasets were imported.

Once imported, the following pre processing steps were undertaken: * Some observations were converted to variable names * Some columns and rows were dropped * New variables were added * Variables were reshaped from wide to long format (and vice versa) * Variables were renamed.

Afterwards, the two datasets were joined. The dimensions and structure of the joined dataset were checked. This led to 3 character variables being converted to factors, and the levels for one of those factors being relabelled.

The data was still untidy - one column had mixed data types: dates (in the form of a year) and characters. The data was tidied by separating the column into two.

The data was then checked for missing, infinite and NaN values, as well as errors and inconsistencies. None were identified.

Next, the data was inspected for univariate outliers. Tukey’s method of outlier detection was used. A boxplot showed 4 univariate outliers. The capping approach was used to remove outliers. After applying capping, a revised boxplot and summary statistics of the capped data were checked, which indicated that outliers had been removed.

Finally, the data was transformed. A histogram showed the data is right (or positively) skewed. Given the right skewed distribution, the data was transformed using mathematical operations in order to achieve a normal distribution. This approach was taken given the preference in statistical inference for normal distribution as opposed to right or left skewed data. Log, natural log, square root and reciprocal transformations were undertaken. The natural log transformation best reduces the skewness and converts the distribution to a normal distribution.

Data

Description of data, sources and variable descriptions

Two seperate datasets were sourced:

Highest educational attainment: By state or territory of usual residence and sex, Persons aged 15-74 years. This dataset was sourced from the Australian Bureau of Statistics (ABS) and is available at https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/6227.0May%202018?OpenDocument
Non-school qualifications in 2010-11 and 2015: Selected characteristics, Persons aged 15-64 years. This dataset was also sourced from the ABS and is available at https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/4235.02015?OpenDocument

Dataset 1 shows the highest level of education attained by Australian persons, broken up by Australian state/territory and gender. The variables include level of education achieved and state/territory. Those variables are overlayed by gender. A number of preprocessing steps will need to be performed before this dataset can be joined with dataset 2.

Dataset 2 shows the proportion of Australians employed, unemployed or not in the labour force according to level of non-school qualification as well as a range of other characteristics. The data is then broken up by gender. The raw dataset imported into R contained characteristic (for example non school qualification), and then 2010-11 data on labour force status followed by 2015 data on labour force status. A number of preprocessing steps will need to be performed before this dataset can be joined with dataset 2.

Steps taken to import, preprocess and join the data:

Files were downloaded and saved as excel files on my computer.
The working directory was set to the location of the files.
Dataset 1 (highet education attainment) was imported using the readxl package.
Dataset 1 was checked using the head() function to identify any data preprocessing required. Some issues were identified and which were subsequently addressed:
- Some variables appeared as observations i.e. Australian state/territory appeared in row 1. Further, what should have been a variable name for column one (‘Level of highest educational attainment’) appeared as an obseration, not as a column heading. To rectify these issues, I reimported the data, but skipped rows 1 and 2, and renamed columns. The first column was renamed “Highest education level attained”. Subsequent columns were named using a consistent naming convention: state/territory_gender, for addressed in a later step. The re-imported dataset was checked using the head() function. This showed the dataset was imported correctly, with correct column names.
- Some columns and rows were dropped to help the reader/other analysts understand the data content. The total row was dropped (row 10). Further, only three columns were kept: those showing highest education level, as well as male and female totals. This was done for a similar reason: to help the reader/other analyststo best understand the data content. The revised dataset was checked using the head() function.
- The format of some data is inconsistent with the format of relevant data in dataset 2 (which is yet to be imported). The data in columns 2 and 3 were actual numbers. The second dataset (which will be imported in the following steps) shows % of people in certain categories (as opposed to raw numbers). Therefore, to ensure like for like data once the datasets are joined, I decided to undertake an additional preprocessing step here - by creating new columns which show % of each gender that make up a certain education qualification. Once these columns were created, the figures were converted to percentages and rounded. Once this was done, the columns with raw numbers were dropped. The revised dataset was checked throughout this process.
- Some other preprocessing was conducted - with a view to having consistent data once the two datasets were merged. These included converting the data from wide into long format, renaming the columns and adding a further column titled Employment or education (with all observations for this dataset being set to “Education”). The final dataset was checked and considered ok.
Dataset 2 (non-school qualifications) was imported.
Dataset 2 was checked to identify any data preprocessing required. Issues identified and subsequently addressed:
- Some variables appeared as observations i.e. labour force status appeared in row 2. To rectify this, the data was reimported but rows 1-4 were skipped, and columns were renamed.
- Some data was dropped. I only kept rows showing total male and female figures. In terms of columns, I dropped the total columns, as I only wanted to show the figures for the respective labour force status categories, not the total of all those categories. This is consistent with the first dataset imported.
- Steps were taken to convert the data into tidy format and to be in a consistent format to the first dataset. Education qualification variables were converted into long format.
- The male and female rows were converted into wide format. This was so that another column could be created (‘Employment or Education’ - with all observations being recorded as ‘Employment’). Column names were all renamed to enable joining on common names.
- The gender was then converted back to long format, consistent with the first dataset.
The two datasets were joined using a full join, with all columns from both datasets kept. The joined dataset was checked using the head() function and it appears ok.

#Set working directory
setwd("~/RMIT/Data Preprocessing/Assignment 3")

## Import and data preprocess education dataset

#Import dataset on highest education level attained
Education <- read_excel("Education.xlsx")

## New names:
## * `` -> ...1
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ... and 21 more problems

#View education dataset
head(Education)

#View shows some issues. View row 1 to see what column names should be given
head(Education[1,])

#Reimport dataset, but skip rows 1 and 2, and rename columns
Education <- read_excel("Education.xlsx", skip = 2, col_names = c("Highest_education_level_attained","NSW_m", "VIC_m", "QLD_m", "SA_m", "WA_m", "TAS_m", "NT_m", "ACT_m", "Total males","NSW_f", "VIC_f", "QLD_f", "SA_f", "WA_f", "TAS_f", "NT_f", "ACT_f", "Total females","blank column","NSW_total", "VIC_total", "QLD_total", "SA_total", "WA_total", "TAS_total", "NT_total", "ACT_total", "Total"))

#View revised dataset
head(Education)

#Keep first column and then columns on total for each gender, remove total row
Education <- Education [-10,c(1,10,19)]

#View revised dataset
head(Education)

#Calculate % of males and females with the various levels of education qualifications
Education <- Education %>% mutate(Total_males_percent = `Total males`/sum(`Total males`), Total_females_percent = `Total females`/sum(`Total females`))

#Convert to % and round to one decimal place
Education$Total_males_percent <- as.numeric(sub("%", "", Education$Total_males_percent, fixed = TRUE))*100
Education$Total_females_percent <- as.numeric(sub("%", "", Education$Total_females_percent, fixed = TRUE))*100
Education$Total_males_percent <- round(Education$Total_males_percent,1)
Education$Total_females_percent <- round(Education$Total_females_percent,1)

#Drop columns 2 and 3 that have actual no's - only want to keep percentages
Education <- Education [,c(-2,-3)]

#check revised dataset
head(Education)

#Rename columns
colnames(Education) <- c("Education or employment status", "Male", "Female")

#Gather Education dataset to make data into long format
Education <- Education %>% gather(`Male`, `Female`, key = "Gender", value = "% of gender")

#Mutate to add employment or eduation column
Education <- Education %>% mutate(Employment_or_education = "Education")

head(Education)

## Import and data preprocess labour_force dataset

#Import Labour Force dataset
Labour_force <- read_excel("Labour force.xlsx")

## New names:
## * `` -> ...1
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...7
## * ... and 2 more problems

#View imported dataset
head(Labour_force)

#Reimport dataset, but skip rows 1-4, and rename columns
Labour_force <- read_excel("Labour force.xlsx", skip = 4, col_names = c("Gender","Employed (%)_2010-11","Unemployed (%)_2010-11","Not in labour force (%)_2010-11","Total persons (%)_2010-11","Employed (%)_2015","Unemployed (%)_2015","Not in labour force (%)_2015","Total persons (%)_2015"))

#Keep data only from rows with overall data on males and females (rows 35 and 70) and remove total columns
Labour_force <- Labour_force [c(35,70),c(-5,-9)]

#Gather Labour_force dataset to make data into long format
Labour_force <- Labour_force %>% gather(`Employed (%)_2010-11`, `Unemployed (%)_2010-11`, `Not in labour force (%)_2010-11`, `Employed (%)_2015`, `Unemployed (%)_2015`, `Not in labour force (%)_2015`, key = "Characteristic", value = "value")

#Spread Labour_force dataset to make gender column into wide format
Labour_force <- Labour_force %>% spread(key = "Gender", value = "value")

#Rename male and female columns in labour_force dataset
colnames(Labour_force) <- c("Education or employment status","Male", "Female")

#Mutate to add employment or eduation column
Labour_force <- Labour_force %>% mutate(Employment_or_education = "Employment")

#Gather Labour_force dataset to make data into long format
Labour_force <- Labour_force %>% gather(Male, Female, key = "Gender", value = "% of gender")

#Check revised dataset
head(Labour_force)

#Join the two datasets and keep all observations
Joined_dataset <- full_join(Education, Labour_force)

## Joining, by = c("Education or employment status", "Gender", "% of gender", "Employment_or_education")

#Show output
head(Joined_dataset,30)

Understand

The first step was to check and the dimensions and structure of the data. This confirmed:
the dataset is a dataframe
there are 30 observations
there are 4 variables
there are multiple data types: 3 variables are characters and one variable is a numeric variable.
the numeric variable is a double
The check also indicated that the character variables should be factors. The 3 character variables (‘Gender’, ‘Employment_or_education’ and ‘Education or employment status’) were changed to factors.
The structure of the dataset was rechecked, and this confirmed that the 3 variables have been converted from characters to factors.
The levels of the 3 factors were checked. No change was needed to the levels of 2 of the 3 variables (Gender and Employment_or_education). The 3rd factor variable (Education or Employment Status) had its levels relabelled - names were revised slightly and the year 2018 was added to the Education-related levels (the education data was current to 2018). This was done to ensure consistency with the employment-related levels.
The revised levels were checked and appear ok. The output was shown.

#Check dimensions of joined dataset
dim(Joined_dataset)

## [1] 30  4

#Check structure of joined dataset
str(Joined_dataset)

## Classes 'tbl_df', 'tbl' and 'data.frame':    30 obs. of  4 variables:
##  $ Education or employment status: chr  "Postgraduate Degree" "Graduate Diploma/Graduate Certificate" "Bachelor Degree" "Advanced Diploma/Diploma" ...
##  $ Gender                        : chr  "Male" "Male" "Male" "Male" ...
##  $ % of gender                   : num  6.4 2 16.6 8.8 23.8 19 5.3 10.4 7.6 6.6 ...
##  $ Employment_or_education       : chr  "Education" "Education" "Education" "Education" ...

#Check type of numeric variabe
typeof(Joined_dataset$`% of gender`)

## [1] "double"

#Apply data conversions to change character variables to factors
Joined_dataset$Gender <- as.factor(Joined_dataset$Gender)
Joined_dataset$Employment_or_education <- as.factor(Joined_dataset$Employment_or_education)
Joined_dataset$`Education or employment status` <- as.factor(Joined_dataset$`Education or employment status`)

#Recheck strucuture and data types
str(Joined_dataset) #confirms change of 3 variables from characters to factors

## Classes 'tbl_df', 'tbl' and 'data.frame':    30 obs. of  4 variables:
##  $ Education or employment status: Factor w/ 15 levels "Advanced Diploma/Diploma",..: 10 7 2 1 4 15 14 13 3 10 ...
##  $ Gender                        : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 1 ...
##  $ % of gender                   : num  6.4 2 16.6 8.8 23.8 19 5.3 10.4 7.6 6.6 ...
##  $ Employment_or_education       : Factor w/ 2 levels "Education","Employment": 1 1 1 1 1 1 1 1 1 1 ...

#Check levels of factors
levels(Joined_dataset$`Education or employment status`)

##  [1] "Advanced Diploma/Diploma"             
##  [2] "Bachelor Degree"                      
##  [3] "Below Year 10"                        
##  [4] "Certificate III/IV"                   
##  [5] "Employed (%)_2010-11"                 
##  [6] "Employed (%)_2015"                    
##  [7] "Graduate Diploma/Graduate Certificate"
##  [8] "Not in labour force (%)_2010-11"      
##  [9] "Not in labour force (%)_2015"         
## [10] "Postgraduate Degree"                  
## [11] "Unemployed (%)_2010-11"               
## [12] "Unemployed (%)_2015"                  
## [13] "Year 10"                              
## [14] "Year 11"                              
## [15] "Year 12 or equivalent"

levels(Joined_dataset$Gender)

## [1] "Female" "Male"

levels(Joined_dataset$Employment_or_education)

## [1] "Education"  "Employment"

#Relabel Education or employment status variable
levels(Joined_dataset$`Education or employment status`) <- c("Diploma_2018","Bachelor degree_2018","Below year 10_2018", "Certificate 3 or 4_2018", "Employed_2011","Employed_2015", "Graduate Diploma or Graduate Certificate_2018", "Not in labour force_2011", "Not in labour force_2015", "Masters_2018", "Unemployed_2011", "Unemployed_2015", "Year 10_2018", "Year 11_2018", "Year 12_2018")

#Recheck levels of factor variable
levels(Joined_dataset$`Education or employment status`)

##  [1] "Diploma_2018"                                 
##  [2] "Bachelor degree_2018"                         
##  [3] "Below year 10_2018"                           
##  [4] "Certificate 3 or 4_2018"                      
##  [5] "Employed_2011"                                
##  [6] "Employed_2015"                                
##  [7] "Graduate Diploma or Graduate Certificate_2018"
##  [8] "Not in labour force_2011"                     
##  [9] "Not in labour force_2015"                     
## [10] "Masters_2018"                                 
## [11] "Unemployed_2011"                              
## [12] "Unemployed_2015"                              
## [13] "Year 10_2018"                                 
## [14] "Year 11_2018"                                 
## [15] "Year 12_2018"

#Show output
head(Joined_dataset)

Tidy & Manipulate Data I

The dataset is not in tidy form. Observations in the ‘Education or employment status’ column have dates (year) as part of the observation. The data was tidied by splitting the column into two columns: ‘Education or employment status’ and ‘Year’. This was done by using the separate() function.
The output was then shown using the head() function.
Although this is the only tidying step completed under this scection, a number of tidying processes were undertaken in an earlier section of this report (see section titled ‘’Data’). Those tidying steps included:
- Reshaping data from wide to long format and vice versa
- Dropping columns/rows
- Creating new variables
- Changing data types (characters to factors)
- Tidying up variable names imported as observations

#Spread Labour_force dataset to make gender column into wide format
Joined_dataset <- Joined_dataset %>% tidyr::separate(`Education or employment status`, into = c("Education or employment status", "Year"), sep = "_")

#Show output
head(Joined_dataset)

Tidy & Manipulate Data II

This was done under the data step i.e. when importing the data and preprocessing it ready to join the two datasets.
New variables were created within the Education dataset to show the % of male/female persons who have the different level of education qualifications. This was done using the mutate() function. The values were the converted to percentages and the figures were rounded to 1 decimal place.
The reason for creating this new column of data was because, prior to creating the new variables, the Education dataset contained raw numbers of males and females with the various eucational qualifications. However the labour_force dataset contained % rather than raw numbers. Therefore the education dataset was updated to include new columns showing percentages, to be consisentent for when the two datasets were merged. The columns containing the raw numbers were subsequently dropped.
The Employment_or_education variable was also created in an earlier step to help categorise the Education or employment status column.

#Refer to Data section at top

Scan I

Firstly, the data was checked for missing, infinite and NaN values. To do this, a function was created that woul allow us to check for those values.
Next, the function was applied to the Joined_dataset. This showed no infinite, NaN or NA values in the numerical variable coumn (% of gender).
Next, the data was checked for inconsistences and errors. 4 rules were created (in a txt file) and imported into R. Those rules checked that :
- Gender was only recorded as Male or Female
- Year was only recorded as 2011, 2015 or 2018
- Employment or Education variable was only recorded as Education or Employment
- Education or employment status was only recorded as one of : ‘Bachelor degree’, ‘Below year 10’, ‘Certificate 3 or 4’, ‘Diploma’, ‘Employed’, ‘Graduate Diploma or Graduate Certificate’, ‘Masters’, ‘Not in labour force’, ‘Unemployed’, ‘Year 10’, ‘Year 11’, ‘Year 12’
The rules were applied to the dataframe and checked for errors using the violatedEdits function. This confirmed no violations were detected.

#Create function that will allow us to check for infinite, NaN or NA values
is.specialorNA <- function(x){if (is.numeric(x)) (is.infinite(x) | is.nan(x) | is.na(x))} 

#Apply function to the joined_dataset
sapply(Joined_dataset, is.specialorNA) #no infinite, NaN or NA values in the numerical column (% of gender)

## $`Education or employment status`
## NULL
## 
## $Year
## NULL
## 
## $Gender
## NULL
## 
## $`% of gender`
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 
## $Employment_or_education
## NULL

#Check for obvious inconsistencies or errors
#Load rules from txt file
Rules <- editfile("rules.txt")
Rules

## 
## Data model:
## dat1 : Education or employment status %in% c('Bachelor degree', 'Below year 10', 'Certificate 3 or 4', 'Diploma', 'Employed', 'Graduate Diploma or Graduate Certificate', 'Masters', 'Not in labour force', 'Unemployed', 'Year 10', 'Year 11', 'Year 12')
## dat2 : Employment_or_education %in% c('Education', 'Employment')
## dat3 : Gender %in% c('Female', 'Male')
## dat4 : Year %in% c('2011', '2015', '2018') 
## 
## Edit set:
## NULL :

#Check and show summary of violated rules
ve <- violatedEdits(Rules, Joined_dataset)
summary(ve)

## No violations detected, 0 checks evaluated to NA

## NULL

Scan II

The ‘Joined_dataset’ dataframe has just one numeric variable: ‘% of gender’. It is acknowledged that the data in this column are % and are based on actual raw numbers and so it is unlikely that the values are outliers - nevertheless, I have still approached the values in this column as though they potentially could be outliers.
Using the Joined_dataset data frame, I inspected the variable ‘% of gender’ for possible univariate outliers. Tukey’s method of outlier detection was used. This was done by producing a box plot. The boxplot shows 4 univariate outliers.
The capping approach to outlier removal will be used. This was done because z scores cannot be used (this was attempted but gave a result of “integer (0)” and was considered unreliable) hence deleting/removing outliers and imputing outliers will not work.
To cap the vales outside the limits, a function was created. Before applying the function to the numeric variable, a new R object was created. This new object was created so the numeric variable could be saved as its own R object (ie a subset of the Joined_dataset).
Summary statistics were shown. This will provide a point of comparison after we apply capping.
Capping was then applied to the subsetted data (ie the Joined_dataset2 dataframe).
Two things were then done to check whether outliers have been removed:
- Summary statistics were then shown again. This indicates at least some outliers have been removed.
- Another boxplot was produced. This boxplot shows one outlier.
No further attempts were made to remove this outlier, given that it is not unexpected that further outliers appear after applying the capping strategy above.

#Produce boxplot to test for outliers
Joined_dataset$`% of gender` %>%  boxplot(main="Box Plot of % of gender variable", ylab="Percentage", col = "blue") #4 apparent outliers identified

#Create a function to cap the values outside the limits
cap <- function(x){
    quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
    x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
    x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
    x
}

#Take numeric variable and save as an object
Joined_dataset2 <- Joined_dataset %>%  dplyr::select(`% of gender`)

#Show summary statistics of new dataframe
summary(Joined_dataset2)

##   % of gender   
##  Min.   : 2.00  
##  1st Qu.: 5.95  
##  Median :11.70  
##  Mean   :20.00  
##  3rd Qu.:20.35  
##  Max.   :81.40

#Apply function "cap" to new dataframe
Joined_dataset2_capped <- sapply(Joined_dataset2, FUN = cap)

#Check summary statistics again
summary(Joined_dataset2_capped)

##   % of gender   
##  Min.   : 2.00  
##  1st Qu.: 5.95  
##  Median :11.70  
##  Mean   :20.01  
##  3rd Qu.:20.35  
##  Max.   :74.45

#Check outliers removed by producing another boxplot, this time of the capped dataframe
Joined_dataset2_capped %>%  boxplot(main="Box Plot of % of gender variable to check capping of outliers", ylab="Percentage", col = "blue") #shows 1 outlier

Transform

The focus of the transformation is the ‘% of gender’ variable.
The first step was to generate a histogram to visualise the distribution of the data.
A histogram shows that the data is right (or positively) skewed. This is not surprising as many of the values are quite low, with a few extremes at the top end of the scale.
Given the data is right skewed, the data will be transformed using mathematical operations in an attempt to achieve a normal distribution. This approach was taken given the preference in statistical inference for normal distribution as opposd to right or left sweked data.
A range of mathematical transformations were applied to the data including logarithmic, square root and reciprocal transformations. One that works well will then be identified.
Certain mathematical operations were not considered relevant, and were not used. For example, the square transformation was not applied, because that type of transformation works for left skewed data. In this case, our data is right skewed.
Firstly, the log transformation was applied to the ‘% of gender’ variable. A histogram of the transformed data was then viewed. The histogram shows that the log transformation has reduced the skewness and the distribution of the data has been converted into a more normal-type distribution.
Secondly, a natural log transformation was applied to the data. A histogram of the transformed data was then viewed. The histogram shows that the natural log transformation has reduced the skewness and the data has been converted to a normal distribution.
Thirdly, a square root transformation was applied to the data. A histogram of the transformed data was then viewed. The histogram shows that the data is still somewhat right skewed. The transformation has not succeeded in normalising the data.
The final approach taken was to apply a reciprocal transformation. A histogram shows that the data is still right skewed. Thus, the reciprocal transformation has not succeeded in normalising the data.
Based on the four mathematical operation approaches tested, the natural log transformation best reduces the skewness and converts the distribution to a normal distribution.

#View histogram of '% of gender variable'
hist(Joined_dataset$`% of gender`)

#Apply log transformation and view histogram of log 10 transformed data
log_joined_dataset <- log10(Joined_dataset$`% of gender`)
hist(log_joined_dataset, main = "Histogram of log10 transformed data")

#Apply natural log transformation and view histogram of natural log transformed data
ln_joined_dataset <- log(Joined_dataset$`% of gender`)
hist(ln_joined_dataset, main = "Histogram of natural log transformed data")

#Apply square root transformation and view histogram of square root transformed data
sqrt_joined_dataset <- sqrt(Joined_dataset$`% of gender`)
hist(sqrt_joined_dataset, main = "Histogram of square root transformed data")

#Apply reciprocal transformation and view histogram of transformed data
joined_dataset_recip <- 1/Joined_dataset$`% of gender`
hist(joined_dataset_recip, main = "Histogram of reciprocal transformed data")