Data Wrangling Assessment Task 3: Dataset challenge

Voiceover Presentation URL: https://www.loom.com/share/30a1f8ad097a4a18ae8f48b1d2413d1f

Required packages

library (readxl)

## Warning: package 'readxl' was built under R version 4.1.1

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Data

# CPI and All Ordinaries Index are closely related, both are measures of the state of the Australian economy.

# CPI (Consumer Price Index), commonly known as inflation, measures the percentage change in the price of a basket of goods and services consumed by households (Reserve Bank of Australia, RBA). 
# For this report, the quarterly CPI from 1-9-2001 to 1-6-2021 (20 years) are extracted from the Australian Bureau of Statistics (ABS) website, and the original title is:
# "Percentage Change from Previous Period ;  All groups CPI ;  Australia".  

# The file containing the date and the CPI is called ABSCPI2.xlsx. 

# All Ordinaries Index (AORD) was established in 1980. It is made up of the share prices for 500 of the largest companies listed on the Australian Securities Exchange (wikipedia).

# The AORD indices from 1-6-2001 to 1-6-2021 are found on Investing.com website, and the quarterly % index movements was calculated by using the same formula for the CPI, which is: (AORDt+1 - AORDt) x 100/ AORDt. Note that the first calculated % change is on 1-9-2001.

# The file containing the date, the AORD and the % index movements (change) is called AllOrd.xlsx.

# Step 1: Get two open real-life datasets

## The CPI data are read from the file "ABSCPI2.xlsx"

CPI<- read_excel("~/data/ABSCPI2.xlsx", sheet = "Data2")

head(CPI)

str(CPI)

## tibble [80 x 2] (S3: tbl_df/tbl/data.frame)
##  $ Dates: POSIXct[1:80], format: "2001-09-01" "2001-12-01" ...
##  $ CPI  : num [1:80] 0.3 0.9 0.9 0.6 0.5 0.6 1.3 0 0.1 0.9 ...

attributes(CPI)

## $names
## [1] "Dates" "CPI"  
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [51] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## [76] 76 77 78 79 80
## 
## $class
## [1] "tbl_df"     "tbl"        "data.frame"

## The All Ordinaries Index data are read from the file AllOrd.xlsx"

AORD<-  read_excel("~/data/AllOrd.xlsx", sheet = )

head(AORD)

str(AORD)

## tibble [80 x 3] (S3: tbl_df/tbl/data.frame)
##  $ Date  : POSIXct[1:80], format: "2001-09-01" "2001-12-01" ...
##  $ Price : num [1:80] 2988 3360 3363 3163 2928 ...
##  $ Change: num [1:80] -12.7642 12.4498 0.0982 -5.9495 -7.426 ...

attributes(AORD)

## $names
## [1] "Date"   "Price"  "Change"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [51] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## [76] 76 77 78 79 80
## 
## $class
## [1] "tbl_df"     "tbl"        "data.frame"

Dates <- CPI$Dates

attributes(Dates)

## $class
## [1] "POSIXct" "POSIXt" 
## 
## $tzone
## [1] "UTC"

## Classes of Dates are "POSIXct" and "POSIXt" 
## POSIXct is the number of seconds since the epoch. In this case the epoch Jan 1st 1970. 
## POSIXlt is a mixed text and character format like. May, 6 1985.
## Convert "POSIXct" and "POSIXt" to the classes of "character" and "Date"

NewDates <- format(as.Date(Dates, format = "%d/%m/%y"), format = "%d/%m/%y")

attributes(NewDates)

## NULL

class(NewDates)

## [1] "character"

NewDates2<-as.Date(NewDates)

attributes(NewDates2)

## $class
## [1] "Date"

# Select only the CPI variable in "CPI" as the Dates matches those for the AORD.

CPIOnly <- CPI$CPI

head(CPIOnly)

## [1] 0.3 0.9 0.9 0.6 0.5 0.6

str(CPIOnly)

##  num [1:80] 0.3 0.9 0.9 0.6 0.5 0.6 1.3 0 0.1 0.9 ...

attributes(CPIOnly)

## NULL

class(CPIOnly)

## [1] "numeric"

CPIOnly

##  [1]  0.3  0.9  0.9  0.6  0.5  0.6  1.3  0.0  0.1  0.9  0.9  0.4  0.5  0.9  0.5
## [16]  0.6  1.0  0.4  0.8  1.6  0.9 -0.2 -0.1  1.2  0.5  0.9  1.3  1.6  1.1 -0.3
## [31]  0.1  0.4  1.1  0.5  0.8  0.4  0.7  0.4  1.6  1.0  0.7 -0.1  0.1  0.6  1.7
## [46]  0.1  0.4  0.4  1.2  0.7  0.6  0.4  0.6  0.2  0.5  0.9  0.3  0.3 -0.2  0.6
## [61]  1.0  0.5  0.4  0.4  0.7  0.7  0.3  0.4  0.6  0.4 -0.1  0.7  0.5  0.5  0.3
## [76] -2.3  1.8  1.0  0.4  0.8

## The "Change" variable in AORD measures the % change in the All Ord Index quarterly, the same formula as the CPI published by ABS.
## Select only the "Change" variable in "AORD" as the Dates matches those for the CPI, and named as AORDC.

AORDC<- round((AORD$Change),2)

head(AORDC)

## [1] -12.76  12.45   0.10  -5.95  -7.43   1.61

str(AORDC)

##  num [1:80] -12.76 12.45 0.1 -5.95 -7.43 ...

attributes(AORDC)

## NULL

class(AORDC)

## [1] "numeric"

AORDC

##  [1] -12.76  12.45   0.10  -5.95  -7.43   1.61  -4.26   5.30   5.88   4.09
## [11]   3.33   3.34   4.09  10.30   1.17   3.15   8.57   2.53   8.04  -1.05
## [21]   1.57  10.39   5.93   5.55   4.28  -2.43 -15.75  -1.42 -13.16 -20.99
## [31]  -3.47  11.76  20.05   3.03   0.21 -11.61   7.22   4.53   1.69  -5.45
## [41] -12.66   1.00   7.52  -6.44   6.55   5.86   6.76  -4.11   9.26   2.60
## [51]   0.93  -0.39  -1.58   1.73   8.78  -7.01  -7.20   5.65  -3.61   3.08
## [61]   4.04   3.51   3.23  -2.37  -0.33   7.35  -4.84   7.17   0.57  -9.74
## [71]   9.67   6.99   1.51   0.03 -24.87  17.43   0.13  14.00   2.43   8.09

# Step 2: Merge the datasets using R functions

Merge<-tbl_df(data.frame(cbind(NewDates, CPIOnly, AORDC)))

## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.

head(Merge)

str(Merge)

## tibble [80 x 3] (S3: tbl_df/tbl/data.frame)
##  $ NewDates: chr [1:80] "01/09/01" "01/12/01" "01/03/02" "01/06/02" ...
##  $ CPIOnly : chr [1:80] "0.3" "0.9" "0.9" "0.6" ...
##  $ AORDC   : chr [1:80] "-12.76" "12.45" "0.1" "-5.95" ...

attributes(Merge)

## $names
## [1] "NewDates" "CPIOnly"  "AORDC"   
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [51] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## [76] 76 77 78 79 80
## 
## $class
## [1] "tbl_df"     "tbl"        "data.frame"

Merge

Understand

# Step 3 Your dataset should include multiple data types (numerics, characters, factors, dates etc.).

# The following data types are included:

## 1. Numerics are given by the CPI, the Price and the Change variables.
## 2. Characters are given by the "NewDates", "CPIOnly" and "AORDC" in the Merge dataset.
## 3. As required in Step 5, factor variables are created from the two original datasets in the Box Plot outlier analysis, shown after Step 9 as: Very_Low, Low, IQR, High, Very_High to describe the distribution of the CPI and AORDC.  The factors are labelled and ordered, and used for the ranking of both CPI and AORDC
## 4. Dates are shown in the data types of: POSIXct, POSIXt, character strings and date.
## 5. As required in Step 4, variables are suitable for data type conversions, and they have been carried out throughout this report by using R functions.  e.g. as.Date, character to factor, character string to date etc
## 6. The R Codes and outputs are shown in various steps in this report.

Tidy & Manipulate Data I

# Step 6 Is the dataset in a tidy or untidy format?

# The original data from ABS contains many data that are not relevant to this assessment, only the All groups CPI for Australia data have been extracted.
# The All Ordinaries Index data contains the price (index) series, the quarterly % index movements are calculated so that they are comparable to the CPI numbers.
# The end results conform to the tidy data principles.
# The two datasets are tidy as they meet the principle of a tidy file, which are:
## Each observation has its own row.
## Each variable has its own column.
## Each value has its own cell.
## The dates, the CPI and the AORDC are extracted from the two original datasets.  The data types are checked, converted to the correct types before merging.  
## The Merge dataset is shown in a wide format (preferred under the 3 tidy data principles) where each of the CPI/AORDC observations has its own row, and the variables CPI/AORDC each has its own column, and each value has its own cell. 
## The R codes and outputs are shown under Steps 1 and 2.

Tidy & Manipulate Data II

# Step 7 At least one variable needs to be created/mutated from the existing ones

# Several variables are created from the existing variables:
## 1. NewDates (character) and NewDates2 (date format) are created from the dates, which have the formats of "POSIXct" "POSIXt originally.
## 2. CPIOnly (numeric) is created from CPI, which contains both Dates and CPI originally.
## 3. AORDC (numeric) is created from AORD, which contains Date, Price and Change originally.
## 4. Merge is created from NewDates, CPIOnly and AORDC.
## 5. From the Box Plot outlier analysis, the CPIOnly and AORDC are classified into different rankings: Very Low, Low, IQR, High and Very High.  This was achieved by calculating lower fence, q1 (quartile 1), IQR (q3 - q1), q3 (quartile 3), and upper fence, and counting the frequency in each category.  A comparison between the frequencies for CPIOnly and AORDC is made and commented.
## 6. The rankings are used to create the factor variable required in Step 5. 
## 7. The R codes and outputs are shown in Step 1, 2 5, 7 and 9.

Scan I

# Step 8: Scan all variables for missing values and inconsistencies.

## Missing values are represented in R by the NA (or na) symbol. 
## NA (or na) can arise when empty cells in Excel spreadsheet are read.
## Several methods could be used to find the missing values.  However, the Data Wrangling course recommended colSums. It states: "the colSums approach is more practical to see which variables actually have missing variables", in comparison to "for loop" for large dataset.  (Data Wrangling Modules 4.1.2 to 4.1.4).
## Special values are used in R, these are -Inf, Inf and NaN, and they are checked here too.
## Other inconsistencies may show up as outliers, these are discussed in Step 9.

# Identify missing values

Missingval <- colSums(is.na(Merge))

Missingval

## NewDates  CPIOnly    AORDC 
##        0        0        0

## The result shows that there is no missing value.

# check infinite (-inf or +inf) values

infinite1<- is.infinite(CPIOnly)
infinite2<- is.infinite(AORDC)

infinite1

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

infinite2

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

## check not a number (NaN) values

Nan1<- is.nan(CPIOnly)
Nan2<- is.nan(AORDC)

Nan1

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Nan2

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

## The results show that there are no special values.

Scan II

# Step 9 Scan all numeric variables for outliers.

# Apply boxplot() function for CPI and AORDC
# Identify lower fence, q1, IQR, q3, upper fence and outliers for CPI and AORDC
# The oultiers are either below the lower fence or above the upper fence
# A bivariate scatter plot on both the CPIOnly and AORDC is carried out to identify any outlier between the pair.

CPIOnly %>%

boxplot(main = "Box Plot of CPI", ylab = "CPI", col = "grey")

q1 <- quantile(CPIOnly, probs = 0.25)

q3 <- quantile(CPIOnly, probs = 0.75)

q1

## 25% 
## 0.4

q3

## 75% 
## 0.9

iqr <- q3 - q1

iqr

## 75% 
## 0.5

lower_fence <- q1 - (1.5 * iqr) 

upper_fence <- q3 + (1.5 * iqr)

lower_fence

##   25% 
## -0.35

upper_fence

##  75% 
## 1.65

# Identify the frequency of CPI from Very Low to Very High

CPI1<- sum(CPIOnly < lower_fence)

CPI2<- sum(CPIOnly < q1)

CPI3<- sum(CPIOnly <= q3)

CPI4<- sum(CPIOnly <upper_fence)

CPI5<- sum(CPIOnly >= upper_fence)

CPI1

## [1] 1

CPI2

## [1] 18

CPI3

## [1] 65

CPI4

## [1] 78

CPI5

## [1] 2

Very_Low_CPI<- CPI1

Low_CPI<- CPI2-CPI1

IQR_CPI<-CPI3-CPI2

High_CPI<- CPI4-CPI3

Very_High_CPI <- CPI5


Very_Low_CPI

## [1] 1

Low_CPI

## [1] 17

IQR_CPI

## [1] 47

High_CPI

## [1] 13

Very_High_CPI

## [1] 2

# Identify the frequency (in %) of CPI from Very Low to Very High

Very_Low_CPI_Perc<- CPI1 *(100/80)

Low_CPI_Perc<- (CPI2-CPI1) * (100/80)

IQR_CPI_Perc<-(CPI3-CPI2) * (100/80)

High_CPI_Perc<- (CPI4-CPI3) * (100/80)

Very_High_CPI_Perc <- CPI5* (100/80)

Very_Low_CPI_Perc

## [1] 1.25

Low_CPI_Perc

## [1] 21.25

IQR_CPI_Perc

## [1] 58.75

High_CPI_Perc

## [1] 16.25

Very_High_CPI_Perc

## [1] 2.5

# From the Boxplot results for CPI:
# Lower Fence = -0.35
# q1 = 0.4
# q2 = 0.55
# q3 = 0.9
# Upper Fence = 1.65
# IQR = q3 - q1 = 0.5

#Very_Low_CPI = 1 (1.25%) - Outliers

#Low_CPI = 17 (21.25%)

#IQR_CPI = 47 (58.75%)

#High_CPI = 13 (16.25%)

#Very_High_CPI = 2 (2.5%) - Outliers


#
#Apply boxplot() function for AORDC

#Identify lower fence, q1, IQR, q3, upper fence and outliers for AORDC

AORDC %>%
  
  boxplot(main = "Box Plot of AORDC", ylab = "AORDChange", col = "grey")

Aq1 <- quantile(AORDC, probs = 0.25)

Aq3 <- quantile(AORDC, probs = 0.75)

Aq1

##    25% 
## -2.385

Aq3

##   75% 
## 6.085

Aiqr <- Aq3 - Aq1

Aiqr

##  75% 
## 8.47

Alower_fence <- Aq1 - (1.5 * Aiqr) 

Aupper_fence <- Aq3 + (1.5 * Aiqr)

Alower_fence

##    25% 
## -15.09

Aupper_fence

##   75% 
## 18.79

# Identify the frequency of AORDC from Very Low to Very High

AORDC1<- sum(AORDC < Alower_fence)

AORDC2<- sum(AORDC < Aq1)

AORDC3<- sum(AORDC <= Aq3)

AORDC4<- sum(AORDC <Aupper_fence)

AORDC5<- sum(AORDC >= Aupper_fence)

AORDC1

## [1] 3

AORDC2

## [1] 20

AORDC3

## [1] 60

AORDC4

## [1] 79

AORDC5

## [1] 1

Very_Low_AORDC<- AORDC1

Low_AORDC<- AORDC2-AORDC1

IQR_AORDC<-AORDC3-AORDC2

High_AORDC<- AORDC4-AORDC3

Very_High_AORDC <- AORDC5

Very_Low_AORDC

## [1] 3

Low_AORDC

## [1] 17

IQR_AORDC

## [1] 40

High_AORDC

## [1] 19

Very_High_AORDC

## [1] 1

# Identify the frequency AORDC expressed in % from Very Low to Very High

Very_Low_AORDC_Perc<- AORDC1 *(100/80)

Low_AORDC_Perc<- (AORDC2-AORDC1) * (100/80)

IQR_AORDC_Perc<-(AORDC3-AORDC2) * (100/80)

High_AORDC_Perc<- (AORDC4-AORDC3) * (100/80)

Very_High_AORDC_Perc <- AORDC5* (100/80)

Very_Low_AORDC_Perc

## [1] 3.75

Low_AORDC_Perc

## [1] 21.25

IQR_AORDC_Perc

## [1] 50

High_AORDC_Perc

## [1] 23.75

Very_High_AORDC_Perc

## [1] 1.25

# From the Boxplot results for AORDC:
# Lower Fence = -15.09
# q1 = * -2.385
# q3 = 6.085
# Upper Fence = 18.79
# IQR = q3 - q1 = 8.47


#Very_Low_AORDC = 3 (3.75%) - Outliers

#Low_AORDC = 17 (21.25%)

#IQR_AORDC = 40 (50.00%)

#High_AORDC = 19 (23.75%)

#Very_High_CPI = 1 (1.25%) - Outliers


#
# Bivariate Scatter Plot of CPIOnly and AORDC for Outliers

plot(CPIOnly, AORDC, main="Bivariate Scatter plot",
   xlab="CPIOnly", ylab="AORDC", pch=15)

# The bivariate scatter plot shows that there are at least 3 outliers: one at the north west corner that is high AORDC close to 20 vs a low and negative CPI less than -2; two below AORDC of -20 when CPIOnly are close to zero.

# Step 5: The dataset should include at least one factor variable that needs to be labelled and/or ordered.
# The factor variable is created below.

# The factors are:  Very_Low, Low, IQR, High, Very_High
# The factors are defined with the quartiles identified by the Box Plot.
# Very_Low means below the lower fence (outlier)
# Low means between Lower fence and q1
# IQR means between q1 and q3
# High means between q3 and the upper fence
# Very_High means more than the upper fence (outlier)

Ord_vector<- c("Low", "Very_Low", "High", "IQR", "Very_High")

factor_Ord_vector<- tbl_df(data.frame(Rank = factor(Ord_vector, order = TRUE, levels = c("Very_Low", "Low", "IQR", "High", "Very_High"))))

str(factor_Ord_vector)

## tibble [5 x 1] (S3: tbl_df/tbl/data.frame)
##  $ Rank: Ord.factor w/ 5 levels "Very_Low"<"Low"<..: 2 1 4 3 5

Ranking<-c("Very_Low", "Low", "IQR", "High", "Very_High")
CPI_Freq_In_Perc<-c(Very_Low_CPI_Perc, Low_CPI_Perc, IQR_CPI_Perc, High_CPI_Perc, Very_High_CPI_Perc)
AORDC_Freq_In_Perc<-c(Very_Low_AORDC_Perc, Low_AORDC_Perc, IQR_AORDC_Perc, High_AORDC_Perc, Very_High_AORDC_Perc)

CPI_Freq_In_Perc

## [1]  1.25 21.25 58.75 16.25  2.50

AORDC_Freq_In_Perc

## [1]  3.75 21.25 50.00 23.75  1.25

Ranking_of_changes_In_Perc<- cbind(Ranking, CPI_Freq_In_Perc, AORDC_Freq_In_Perc )
Ranking_of_changes_In_Perc

##      Ranking     CPI_Freq_In_Perc AORDC_Freq_In_Perc
## [1,] "Very_Low"  "1.25"           "3.75"            
## [2,] "Low"       "21.25"          "21.25"           
## [3,] "IQR"       "58.75"          "50"              
## [4,] "High"      "16.25"          "23.75"           
## [5,] "Very_High" "2.5"            "1.25"

# The ranking by Box Plot for CPI and the AORDC shows the following results:
## 1. AORDC has more outliers below the lower fence than CPI (Rank: Very_Low)
## 2. AORDC has more observations between q3 and the upper fence than CPI (Rank: High)
## 3. AORDC and CPI have same number of observations between lower fence and q1 (Rank: Low)
## 4. CPI has more observations in IQR than AORDC.
## 5. CPI has more outliers above the upper fence than AORDC (Rank: Very_High)

Transform

# Step 10: Apply data transformations on at least one of the variables.

# The procedure used is to check the shape of the histogram before a transformation is carried out.
# As per the requirement, the transformation below is applied to CPI only.  No analysis is carried out for AORDC.

# Check shape of CPI by Histogram

hist(CPIOnly, main = "Histogram of CPI", xlab = "CPI")

# The CPI histogram shows that it is left skewed.

# Perform a power of square transformation to CPI

CPIOnlySq <- CPIOnly^2

hist(CPIOnlySq, main = "Histogram of CPI Square", xlab = "CPI")

# The resulting Histogram shows that the square transformation has made the distribution right skewed.
# Through trial and error, a transformation using a power of 0.6 gives the closest shape to a normal distribution.

CPIOnlypower0.6 <- CPIOnly^0.6

hist(CPIOnlypower0.6, main = "Histogram of CPI to power of 0.6", xlab = "CPI")

** Data Reference **

ABS CPI viewed 7-10-2021 https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/consumer-price-index-australia/latest-release#data-download
All Ordinaries Index Historical Prices viewed 7-10-2021 https://www.investing.com/indices/all-ordinaries-historical-data

** Reference **

All Ordinaries Index definition viewed 9-10-21 https://en.wikipedia.org/wiki/All_Ordinaries
Inflation and its Measurement, RBA viewed 7-10-2021 https://www.rba.gov.au/education/resources/explainers/inflation-and-its-measurement.html
RMIT Online Jul-Dec 2021 (2149), Data Wrangling Course Notes, Math 2405, RMIT University

Data Wrangling Assessment Task 3: Dataset challenge

CPI and All Ordinaries Index (Australia)

Wing Siang Chang, s3918329

Required packages

Data

Understand

Tidy & Manipulate Data I

Tidy & Manipulate Data II

Scan I

Scan II

Transform