The purpose of this exercise is to design and implement an entire data preparation pipeline in R. We would like you to implement a robust, extensible and generic framework for data preparation.
# load libraries
library(gamlss)
library(gamlss.add)
library(gamlss.dist)
library(DT)
library(roll)
library(dplyr)
library(stats)
library(tseries)
library(ggpubr)
library(psych)
library(magrittr)
library(labelVector)
# create function to pass rawdata
data_preparation_process <- function(rawdata) {
data_oil <- rawdata
return(data_oil)
}
data_oil <- data_preparation_process(gamlss.data::oil)
# summary statistics
summary(data_oil)
## OILPRICE CL2_log CL3_log CL4_log
## Min. :3.266 Min. :3.345 Min. :3.391 Min. :3.428
## 1st Qu.:3.966 1st Qu.:3.982 1st Qu.:4.001 1st Qu.:4.017
## Median :4.517 Median :4.519 Median :4.519 Median :4.518
## Mean :4.309 Mean :4.317 Mean :4.322 Mean :4.326
## 3rd Qu.:4.580 3rd Qu.:4.581 3rd Qu.:4.581 3rd Qu.:4.579
## Max. :4.705 Max. :4.696 Max. :4.682 Max. :4.672
## CL5_log CL6_log CL7_log CL8_log
## Min. :3.482 Min. :3.501 Min. :3.516 Min. :3.529
## 1st Qu.:4.032 1st Qu.:4.048 1st Qu.:4.058 1st Qu.:4.068
## Median :4.519 Median :4.518 Median :4.517 Median :4.514
## Mean :4.329 Mean :4.331 Mean :4.332 Mean :4.333
## 3rd Qu.:4.575 3rd Qu.:4.570 3rd Qu.:4.563 3rd Qu.:4.556
## Max. :4.672 Max. :4.673 Max. :4.673 Max. :4.673
## CL9_log CL10_log CL11_log CL12_log
## Min. :3.542 Min. :3.555 Min. :3.566 Min. :3.576
## 1st Qu.:4.075 1st Qu.:4.082 1st Qu.:4.090 1st Qu.:4.097
## Median :4.512 Median :4.509 Median :4.506 Median :4.503
## Mean :4.333 Mean :4.337 Mean :4.333 Mean :4.332
## 3rd Qu.:4.550 3rd Qu.:4.546 3rd Qu.:4.542 3rd Qu.:4.537
## Max. :4.672 Max. :4.670 Max. :4.667 Max. :4.663
## CL13_log CL14_log CL15_log BDIY_log
## Min. :3.585 Min. :3.594 Min. :3.603 Min. :5.670
## 1st Qu.:4.103 1st Qu.:4.110 1st Qu.:4.117 1st Qu.:6.596
## Median :4.500 Median :4.497 Median :4.493 Median :6.806
## Mean :4.332 Mean :4.331 Mean :4.331 Mean :6.787
## 3rd Qu.:4.532 3rd Qu.:4.527 3rd Qu.:4.522 3rd Qu.:7.011
## Max. :4.658 Max. :4.654 Max. :4.649 Max. :7.757
## SPX_log DX1_log GC1_log HO1_log
## Min. :7.153 Min. :4.369 Min. :6.956 Min. :-0.1442
## 1st Qu.:7.354 1st Qu.:4.391 1st Qu.:7.089 1st Qu.: 0.6220
## Median :7.531 Median :4.417 Median :7.159 Median : 1.0547
## Mean :7.481 Mean :4.459 Mean :7.192 Mean : 0.8600
## 3rd Qu.:7.611 3rd Qu.:4.557 3rd Qu.:7.345 3rd Qu.: 1.1013
## Max. :7.664 Max. :4.613 Max. :7.491 Max. : 1.1877
## USCI_log GNR_log SHCOMP_log FTSE_log
## Min. :3.650 Min. :3.317 Min. :7.576 Min. :8.568
## 1st Qu.:3.838 1st Qu.:3.787 1st Qu.:7.652 1st Qu.:8.716
## Median :4.021 Median :3.868 Median :7.734 Median :8.778
## Mean :3.962 Mean :3.818 Mean :7.840 Mean :8.760
## 3rd Qu.:4.070 3rd Qu.:3.920 3rd Qu.:8.032 3rd Qu.:8.813
## Max. :4.148 Max. :3.985 Max. :8.550 Max. :8.868
## respLAG
## Min. :3.266
## 1st Qu.:3.966
## Median :4.517
## Mean :4.310
## 3rd Qu.:4.580
## Max. :4.705
# view dataframe size
paste0("Data set consist of ", dim(data_oil)[1], " observations and ", dim(data_oil)[2], " variables. ")
## [1] "Data set consist of 1000 observations and 25 variables. "
a. Rolling standard deviation (of arbitrary window)
b. Rolling mean (of arbitrary window)
c. Lagging (of arbitrary order)
d. Leading (of arbitrary order)
e. Differencing
f. Spread (between two input drivers)
g. Ratio (between two input drivers)
h. Product (between two input drivers)
Below we create one function, where user can pass any input time series and the result will generate below columns
[Note: no value given for window, assume daily data so selected window is 7, similary for lag, lead and differencing selected order is 1]
data_pipeline_trans <- function(raw_data_3) {
# added input to the new data frame
df <- as.data.frame(raw_data_3)
# assign input to a new variable
data_oil_2 <- raw_data_3
# convert in to matrix
data_oil_2 <- as.matrix(data_oil_2)
# calculete rtolling standard deviation, window = 7
roll_sd_dev <- roll::roll_sd(data_oil_2, 7)
df$roll_sd_dev <- roll_sd_dev
# Rolling mean (of arbitrary window), window = 7
roll_mean <- roll::roll_mean(data_oil_2, 7)
df$roll_mean <- roll_mean
# Lagging (of arbitrary order), order = 1
df$lag_1 <- dplyr::lag(raw_data_3)
# Leading (of arbitrary order), order = 1
df$lead <- dplyr::lead(raw_data_3)
# Differencing
Diff <- raw_data_3 %>% diff()
Diff[1000] <- NA
df$diff <- Diff
return(df)
}
data_oil_trans <- data_pipeline_trans(data_oil$OILPRICE)
head(data_oil_trans, n =10)
## raw_data_3 roll_sd_dev roll_mean lag_1 lead diff
## 1 4.640923 NA NA NA 4.633077 -0.0078462165
## 2 4.633077 NA NA 4.640923 4.634049 0.0009720063
## 3 4.634049 NA NA 4.633077 4.646312 0.0122629838
## 4 4.646312 NA NA 4.634049 4.631520 -0.0147921680
## 5 4.631520 NA NA 4.646312 4.627616 -0.0039035865
## 6 4.627616 NA NA 4.631520 4.635214 0.0075979325
## 7 4.635214 0.006223043 4.635530 4.627616 4.635796 0.0005820722
## 8 4.635796 0.005767563 4.634798 4.635214 4.640055 0.0042582083
## 9 4.640055 0.006018100 4.635795 4.635796 4.645544 0.0054894923
## 10 4.645544 0.006957400 4.637437 4.640055 4.649665 0.0041213457
Lets create another function to calculate Spread, Ratio and Product using 2 input drivers.
f. Spread (between two input drivers) : I am not sure how to calculate Spread between 2 variables. Need little brief about this function.
data_pipeline_trans_2 <- function(raw_data_4){
df_1 <- as.data.frame(raw_data_4)
#df_1$Spread <-
df_1$Ratio <- df_1[,1]/df_1[,2]
df_1$Product <- df_1[,1] * df_1[,2]
return(df_1)
}
df_1 <- data_pipeline_trans_2(data_oil_trans[, c("roll_sd_dev", "roll_mean")])
head(df_1, n = 10)
## roll_sd_dev roll_mean Ratio Product
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## 7 0.006223043 4.635530 0.001342466 0.02884710
## 8 0.005767563 4.634798 0.001244404 0.02673149
## 9 0.006018100 4.635795 0.001298181 0.02789868
## 10 0.006957400 4.637437 0.001500268 0.03226450
Above data_pipeline_trans_2 function, I pass “roll_sd_dev”, “roll_mean” as input drivers. User can pass other drivers too.
data_pipeline_comp <- function(raw_data_5){
df2 <- as.data.frame(raw_data_5)
difference <- (df2$OILPRICE - df2$respLAG)
df2$difference <- difference
difference <- as.matrix(difference)
# calculete rtolling standard deviation, window = 7
roll_std <- roll::roll_sd(difference, 7)
df2$comp_trans <- roll_std
return(df2)
}
df2 <- data_pipeline_comp(data_oil[,c("OILPRICE", "respLAG")])
head(df2, n = 10)
## OILPRICE respLAG difference comp_trans
## 1 4.640923 4.631812 0.0091112388 NA
## 2 4.633077 4.640923 -0.0078462165 NA
## 3 4.634049 4.633077 0.0009720063 NA
## 4 4.646312 4.634049 0.0122629838 NA
## 5 4.631520 4.646312 -0.0147921680 NA
## 6 4.627616 4.631520 -0.0039035865 NA
## 7 4.635214 4.627616 0.0075979325 0.009882852
## 8 4.635796 4.635214 0.0005820722 0.009140087
## 9 4.640055 4.635796 0.0042582083 0.008704563
## 10 4.645544 4.640055 0.0054894923 0.008868343
Answer
Spread, Ratio and Product drivers act on must be specified by the user. User, needs to select 2 input drivers, while calling data_pipeline_trans_2 function.
Combine all the drivers from their raw form or those that result from the application of one or several transformations using cbind(), named dataset as final_drivers.
final_drivers <- cbind(data_oil_trans, df_1, df2)
final_drivers <-
final_drivers[, c("raw_data_3", "roll_sd_dev", "roll_mean",
"lag_1", "lead", "diff", "Ratio", "Product",
"comp_trans")]
head(final_drivers, n = 10)
## raw_data_3 roll_sd_dev roll_mean lag_1 lead diff Ratio
## 1 4.640923 NA NA NA 4.633077 -0.0078462165 NA
## 2 4.633077 NA NA 4.640923 4.634049 0.0009720063 NA
## 3 4.634049 NA NA 4.633077 4.646312 0.0122629838 NA
## 4 4.646312 NA NA 4.634049 4.631520 -0.0147921680 NA
## 5 4.631520 NA NA 4.646312 4.627616 -0.0039035865 NA
## 6 4.627616 NA NA 4.631520 4.635214 0.0075979325 NA
## 7 4.635214 0.006223043 4.635530 4.627616 4.635796 0.0005820722 0.001342466
## 8 4.635796 0.005767563 4.634798 4.635214 4.640055 0.0042582083 0.001244404
## 9 4.640055 0.006018100 4.635795 4.635796 4.645544 0.0054894923 0.001298181
## 10 4.645544 0.006957400 4.637437 4.640055 4.649665 0.0041213457 0.001500268
## Product comp_trans
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 0.02884710 0.009882852
## 8 0.02673149 0.009140087
## 9 0.02789868 0.008704563
## 10 0.03226450 0.008868343
Create meta data object :
print_with_label <- function(dframe){
stopifnot(inherits(dframe, "data.frame"))
labs <- labelVector::get_label(dframe, names(dframe))
labs <- sprintf("%s: %s", names(dframe), labs)
#print(dframe)
cat("\n")
cat(labs, sep = "\n")
}
final_drivers <-set_label(final_drivers,
raw_data_3 = "target variable",
roll_sd_dev = "Rolling standard deviation(window = 7)",
roll_mean = "Rolling mean (window = 7)",
lag_1 = "Lagging (order = 1)",
lead = "Leading (order = 1)",
diff = "Differencing (order = 1)",
Ratio = "Ration between two input drivers",
Product = "Multiplication between two input drivers"
)
print_with_label(final_drivers)
##
## raw_data_3: target variable
## roll_sd_dev: Rolling standard deviation(window = 7)
## roll_mean: Rolling mean (window = 7)
## lag_1: Lagging (order = 1)
## lead: Leading (order = 1)
## diff: Differencing (order = 1)
## Ratio: Ration between two input drivers
## Product: Multiplication between two input drivers
## comp_trans: comp_trans
These statistics need to be stored in the meta data object. The purpose of this is, we may be interested in keeping in the final model only drivers that are normally distributed, or only drivers whose correlation with the target is above a given threshold, or another combination of such criteria.
Solution
a. Normality test
Below write function to test normality and pass one driver, similary user can pass other driver and perform this test.
Density plot provides a visual judgment about whether the distribution is bell shaped.The R function shapiro.test() can be used to perform the Shapiro-Wilk test of normality for one variable (univariate).
# normality check
normality <- function(input_driver) {
print(ggdensity(input_driver,
main = "Density plot of Rolling Standard deviation",
xlab = ""))
shapiro.test(input_driver)
}
# function call
normality(final_drivers$roll_sd_dev)
##
## Shapiro-Wilk normality test
##
## data: input_driver
## W = 0.84009, p-value < 2.2e-16
From the output, the p-value < 0.05 implying that the distribution of the data are significantly different from normal distribution. In other words, we cannot assume the normality.
b. Stationarity test
Augmented Dickey-Fuller (ADF) t-statistic test to find if the series has a unit root (a series with a trend line will have a unit root and result in a large p-value). If the p-value < 0.05 then data is stationary if p-value > 0.05 then data is non-stationary.
Before forform ADF remove NA, replace NA with 0.
# Stationarity check
stationarity <- function(input_driver) {
input_driver[is.na(input_driver)] <- 0
tseries::adf.test(input_driver)
}
# function call
stationarity(final_drivers$roll_mean)
##
## Augmented Dickey-Fuller Test
##
## data: input_driver
## Dickey-Fuller = -1.7106, Lag order = 9, p-value = 0.7008
## alternative hypothesis: stationary
roll_mean is non-station due to a high p-value. To test stationarity for other drivers replace the other driver with the existing driver and run the function.
c. Correlation coefficient with the target
# Correlation coefficient
correlation <- function(input_drivers){
input_drivers[is.na(input_drivers)] <- 0
return(psych::corPlot(input_drivers, cex = 0.6))
}
correlation(final_drivers)
The above heat map and correlation matrix show, roll_mean, lag_1, and lead have positively co-relate with the target variable others are negatively co-relate.