# Import dataset. Create a function as we may want to do this again later for a clean import.
read_and_clean_data <- function(datasource, hasHeader=T) {
dataSet = read.csv(datasource, header = hasHeader)
#Clean data columns to match data dictionary
dataSet$date = as.Date(dataSet$date, format = "%d/%m/%y")
dataSet$customer_id = as.character(dataSet$customer_id, format = "")
return(dataSet)
}
transactions <- read_and_clean_data("transactions.csv", T)
#View the data's structure to confirm
str(transactions)
## 'data.frame': 94248 obs. of 5 variables:
## $ date : Date, format: "2013-01-01" "2013-02-01" ...
## $ customer_id : chr "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" ...
## $ industry : int 8 8 8 8 8 8 8 8 8 8 ...
## $ location : int 9 9 9 9 9 9 9 9 9 9 ...
## $ monthly_amount: num 753851 651548 1138769 659739 770675 ...
par(mfrow=c(1,2))
hist(transactions$industry, main = "Trans by Industry", xlab="Industry", ylab="Number of transactions", xlim = c(0,10), ylim=c(0,50000), las=0)
hist(transactions$location, main = "Trans by Location", xlab="Location", ylab="Number of transactions", xlim = c(0,10), ylim=c(0,50000), las=0)
Let’s visualise a scatterplot showing industry against monthly amount over time
ggplot(transactions, aes(x=date, y=monthly_amount, color=industry)) +
geom_point() + geom_smooth(se = FALSE, color = "red") + labs(title="Industry and monthly amount over time", x="Date", y="Monthly Amount") + scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Let’s visualise a scatterplot showing location variables against monthly amount over time
ggplot(transactions, aes(x=date, y=monthly_amount, color=location)) +
geom_point() + geom_smooth(se = FALSE, color = "red") + labs(title="Location and monthly amount over time", x="Date", y="Monthly Amount") + scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
It makes sense to explore where and why the monthly amounts are higher than average, using the ‘mean’ as the indicator of average.
#Mutate the dataset to create a new column called 'monthly_amount_above_mean'
transactions_with_mean_boolean <- transactions %>%
# creating a new variable to classify transaction column
mutate(monthly_amount_above_mean = ifelse(monthly_amount > mean(monthly_amount), TRUE, FALSE))
Plot a bar chart by monthly amounts by date
qplot(x = date, fill = monthly_amount_above_mean, data = transactions_with_mean_boolean, geom = "bar", main = "Number of transactions by month, showing proportion above the mean")
This reveals most monthly amounts are below the mean and the monthly amounts grow steadily rather than steeply. Let’s now look at it by industry and location, too.
Plot a bar chart by monthly amounts by location.
qplot(x = location, fill = monthly_amount_above_mean, data = transactions_with_mean_boolean, geom = "bar", main = "Number of transactions by location, showing proportion above the mean")
Plot a bar chart by monthly amounts by industry
qplot(x = industry, fill = monthly_amount_above_mean, data = transactions_with_mean_boolean, geom = "bar", main = "Number of transactions by industry, showing proportion above the mean")
The goal will be to build a model that can predict future monthly_amounts.
We will need to add a new variable called “time_number”, which is an integer describing the date order of rows in the dataset. For example: January, 2013 becomes 1, February 2013 becomes 2 etc. We will then do a rough 70:30 test:train data split, ensuring that the lower values seen across December are represented in our testing set.
Aggregate the data, grouping by date, industry and location, and calculating the mean monthly_amount
# We may want to do this again, so write a function
aggregate_transactions <- function(df) {
# Aggregate the data, grouping by date, industry and location, and calculating the mean monthly_amount
output = df %>%
group_by(date, industry, location) %>%
summarize(monthly_amount = mean(monthly_amount, na.rm = TRUE))
# Let's also create a column for the month number and another one for year number
output = output %>%
mutate(month_number = format(as.Date(date), "%m")) %>%
mutate(year_number = format(as.Date(date), "%Y"))
# Make sure the new columns are of the correct type
output$month_number = as.integer(output$month_number)
output$year_number = as.integer(output$year_number)
transform(output, month_number = as.integer(month_number), year_number = as.integer(year_number))
return(output)
}
aggregated_transactions <- aggregate_transactions(transactions)
aggregated_transactions
ggplot(data=filter(aggregated_transactions, industry == 1 & location == 1), aes(x=date, y=monthly_amount, group=1)) + geom_line(color="red") + geom_point() + labs(title="Line plot of mean monthly amount for Industry 1 and Location 1", x="Date", y="Mean Monthly Amount")
We want to explore different linear regression models, with monthly_amount always the target, but trialling different predictors.
As industry and location are the same for all entries, they will not be used as predictors.
#Filter our aggregated data set to just entries with industry and location values of 1
tempSet <- aggregated_transactions[aggregated_transactions$industry == 1 & aggregated_transactions$location == 1, ]
# Want to make sure we capture a couple of months from the preceeding year to capture the seasonal data in our testing set
testingNum = 14
# The test/train split should ensure training data has 'trainingMultiplier' times the number of entries of tesing entries
# * 1.6 was the minimum size based on trials that produced the same RMSE compared to higher sampling.
# * A value of 4 or above returned no results due to insufficient rows. 1.5 or below returned warnings about rank-deficient fits
# * Between 1.6 and 3 provided the same RMSE error values
# * However, when trainingMultiplier was >=1.6 or <=2, results included one extra location/industry combination.
# * The lowest number was chosen that did not change RMSE values, to maximise chance of calculation on smaller sample sizes without compromising predictions.
trainingMultiplier = 1.6
#Arrange dataset by dateand add a time_number value to represent date order
tempSet$time_number = c(1:nrow(arrange(tempSet, date)))
#Just to make sure, let's order by time_number
arrange(tempSet, time_number)
#Training number is the number of rows minus the testingNum, we arrange the dataSet again, just to make sure
trainingNum = nrow(arrange(tempSet, time_number)) - testingNum
#Training set is all rows from the start minus the number in the test set, we arrange the dataSet again, just to make sure
trainingSet = head(arrange(tempSet, time_number), trainingNum)
#Testing set is the last 'testingNum' rows, we arrange the dataSet again, just to make sure
testingSet = tail(arrange(tempSet, time_number), testingNum)
Run a linear model with the training set, with monthly_amount as target and month_number as the predictor
month_number.lm <- lm(monthly_amount~month_number, data=trainingSet)
summary(month_number.lm)
##
## Call:
## lm(formula = monthly_amount ~ month_number, data = trainingSet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21904.9 -7126.0 162.6 8778.7 17485.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 156947.7 3836.2 40.913 <2e-16 ***
## month_number 629.8 553.5 1.138 0.264
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10510 on 31 degrees of freedom
## Multiple R-squared: 0.04009, Adjusted R-squared: 0.009127
## F-statistic: 1.295 on 1 and 31 DF, p-value: 0.2639
The summary statistics let us see the size and significance of the fit for this model, which looks unlikely to be significant or a good fit. The p-value of 0.2639 reflects this. The Adjusted R-Squared of 0.009127 further echoes this observation.
Residuals represent a lack of fit of a line to a model - which can be seen as the model’s error - and you can see the minimum to maximum residuals range from -21904.9 to 17485.4.
ggplot(data = trainingSet, mapping = aes(x = month_number, y = monthly_amount)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") + labs(title="The linear relationship between monthly amount and month number", x="Month Number", y="Monthly Amount")
It’s not the greatest fit, but there is a trend.
If we plot monthly_amount ~ month_number linear model using statistical diagnostic plots, we can get a better view of the sign, size and significance of the fit and further confirm that month number is not likely to be the best predictor of monthly sales amounts.
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(month_number.lm)
Now let’s run a different linear model with monthly_amount as target and time_number as the predictor to see if it fits better.
time_number.lm <- lm(monthly_amount~time_number, data=trainingSet)
summary(time_number.lm)
##
## Call:
## lm(formula = monthly_amount ~ time_number, data = trainingSet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17279 -6992 1486 6303 16923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 150333.8 3159.5 47.581 < 2e-16 ***
## time_number 614.7 162.2 3.791 0.000651 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8870 on 31 degrees of freedom
## Multiple R-squared: 0.3168, Adjusted R-squared: 0.2947
## F-statistic: 14.37 on 1 and 31 DF, p-value: 0.0006507
Let’s try to see if there is a better linear relationship between monthly amount and time_number
ggplot(data = trainingSet, mapping = aes(x = time_number, y = monthly_amount)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") + labs(title="A stronger trend ", x="Date", y="Monthly Amount")
plot(time_number.lm)
Finally, let’s run a model with monthly_amount as target and month number + time_number as the predictors to see if month_number and time_number work better in tandem.
month_number_and_time_number.lm <- lm(monthly_amount~month_number + time_number, data=trainingSet)
summary(month_number_and_time_number.lm)
##
## Call:
## lm(formula = monthly_amount ~ month_number + time_number, data = trainingSet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17139 -6451 1404 6149 17038
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 148686.3 3998.7 37.183 <2e-16 ***
## month_number 326.5 478.6 0.682 0.5003
## time_number 594.6 166.2 3.578 0.0012 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8947 on 30 degrees of freedom
## Multiple R-squared: 0.3272, Adjusted R-squared: 0.2823
## F-statistic: 7.295 on 2 and 30 DF, p-value: 0.00262
The Adjusted R-squared of 0.2823 is lower than our second model, and the p-value is higher.
The monthly_amount ~ time_number model was the best of the training data.
Based on the knowledge gained from the assessment above, a function was written which trains a linear regession model, another function calculates the RMSE (root mean squared error) and determines predictions for all rows. If ‘forecast’ is set to true, it will call the make_forecasts function and return forecasts for December 2016. Otherwise, it will return predictions (if forecast = false) for existing data.
# Train a linear regression model based on the data frame entered and the number of rows in the testing set
# Parameters
# df = data frame of dataset to test
# testingNum = Number of items in the testing set
# Returns a trained linear regression model
train_model <- function(df, testingNum) {
# Arrange dataset by date
arrange(df, date)
# Add a number to represent date order
df$time_number = c(1:nrow(df))
if(testingNum == 0) {
trainingNum = nrow(df)
} else {
# Training number is the number of rows minus the testingNum we have set
trainingNum = nrow(df) - testingNum
}
# Training set is all rows from the start minus the number in the test set. We'll arrange again, just in case.
trainingSet = head(arrange(df, time_number), trainingNum)
# Testing set is the last 'testingNum' rows. We'll arrange again, just in case.
testingSet = tail(arrange(df, time_number), testingNum)
# Train the model we selected earlier as the best, using the trainingSet
training.model = lm(monthly_amount~time_number, data=trainingSet)
return(training.model)
}
# Parameters
# df = data frame of dataset to test
# testingNum = NUmber of items in the testing set
# ind = Industry
# loc = Location
# setToTest = which data set to run predictions on. Can be 'test', 'train', or 'full' (i.e. the whole data set)
# forecast - whether to return forecast data (TRUE), or predictions on existing data (FALSE)
# Returns a data frame of predictions or forecasts
make_predictions <- function(df, testingNum, ind, loc, setToTest = "test", forecast = FALSE) {
output = data.frame()
# Arrange dataset by date
arrange(df, date)
# Add a number to represent date order
df$time_number = c(1:nrow(df))
training.model = train_model(df, testingNum)
#Determine the data set we want to make predictions on
if (setToTest == "train") {
dataSet = trainingSet
} else if (setToTest == "test") {
dataSet = testingSet
} else {
dataSet = df
}
# Output a prediction based on the dataSet
prediction = predict(training.model, dataSet)
output <- data.frame(industry = dataSet$industry,
location = dataSet$location,
month_number = dataSet$month_number,
year_number = dataSet$year_number,
time_number = dataSet$time_number,
actual = dataSet$monthly_amount,
prediction)
#Calculate the error for each prediction
output$error = with(output, prediction-actual)
#Calculate the root mean standard error
rmse = with(output, sqrt(mean(error^2)))
output$rmse = rmse
if (forecast) {
# Create a dataframe containing just the December 2016 data
december_2016 = data.frame(industry=ind,
location=loc,
monthly_amount=0,
month_number=12,
year_number=2016,
time_number=(nrow(dataSet)+1))
output = forecast_predictions(december_2016, training.model, rmse, ind, loc)
}
return(output)
}
#Forecasting December 2016 predictions is achieved by creating a data set of 1 row (for december 2016). Calculating the appropriate 'time_number value' and giving the row a monthly_amount value of 0. A training model created against training data is then used to predict the December 2016 monthly amount.
# Parameters
# df - dataframe of the whole data set used (i.e. for industry X and location Y)
# trainingModel - trainingModel created by 'make_predictions' function
# rmse - Root mean squared error calculated for the model predictions by 'make_predictions'
# industry - Industry number
# location - Location number
# Returns a data frame of December 2016 prediction
forecast_predictions <- function(df, trainingModel, rmse, industry, location) {
output = data.frame()
#Forecast a prediction using the training model and the December 2016 data
prediction_2016 = predict(trainingModel, df)
#Replace the december 2016 data frame with a new data frame
output = data.frame(industry = df$industry, location = df$location)
#Add the prediction
output$prediction = prediction_2016
#add the rmse calculated by the model
output$rmse = rmse
# Return the December 2016 prediction data frame
return(output)
}
# Loops through indutry and location combinations, makes predictions (or forecasts) and return the data frame with this data
# Parameters
# df = data frame of dataset to test
# industries = List of industries
# locations = List of locations
# setToTest = which data set to run predictions on. Can be 'test', 'train', or 'full' (i.e. the whole data set)
# forecast - whether to return forecast data (TRUE), or predictions on existing data (FALSE)
# Returns a data frame of predictions or forecasts
return_predictions <- function(df, industries, locations, setToTest = "full", forecast = FALSE) {
output = data.frame()
# Want to make sure we capture a couple of months from the preceeding year to capture the seasonal data
testingNum = 14
# The test/train split should ensure training data has 'trainingMultiplier' times the number of entries of tesing entries.
# The rationale for this exact value was described earlier
trainingMultiplier = 1.6
for (ind in industries) {
for (loc in locations) {
# Create a subset of the data
temp = df[df$industry == ind & df$location == loc, ]
# Check to make sure you have at least 'trainingMultiplier' times the number of training rows than testing rows
if (nrow(temp) >= ceiling(trainingMultiplier*testingNum)) {
predictionDataSet = make_predictions(temp, testingNum, ind, loc, setToTest, forecast)
} else {
# Append entry to output data frame when not enough data to compute
predictionDataSet = c(ind,loc,NA,NA)
}
# Add the row to the output dataframe
output = rbind(output, predictionDataSet)
}
}
#Return the output
return(output)
}
#Filter aggregated_transactions by location 1 and industry 1
loc1_ind1_df <- aggregated_transactions[aggregated_transactions$industry == 1 & aggregated_transactions$location == 1, ]
prediction_for_industry1_location1 = make_predictions(loc1_ind1_df, testingNum, 1, 1, "test", forecast = TRUE)
prediction_for_industry1_location1
model_for_industry1_location1 = train_model(loc1_ind1_df, 14)
summary(model_for_industry1_location1)
##
## Call:
## lm(formula = monthly_amount ~ time_number, data = trainingSet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17279 -6992 1486 6303 16923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 150333.8 3159.5 47.581 < 2e-16 ***
## time_number 614.7 162.2 3.791 0.000651 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8870 on 31 degrees of freedom
## Multiple R-squared: 0.3168, Adjusted R-squared: 0.2947
## F-statistic: 14.37 on 1 and 31 DF, p-value: 0.0006507
# Get the list of unique industries, sorted in numerical order
industries <- sort(unique(aggregated_transactions$industry))
# Get the list of unique locations, sorted in numerical order
locations <- sort(unique(aggregated_transactions$location))
# Calculate the predictions for all industry and location pairs on training data
prediction_for_all_industry_location <- return_predictions(aggregated_transactions, industries, locations, setToTest = "test", forecast = FALSE)
# Order by RMSE
prediction_for_all_industry_location
#Aggregate the transactions data
aggregated_transactions = aggregate_transactions(transactions)
# Get the list of unique industries, sorted in numerical order
industries <- sort(unique(aggregated_transactions$industry))
# Get the list of unique locations, sorted in numerical order
locations <- sort(unique(aggregated_transactions$location))
output = data.frame()
#Include some months from previous year (outlined previously)
testingNum = 14
#Ensure enough training data (outline previously)
trainingMultiplier = 1.6
for (ind in industries) {
for (loc in locations) {
temp = aggregated_transactions[aggregated_transactions$industry == ind & aggregated_transactions$location == loc, ]
#Make sure we have enough training data
if (nrow(temp) >= ceiling(trainingMultiplier*testingNum)) {
# Arrange dataset by date
arrange(temp, date)
# Add a number to represent date order
temp$time_number = c(1:nrow(temp))
# Training number is the number of rows minus the testingNum we have set
trainingNum = nrow(temp) - testingNum
# Training set is all rows from the start minus the number in the test set. We'll arrange again, just in case.
trainingSet = head(arrange(temp, time_number), trainingNum)
# Testing set is the last 'testingNum' rows. We'll arrange again, just in case.
testingSet = tail(arrange(temp, time_number), testingNum)
# Train the model we selected earlier as the best, using the trainingSet
model = lm(monthly_amount~time_number, data=trainingSet)
# Output a prediction based on the dataSet
training.prediction = predict(model, trainingSet)
testing.prediction = predict(model, testingSet)
trainingSet$prediction = training.prediction
testingSet$prediction = testing.prediction
trainingSet$error = with(trainingSet, prediction-monthly_amount)
testingSet$error = with(testingSet, prediction-monthly_amount)
trainingSet.rmse = with(trainingSet, sqrt(mean(error^2)))
testingSet.rmse = with(testingSet, sqrt(mean(error^2)))
tempOutput = c(ind, loc, trainingSet.rmse, testingSet.rmse)
} else {
tempOutput = c(ind, loc, NA, NA)
}
output = rbind(output, tempOutput)
}
}
colnames(output) <- c("Industry", "Location", "Train RMSE", "Test RMSE")
output