# Import dataset. Create a function as we may want to do this again later for a clean import.

read_and_clean_data <- function(datasource, hasHeader=T) {
  dataSet = read.csv(datasource, header = hasHeader)

  #Clean data columns to match data dictionary
  dataSet$date = as.Date(dataSet$date, format = "%d/%m/%y")
  dataSet$customer_id = as.character(dataSet$customer_id)
  
  return(dataSet)
}

transactions <- read_and_clean_data("transactions.csv", T)

#View the data's structure to confirm
str(transactions)
## 'data.frame':    94248 obs. of  5 variables:
##  $ date          : Date, format: "2013-01-01" "2013-02-01" ...
##  $ customer_id   : chr  "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" ...
##  $ industry      : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ location      : int  9 9 9 9 9 9 9 9 9 9 ...
##  $ monthly_amount: num  753851 651548 1138769 659739 770675 ...
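
Before moving on to the EDA, it is worth a quick completeness check (an optional sketch; the exact counts depend on the data):

#Count missing values per column and confirm the date range covered
colSums(is.na(transactions))
range(transactions$date)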

1. a & b. Description of 2 insights from EDA

par(mfrow=c(1,2))
hist(transactions$industry, main = "Trans by Industry", xlab="Industry", ylab="Number of transactions", xlim = c(0,10), ylim=c(0,50000), las=0)
hist(transactions$location, main = "Trans by Location", xlab="Location", ylab="Number of transactions", xlim = c(0,10), ylim=c(0,50000), las=0)

Let’s visualise a scatterplot of monthly amount over time, coloured by industry

ggplot(transactions, aes(x=date, y=monthly_amount, color=industry)) + 
  geom_point() + geom_smooth(se = FALSE, color = "red") + labs(title="Industry and monthly amount over time", x="Date", y="Monthly Amount") + scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Let’s visualise a scatterplot of monthly amount over time, coloured by location

ggplot(transactions, aes(x=date, y=monthly_amount, color=location)) + 
  geom_point() + geom_smooth(se = FALSE, color = "red") + labs(title="Location and monthly amount over time", x="Date", y="Monthly Amount") + scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

What about exploring higher-than-average sales values in more detail?

It makes sense to explore where and when the monthly amounts are higher than average, using the mean as the measure of ‘average’.
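As a rough baseline (an optional sketch; mean_amount is just a scratch variable for this check), we can compute the overall mean and the share of transactions above it directly:

#Overall mean monthly amount, and the proportion of transactions above it
mean_amount <- mean(transactions$monthly_amount)
mean_amount
mean(transactions$monthly_amount > mean_amount)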

#Mutate the dataset to create a new column called 'monthly_amount_above_mean'  
transactions_with_mean_boolean <- transactions %>%
  # creating a new logical variable flagging transactions above the overall mean
  mutate(monthly_amount_above_mean = monthly_amount > mean(monthly_amount))

Plot a bar chart of transaction counts by date, coloured by whether the monthly amount is above the mean

qplot(x = date, fill = monthly_amount_above_mean, data = transactions_with_mean_boolean, geom = "bar", main = "Number of transactions by month, showing proportion above the mean")

This reveals that most monthly amounts sit below the mean and that monthly amounts grow steadily rather than steeply over time. Let’s now look at it by industry and location, too.
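One quick way to check the “steady rather than steep” observation (an optional sketch using the flag created above) is to tabulate the above-mean flag by year:

#Count transactions above and below the mean for each year
table(format(transactions_with_mean_boolean$date, "%Y"),
      transactions_with_mean_boolean$monthly_amount_above_mean)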

Plot a bar chart of transaction counts by location, coloured by whether the monthly amount is above the mean.

qplot(x = location, fill = monthly_amount_above_mean, data = transactions_with_mean_boolean, geom = "bar", main = "Number of transactions by location, showing proportion above the mean")

So what about by industry?

Plot a bar chart of transaction counts by industry, coloured by whether the monthly amount is above the mean.

qplot(x = industry, fill = monthly_amount_above_mean, data = transactions_with_mean_boolean, geom = "bar", main = "Number of transactions by industry, showing proportion above the mean")

2. Basic model fitting

2. a. Creating the model

The goal will be to build a model that can predict future monthly_amounts.

We will need to add a new variable called “time_number”, which is an integer describing the date order of rows in the dataset. For example, January 2013 becomes 1, February 2013 becomes 2, and so on. We will then do a rough 70:30 train:test data split, ensuring that the lower values seen across December are represented in our testing set.
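For illustration only (a hypothetical mini example, not part of the pipeline), the date-to-time_number mapping is simply the position of each date within the sorted unique dates:

#Hypothetical mini example: January 2013 -> 1, February 2013 -> 2, March 2013 -> 3
example_dates <- as.Date(c("2013-01-01", "2013-02-01", "2013-03-01"))
match(example_dates, sort(unique(example_dates)))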

Aggregate the data, grouping by date, industry and location, and calculating the mean monthly_amount

# We may want to do this again, so write a function
aggregate_transactions <- function(df) {
    
  # Aggregate the data, grouping by date, industry and location, and calculating the mean monthly_amount
  output = df %>%
    group_by(date, industry, location) %>%
    summarize(monthly_amount = mean(monthly_amount, na.rm = TRUE))
  
  # Let's also create a column for the month number and another one for year number
  output = output %>%
    mutate(month_number = format(as.Date(date), "%m")) %>%
    mutate(year_number = format(as.Date(date), "%Y"))
  
  # Make sure the new columns are of the correct type
  output$month_number = as.integer(output$month_number)
  output$year_number = as.integer(output$year_number)
  
  return(output)

}

aggregated_transactions <- aggregate_transactions(transactions)

aggregated_transactions

2. a. ii) Create a line plot of the variable monthly_amount for industry = 1 and location = 1

ggplot(data=filter(aggregated_transactions, industry == 1 & location == 1), aes(x=date, y=monthly_amount, group=1)) + geom_line(color="red") + geom_point() + labs(title="Line plot of mean monthly amount for Industry 1 and Location 1", x="Date", y="Mean Monthly Amount")

We want to explore different linear regression models, with monthly_amount always the target, but trialling different predictors.

As industry and location are constant across all entries in this filtered subset, they will not be used as predictors.

#Filter our aggregated data set to just entries with industry and location values of 1
tempSet <- aggregated_transactions[aggregated_transactions$industry == 1 & aggregated_transactions$location == 1, ]
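
# Optional sanity check (an added sketch): within this subset, industry and location should be constant
stopifnot(length(unique(tempSet$industry)) == 1,
          length(unique(tempSet$location)) == 1)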

# We want to capture a couple of months from the preceding year so the seasonal data appears in our testing set
testingNum = 14

# The train/test split should ensure the training data has at least 'trainingMultiplier' times the number of testing entries
# * 1.6 was the minimum value found in trials that produced the same RMSE as higher multipliers.
# * A value of 4 or above returned no results due to insufficient rows; 1.5 or below returned warnings about rank-deficient fits.
# * Values between 1.6 and 3 produced the same RMSE values.
# * However, when trainingMultiplier was between 1.6 and 2, results included one extra location/industry combination.
# * The lowest value that did not change the RMSE values was chosen, to maximise the chance of fitting on smaller subsets without compromising predictions.
trainingMultiplier = 1.6
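
# With these settings a subset needs at least ceiling(1.6 * 14) = 23 rows before a model is fitted (added check)
ceiling(trainingMultiplier * testingNum)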

#Arrange dataset by date and add a time_number value to represent date order
tempSet <- arrange(tempSet, date)
tempSet$time_number = c(1:nrow(tempSet))

#Just to make sure, let's order by time_number
tempSet <- arrange(tempSet, time_number)
#Training number is the number of rows minus the testingNum, we arrange the dataSet again, just to make sure
trainingNum = nrow(arrange(tempSet, time_number)) - testingNum

#Training set is all rows from the start minus the number in the test set, we arrange the dataSet again, just to make sure
trainingSet = head(arrange(tempSet, time_number), trainingNum)

#Testing set is the last 'testingNum' rows, we arrange the dataSet again, just to make sure
testingSet = tail(arrange(tempSet, time_number), testingNum)

Training the monthly_amount ~ month_number linear model

Run a linear model with the training set, with monthly_amount as target and month_number as the predictor

month_number.lm <- lm(monthly_amount~month_number, data=trainingSet)
summary(month_number.lm)
## 
## Call:
## lm(formula = monthly_amount ~ month_number, data = trainingSet)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21904.9  -7126.0    162.6   8778.7  17485.4 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  156947.7     3836.2  40.913   <2e-16 ***
## month_number    629.8      553.5   1.138    0.264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10510 on 31 degrees of freedom
## Multiple R-squared:  0.04009,    Adjusted R-squared:  0.009127 
## F-statistic: 1.295 on 1 and 31 DF,  p-value: 0.2639

Assessing the monthly_amount ~ month_number linear model

The summary statistics show the size and significance of the fit for this model, which does not look significant or well fitted: the p-value of 0.2639 reflects this, and the adjusted R-squared of 0.009127 echoes the same observation.

Residuals represent the lack of fit of the model to the data - in effect the model’s error - and here they range from -21904.9 to 17485.4.
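These figures can also be pulled straight out of the fitted object (an optional sketch, using month_number.lm from above):

#Extract the headline fit statistics from the model object
summary(month_number.lm)$r.squared
summary(month_number.lm)$adj.r.squared
range(residuals(month_number.lm))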

ggplot(data = trainingSet, mapping = aes(x = month_number, y = monthly_amount)) + 
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") + labs(title="The linear relationship between monthly amount and month number", x="Month Number", y="Monthly Amount")

It’s not the greatest fit, but there is a trend.

If we plot the monthly_amount ~ month_number linear model using standard statistical diagnostic plots, we can get a better view of the quality of the fit and further confirm that month_number is unlikely to be the best predictor of monthly sales amounts.

par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(month_number.lm)
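
As a numeric complement to the diagnostic plots (an optional sketch, not part of the original assessment), a Shapiro-Wilk test on the residuals gives a single figure for residual normality:

#Shapiro-Wilk test of residual normality for the month_number model
shapiro.test(residuals(month_number.lm))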

Training the monthly_amount ~ time_number linear model

Now let’s run a different linear model with monthly_amount as target and time_number as the predictor to see if it fits better.

time_number.lm <- lm(monthly_amount~time_number, data=trainingSet)

summary(time_number.lm)
## 
## Call:
## lm(formula = monthly_amount ~ time_number, data = trainingSet)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17279  -6992   1486   6303  16923 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 150333.8     3159.5  47.581  < 2e-16 ***
## time_number    614.7      162.2   3.791 0.000651 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8870 on 31 degrees of freedom
## Multiple R-squared:  0.3168, Adjusted R-squared:  0.2947 
## F-statistic: 14.37 on 1 and 31 DF,  p-value: 0.0006507

Assessing the monthly_amount ~ time_number linear model

Let’s try to see if there is a better linear relationship between monthly amount and time_number

ggplot(data = trainingSet, mapping = aes(x = time_number, y = monthly_amount)) + 
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") + labs(title="A stronger trend ", x="Date", y="Monthly Amount")

plot(time_number.lm)

Training the monthly_amount ~ month_number + time_number linear model

Finally, let’s run a model with monthly_amount as target and month number + time_number as the predictors to see if month_number and time_number work better in tandem.

month_number_and_time_number.lm <- lm(monthly_amount~month_number + time_number, data=trainingSet)

summary(month_number_and_time_number.lm)
## 
## Call:
## lm(formula = monthly_amount ~ month_number + time_number, data = trainingSet)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17139  -6451   1404   6149  17038 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  148686.3     3998.7  37.183   <2e-16 ***
## month_number    326.5      478.6   0.682   0.5003    
## time_number     594.6      166.2   3.578   0.0012 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8947 on 30 degrees of freedom
## Multiple R-squared:  0.3272, Adjusted R-squared:  0.2823 
## F-statistic: 7.295 on 2 and 30 DF,  p-value: 0.00262

Assessing the monthly_amount ~ month_number + time_number linear model on training data

The adjusted R-squared of 0.2823 is lower than that of our second model, the p-value is higher, and month_number itself is not significant (p = 0.5003).

The monthly_amount ~ time_number model therefore performed best on the training data.
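For a side-by-side view (an optional sketch, using the three model objects fitted above), the adjusted R-squared and AIC of each candidate can be tabulated:

#Compare the three candidate models on adjusted R-squared and AIC
candidates <- list(month_number   = month_number.lm,
                   time_number    = time_number.lm,
                   month_and_time = month_number_and_time_number.lm)
data.frame(adj_r_squared = sapply(candidates, function(m) summary(m)$adj.r.squared),
           AIC           = sapply(candidates, AIC))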

Functions to train our linear model and output predictions

Based on the knowledge gained from the assessment above, a function was written which trains a linear regression model; a second function calculates the RMSE (root mean squared error) and produces predictions for all rows. If ‘forecast’ is set to TRUE, it will call the forecast_predictions function and return a forecast for December 2016; otherwise (forecast = FALSE) it will return predictions for the existing data.

# Train a linear regression model based on the data frame entered and the number of rows in the testing set

# Parameters
#   df = data frame of dataset to test
#   testingNum = Number of items in the testing set

# Returns a trained linear regression model

train_model <- function(df, testingNum) {
  # Arrange dataset by date
  df <- arrange(df, date)
  
  # Add a number to represent date order
  df$time_number = c(1:nrow(df))
  
  if(testingNum == 0) {
    trainingNum = nrow(df)
  } else {
    # Training number is the number of rows minus the testingNum we have set
    trainingNum = nrow(df) - testingNum
  }
  
  # Training set is all rows from the start minus the number in the test set. We'll arrange again, just in case.
  trainingSet = head(arrange(df, time_number), trainingNum)
  
  # Testing set is the last 'testingNum' rows. We'll arrange again, just in case.
  testingSet = tail(arrange(df, time_number), testingNum)

  # Train the model we selected earlier as the best, using the trainingSet
  training.model = lm(monthly_amount~time_number, data=trainingSet)
  
  return(training.model)
  
}
# Parameters
#   df = data frame of dataset to test
#   testingNum = Number of items in the testing set
#   ind = Industry
#   loc = Location
#   setToTest = which data set to run predictions on. Can be 'test', 'train', or 'full' (i.e. the whole data set)
#   forecast - whether to return forecast data (TRUE), or predictions on existing data (FALSE)

# Returns a data frame of predictions or forecasts

make_predictions <- function(df, testingNum, ind, loc, setToTest = "test", forecast = FALSE) {
  
  output = data.frame()
  
  # Arrange dataset by date
  df <- arrange(df, date)
  
  # Add a number to represent date order
  df$time_number = c(1:nrow(df))
  
  training.model = train_model(df, testingNum)
  
  # Rebuild the train/test split locally, so the sets below do not rely on objects in the global environment
  trainingNum = if (testingNum == 0) nrow(df) else nrow(df) - testingNum
  trainingSet = head(arrange(df, time_number), trainingNum)
  testingSet = tail(arrange(df, time_number), testingNum)
  
  #Determine the data set we want to make predictions on
  if (setToTest == "train") {
    dataSet = trainingSet
  } else if (setToTest == "test") {
    dataSet = testingSet
  } else {
    dataSet = df
  }
  
  # Output a prediction based on the dataSet
  prediction = predict(training.model, dataSet)
  
  output <- data.frame(industry = dataSet$industry,
                        location = dataSet$location,
                        month_number = dataSet$month_number,
                        year_number = dataSet$year_number,
                        time_number = dataSet$time_number,
                        actual = dataSet$monthly_amount,
                        prediction)
  
  #Calculate the error for each prediction
  output$error = with(output, prediction-actual)
  
  #Calculate the root mean squared error (RMSE)
  rmse = with(output, sqrt(mean(error^2)))
  
  output$rmse = rmse
        
  if (forecast) {
    
    # Create a one-row dataframe for December 2016; its time_number is the next time step after the full series
    december_2016 = data.frame(industry=ind,
                               location=loc,
                               monthly_amount=0,
                               month_number=12,
                               year_number=2016,
                               time_number=(nrow(df)+1))
    
    output = forecast_predictions(december_2016, training.model, rmse, ind, loc)
        
  }

  return(output)
  
}
#Forecasting the December 2016 value is achieved by creating a data set of one row (for December 2016), giving it the appropriate time_number value and a monthly_amount of 0. A model trained on the training data is then used to predict the December 2016 monthly amount.

# Parameters
#   df - one-row data frame containing the December 2016 values to predict (for industry X and location Y)
#   trainingModel - trainingModel created by 'make_predictions' function
#   rmse - Root mean squared error calculated for the model predictions by 'make_predictions'
#   industry - Industry number
#   location - Location number

# Returns a data frame of December 2016 prediction

forecast_predictions <- function(df, trainingModel, rmse, industry, location) {
  
  #Forecast a prediction using the training model and the December 2016 data
  prediction_2016 = predict(trainingModel, df)
  
  #Build the output data frame for the December 2016 prediction
  output = data.frame(industry = df$industry, location = df$location)
  
  #Add the prediction
  output$prediction = prediction_2016
  
  #Add the rmse calculated by the model
  output$rmse = rmse
  
  # Return the December 2016 prediction data frame
  return(output)
  
}
# Loops through industry and location combinations, makes predictions (or forecasts) and returns a data frame with this data

# Parameters
#   df = data frame of dataset to test
#   industries = List of industries
#   locations = List of locations
#   setToTest = which data set to run predictions on. Can be 'test', 'train', or 'full' (i.e. the whole data set)
#   forecast - whether to return forecast data (TRUE), or predictions on existing data (FALSE)

# Returns a data frame of predictions or forecasts

return_predictions <- function(df, industries, locations, setToTest = "full", forecast = FALSE) {

  output = data.frame()
  
  # We want to capture a couple of months from the preceding year so the seasonal data appears in the testing set
  testingNum = 14
  
  # The train/test split should ensure the training data has at least 'trainingMultiplier' times the number of testing entries.
  # The rationale for this exact value was described earlier
  trainingMultiplier = 1.6
  
  for (ind in industries) {
    for (loc in locations) {
      
      # Create a subset of the data
      temp = df[df$industry == ind & df$location == loc, ]
      
      # Check to make sure you have at least 'trainingMultiplier' times the number of training rows than testing rows
      if (nrow(temp) >= ceiling(trainingMultiplier*testingNum)) {
        
        predictionDataSet = make_predictions(temp, testingNum, ind, loc, setToTest, forecast)
      
      } else {
        # Append entry to output data frame when not enough data to compute
        predictionDataSet = c(ind,loc,NA,NA)
      }
      
      # Add the row to the output dataframe
      output = rbind(output, predictionDataSet)
    }
  }
  
  #Return the output
  return(output)
}

Create a prediction for monthly_amount in December 2016

#Filter aggregated_transactions by location 1 and industry 1
loc1_ind1_df <- aggregated_transactions[aggregated_transactions$industry == 1 & aggregated_transactions$location == 1, ]

prediction_for_industry1_location1 = make_predictions(loc1_ind1_df, testingNum, 1, 1, "test", forecast = TRUE)

prediction_for_industry1_location1
model_for_industry1_location1 = train_model(loc1_ind1_df, 14)

summary(model_for_industry1_location1)
## 
## Call:
## lm(formula = monthly_amount ~ time_number, data = trainingSet)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17279  -6992   1486   6303  16923 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 150333.8     3159.5  47.581  < 2e-16 ***
## time_number    614.7      162.2   3.791 0.000651 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8870 on 31 degrees of freedom
## Multiple R-squared:  0.3168, Adjusted R-squared:  0.2947 
## F-statistic: 14.37 on 1 and 31 DF,  p-value: 0.0006507

3. Advanced model fitting

# Get the list of unique industries, sorted in numerical order 
industries <- sort(unique(aggregated_transactions$industry))

# Get the list of unique locations, sorted in numerical order
locations <- sort(unique(aggregated_transactions$location))

# Calculate the predictions for all industry and location pairs on the testing data
prediction_for_all_industry_location <- return_predictions(aggregated_transactions, industries, locations, setToTest = "test", forecast = FALSE)

# View the predictions
prediction_for_all_industry_location

3. b. Calculate your evaluation measure for the training data and your testing data for all models

#Aggregate the transactions data
aggregated_transactions = aggregate_transactions(transactions)

# Get the list of unique industries, sorted in numerical order 
industries <- sort(unique(aggregated_transactions$industry))

# Get the list of unique locations, sorted in numerical order
locations <- sort(unique(aggregated_transactions$location))

output = data.frame()

#Include some months from the previous year (outlined previously)
testingNum = 14
#Ensure enough training data (outlined previously)
trainingMultiplier = 1.6

for (ind in industries) {
  for (loc in locations) {
    
    temp = aggregated_transactions[aggregated_transactions$industry == ind & aggregated_transactions$location == loc, ]
    
    #Make sure we have enough training data
    if (nrow(temp) >= ceiling(trainingMultiplier*testingNum)) {
      
      # Arrange dataset by date
      temp <- arrange(temp, date)
      # Add a number to represent date order
      temp$time_number = c(1:nrow(temp))
      # Training number is the number of rows minus the testingNum we have set
      trainingNum = nrow(temp) - testingNum
      
      # Training set is all rows from the start minus the number in the test set
      trainingSet = head(arrange(temp, time_number), trainingNum)
      
      # Testing set is the last 'testingNum' rows
      testingSet = tail(arrange(temp, time_number), testingNum)
      
      # Train the model we selected earlier as the best, using the trainingSet
      model = lm(monthly_amount~time_number, data=trainingSet)
      
      # Output predictions for both the training and testing sets
      training.prediction = predict(model, trainingSet)
      testing.prediction = predict(model, testingSet)
      
      trainingSet$prediction = training.prediction
      testingSet$prediction = testing.prediction
      
      trainingSet$error = with(trainingSet, prediction-monthly_amount)
      testingSet$error = with(testingSet, prediction-monthly_amount)
      
      trainingSet.rmse = with(trainingSet, sqrt(mean(error^2)))
      testingSet.rmse = with(testingSet, sqrt(mean(error^2)))
      
      tempOutput = c(ind, loc, trainingSet.rmse, testingSet.rmse)
      
    } else {
      tempOutput = c(ind, loc, NA, NA)
    }
    
    output = rbind(output, tempOutput)
  }
}
colnames(output) <- c("Industry", "Location", "Train RMSE", "Test RMSE")

output
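
To highlight the weakest fits (an optional sketch using the table built above), the results can be ordered by test RMSE:

#Order the industry/location results by test RMSE, worst first
output[order(output[["Test RMSE"]], decreasing = TRUE), ]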