Install required libraries

###Task 1 

#install packages 
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(readr)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(lift)

#get the data
setwd("C:/Users/Benjibex/Documents/R/lineartimeseries")
data <- read.csv("transactions.csv", header= TRUE, stringsAsFactors = FALSE)

#put the date in lubridate format
data$date = as.Date(data$date, format = "%d/%m/%y")

#explore the data
str(data)
## 'data.frame':    94248 obs. of  5 variables:
##  $ date          : Date, format: "2013-01-01" "2013-02-01" ...
##  $ customer_id   : chr  "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" ...
##  $ industry      : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ location      : int  9 9 9 9 9 9 9 9 9 9 ...
##  $ monthly_amount: num  753851 651548 1138769 659739 770675 ...
unique(data$industry)
##  [1]  8  1  9 10  2  5  4  6  7  3
unique(data$location)
##  [1]  9  4  2  5 10  6  1  3  8  7
unique(data$date)
##  [1] "2013-01-01" "2013-02-01" "2013-03-01" "2013-04-01" "2013-05-01"
##  [6] "2013-06-01" "2013-07-01" "2013-08-01" "2013-09-01" "2013-10-01"
## [11] "2013-11-01" "2013-12-01" "2014-01-01" "2014-02-01" "2014-03-01"
## [16] "2014-04-01" "2014-05-01" "2014-06-01" "2014-07-01" "2014-08-01"
## [21] "2014-09-01" "2014-10-01" "2014-11-01" "2014-12-01" "2015-01-01"
## [26] "2015-02-01" "2015-03-01" "2015-04-01" "2015-05-01" "2015-06-01"
## [31] "2015-07-01" "2015-08-01" "2015-09-01" "2015-10-01" "2015-11-01"
## [36] "2015-12-01" "2016-01-01" "2016-02-01" "2016-03-01" "2016-04-01"
## [41] "2016-05-01" "2016-06-01" "2016-07-01" "2016-08-01" "2016-09-01"
## [46] "2016-10-01" "2016-11-01"
length(unique(data$customer_id))
## [1] 4464
summary(data)
##       date            customer_id           industry         location     
##  Min.   :2013-01-01   Length:94248       Min.   : 1.000   Min.   : 1.000  
##  1st Qu.:2014-05-01   Class :character   1st Qu.: 1.000   1st Qu.: 2.000  
##  Median :2015-06-01   Mode  :character   Median : 2.000   Median : 4.000  
##  Mean   :2015-03-26                      Mean   : 2.455   Mean   : 4.214  
##  3rd Qu.:2016-03-01                      3rd Qu.: 3.000   3rd Qu.: 6.000  
##  Max.   :2016-11-01                      Max.   :10.000   Max.   :10.000  
##  monthly_amount     
##  Min.   :        0  
##  1st Qu.:    95323  
##  Median :   179399  
##  Mean   :   395397  
##  3rd Qu.:   375439  
##  Max.   :100000000
#add a sequential number column
data$month2  <- month(data$date) + (year(data$date) - 2013) * 12


str(data)
## 'data.frame':    94248 obs. of  6 variables:
##  $ date          : Date, format: "2013-01-01" "2013-02-01" ...
##  $ customer_id   : chr  "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" ...
##  $ industry      : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ location      : int  9 9 9 9 9 9 9 9 9 9 ...
##  $ monthly_amount: num  753851 651548 1138769 659739 770675 ...
##  $ month2        : num  1 2 3 4 5 6 7 8 9 10 ...

Task 1

Create an aggregated data set of the fields date, industry and location,

with a mean of monthly_amount.

#Question 1 
#Create an aggregated data set of the fields date, industry and location, 
#with a mean of monthly_amount.

data <- data %>% 
  group_by(month2, industry, location) %>% 
  summarise(MeanValue = mean(monthly_amount)) %>%
  mutate(MeanValue)
head(data)
## Source: local data frame [6 x 4]
## Groups: month2, industry [1]
## 
##   month2 industry location MeanValue
##    <dbl>    <int>    <int>     <dbl>
## 1      1        1        1  136080.9
## 2      1        1        2  177840.1
## 3      1        1        3  141632.5
## 4      1        1        4  221057.7
## 5      1        1        5  178138.1
## 6      1        1        6  133400.1
#write to disk and inspect file
write.csv(file="aggregated.csv",data)

aggregated <- as.matrix(data)

Task 2

Create a line plot of the variable monthly_amount for industry = 1 and

location = 1

Note the seasonality by month in this time series.

As you can see in the line graph below, the months of December and January are low months for this industry and location, but then the sales more than bounce back in March-June.

data_11 <- data %>%
  filter(industry==1, location==1)


  ggplot(data=data_11)+
    geom_point(mapping= aes(x=month2, y=MeanValue), color="blue")+
  geom_line(mapping= aes(x=month2, y=MeanValue), color="red")+
    xlab("Month")+
    ylab("Mean Value of Monthly Spend")+
  labs(title="Industry 1 Location 1")