Install required libraries
###Task 1
#install packages
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(readr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(lift)
#get the data
setwd("C:/Users/Benjibex/Documents/R/lineartimeseries")
data <- read.csv("transactions.csv", header= TRUE, stringsAsFactors = FALSE)
#put the date in lubridate format
data$date = as.Date(data$date, format = "%d/%m/%y")
#explore the data
str(data)
## 'data.frame': 94248 obs. of 5 variables:
## $ date : Date, format: "2013-01-01" "2013-02-01" ...
## $ customer_id : chr "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" ...
## $ industry : int 8 8 8 8 8 8 8 8 8 8 ...
## $ location : int 9 9 9 9 9 9 9 9 9 9 ...
## $ monthly_amount: num 753851 651548 1138769 659739 770675 ...
unique(data$industry)
## [1] 8 1 9 10 2 5 4 6 7 3
unique(data$location)
## [1] 9 4 2 5 10 6 1 3 8 7
unique(data$date)
## [1] "2013-01-01" "2013-02-01" "2013-03-01" "2013-04-01" "2013-05-01"
## [6] "2013-06-01" "2013-07-01" "2013-08-01" "2013-09-01" "2013-10-01"
## [11] "2013-11-01" "2013-12-01" "2014-01-01" "2014-02-01" "2014-03-01"
## [16] "2014-04-01" "2014-05-01" "2014-06-01" "2014-07-01" "2014-08-01"
## [21] "2014-09-01" "2014-10-01" "2014-11-01" "2014-12-01" "2015-01-01"
## [26] "2015-02-01" "2015-03-01" "2015-04-01" "2015-05-01" "2015-06-01"
## [31] "2015-07-01" "2015-08-01" "2015-09-01" "2015-10-01" "2015-11-01"
## [36] "2015-12-01" "2016-01-01" "2016-02-01" "2016-03-01" "2016-04-01"
## [41] "2016-05-01" "2016-06-01" "2016-07-01" "2016-08-01" "2016-09-01"
## [46] "2016-10-01" "2016-11-01"
length(unique(data$customer_id))
## [1] 4464
summary(data)
## date customer_id industry location
## Min. :2013-01-01 Length:94248 Min. : 1.000 Min. : 1.000
## 1st Qu.:2014-05-01 Class :character 1st Qu.: 1.000 1st Qu.: 2.000
## Median :2015-06-01 Mode :character Median : 2.000 Median : 4.000
## Mean :2015-03-26 Mean : 2.455 Mean : 4.214
## 3rd Qu.:2016-03-01 3rd Qu.: 3.000 3rd Qu.: 6.000
## Max. :2016-11-01 Max. :10.000 Max. :10.000
## monthly_amount
## Min. : 0
## 1st Qu.: 95323
## Median : 179399
## Mean : 395397
## 3rd Qu.: 375439
## Max. :100000000
#add a sequential number column
data$month2 <- month(data$date) + (year(data$date) - 2013) * 12
str(data)
## 'data.frame': 94248 obs. of 6 variables:
## $ date : Date, format: "2013-01-01" "2013-02-01" ...
## $ customer_id : chr "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" "70efdf2ec9b086079795c442636b55fb" ...
## $ industry : int 8 8 8 8 8 8 8 8 8 8 ...
## $ location : int 9 9 9 9 9 9 9 9 9 9 ...
## $ monthly_amount: num 753851 651548 1138769 659739 770675 ...
## $ month2 : num 1 2 3 4 5 6 7 8 9 10 ...
#Question 1
#Create an aggregated data set of the fields date, industry and location,
#with a mean of monthly_amount.
data <- data %>%
group_by(month2, industry, location) %>%
summarise(MeanValue = mean(monthly_amount)) %>%
mutate(MeanValue)
head(data)
## Source: local data frame [6 x 4]
## Groups: month2, industry [1]
##
## month2 industry location MeanValue
## <dbl> <int> <int> <dbl>
## 1 1 1 1 136080.9
## 2 1 1 2 177840.1
## 3 1 1 3 141632.5
## 4 1 1 4 221057.7
## 5 1 1 5 178138.1
## 6 1 1 6 133400.1
#write to disk and inspect file
write.csv(file="aggregated.csv",data)
aggregated <- as.matrix(data)
As you can see in the line graph below, the months of December and January are low months for this industry and location, but then the sales more than bounce back in March-June.
data_11 <- data %>%
filter(industry==1, location==1)
ggplot(data=data_11)+
geom_point(mapping= aes(x=month2, y=MeanValue), color="blue")+
geom_line(mapping= aes(x=month2, y=MeanValue), color="red")+
xlab("Month")+
ylab("Mean Value of Monthly Spend")+
labs(title="Industry 1 Location 1")