Homework #3 - Big Data Management & Technologies

Below you will find my solutions for the 3rd homework for the class BDM&T.

By Andraz Versnik 28.5.2018

Data importing: Products and Sales data

library(dplyr)
library(magrittr)
library(tidyr)
library(ggplot2)
library(ggmap)

setwd("D:/1. Important stuff/Uni/Big data management & technologies/Homework/3 R")

#Product data
product<-read.csv(file="./DataCapture-master/SampleDataFiles/bi_product.txt", sep=";", dec=".", header=TRUE)
head(product)
##   ProductID     Product Category    Segment ManufacturerID
## 1         1 Abbas MA-01      Mix All Season              1
## 2         2 Abbas MA-02      Mix All Season              1
## 3         3 Abbas MA-03      Mix All Season              1
## 4         4 Abbas MA-04      Mix All Season              1
## 5         5 Abbas MA-05      Mix All Season              1
## 6         6 Abbas MA-06      Mix All Season              1
tally(product)
##      n
## 1 2412
#Sales data
sales<-read.csv(file="./DataCapture-master/SampleDataFiles/bi_salesFact.txt", sep=";", dec=".", header=TRUE)
head(sales)
##   ProductID   zip Units  Revenue  Datekey        city state latitude
## 1       706 75754     1 283.4475 20100101 Ben Wheeler    TX 32.41245
## 2      1730 32317     1 786.9750 20100101 Tallahassee    FL 30.47935
## 3      1789 76463     1 944.9475 20100101      Mingus    TX 32.49323
## 4      1826 46158     1 220.4475 20100101 Mooresville    IN 39.58855
## 5      2332 28128     1 534.9750 20100101     Norwood    NC 35.22883
## 6       531 80111     1 629.7375 20100101   Englewood    CO 39.61043
##    longitude
## 1  -95.66711
## 2 -84.346204
## 3  -98.41805
## 4  -86.37431
## 5  -80.15214
## 6 -104.88139
tally(sales)
##        n
## 1 184326

Number of units sold

a<-sales%>%filter(Units>=1)%>%
  summarise(Units_Sold=n())

print(head(a))
##   Units_Sold
## 1     184326

Sum of income for every product category

b<-full_join(sales, product, by="ProductID")%>%
  select(Category,Revenue)%>%
  group_by(Category)%>%
  summarize(CategoryRevenue=sum(Revenue, na.rm=TRUE))

print(head(b))
## # A tibble: 4 x 2
##   Category CategoryRevenue
##   <fct>              <dbl>
## 1 Mix             3924610.
## 2 Rural          12973642.
## 3 Urban          88820272.
## 4 Youth           1864729.

Mean and median revenue of each category

c<-full_join(sales, product, by="ProductID")%>%
  select(Category,Revenue)%>%
  group_by(Category)%>%
  summarize(Average=mean(Revenue, na.rm=TRUE))
  
d<-full_join(sales, product, by="ProductID")%>%
  select(Category,Revenue)%>%
  group_by(Category)%>%
  summarize(Median=median(Revenue, na.rm=TRUE))

cd<-full_join(c,d, by="Category")

print(cd)
## # A tibble: 4 x 3
##   Category Average Median
##   <fct>      <dbl>  <dbl>
## 1 Mix         401.   315.
## 2 Rural       268.   231.
## 3 Urban       763.   693.
## 4 Youth       194.   157.

Skewness of each category

As the mean (or average) is larger than the median in every single category, it meabs that all distributions are positively skewed.To demonstrate this, below you will find the vertical boxplots of the four categories (whereby upward skewed distribution = positive skewed distribution).

library(gridExtra)

d<-full_join(sales, product, by="ProductID")

skewness<-ggplot(d,aes(x=Category,y=Revenue))+
  geom_point()+
  geom_boxplot()+
  coord_cartesian(xlim=NULL, ylim=c(125,960), expand=TRUE)

grid.arrange(skewness, ncol=1, nrow=1)

Sales across category and segment

e<-full_join(sales, product, by="ProductID")%>%
  select(ProductID,Category,Segment)%>%
  group_by(Category,Segment)%>%
  summarize(Sales=n())%>%
  arrange(Segment,Category)%>%
  spread(Category,Sales)

e[is.na(e)]<-0
print(e)
## # A tibble: 8 x 5
##   Segment        Mix Rural Urban Youth
##   <fct>        <dbl> <dbl> <dbl> <dbl>
## 1 All Season    9185     0     0     0
## 2 Convenience      0     0 46197     0
## 3 Extreme          0     0 27984     0
## 4 Moderation       0     0 39664     0
## 5 Productivity   732 39093     0     0
## 6 Regular          0     0  3520     0
## 7 Select           0  9726     0     0
## 8 Youth            0     0     0  9701

Random 100 sum of unit sales on a US map

library(ggmap)
library(ggplot2)
library(mapproj)

mapNA <- get_map(location = 'United States of America', zoom = 4)

f <- sales%>%sample_n(100)%>%select(Revenue,longitude,latitude)

SalesUSA <- ggmap(mapNA) +
  geom_point(data = f, aes(x = as.numeric(as.character(longitude)), y = latitude), size = f$Revenue/150, alpha = .5)
  
SalesUSA