Below you will find my solutions for the 3rd homework for the class BDM&T.
By Andraz Versnik 28.5.2018
library(dplyr)
library(magrittr)
library(tidyr)
library(ggplot2)
library(ggmap)
setwd("D:/1. Important stuff/Uni/Big data management & technologies/Homework/3 R")
#Product data
product<-read.csv(file="./DataCapture-master/SampleDataFiles/bi_product.txt", sep=";", dec=".", header=TRUE)
head(product)
## ProductID Product Category Segment ManufacturerID
## 1 1 Abbas MA-01 Mix All Season 1
## 2 2 Abbas MA-02 Mix All Season 1
## 3 3 Abbas MA-03 Mix All Season 1
## 4 4 Abbas MA-04 Mix All Season 1
## 5 5 Abbas MA-05 Mix All Season 1
## 6 6 Abbas MA-06 Mix All Season 1
tally(product)
## n
## 1 2412
#Sales data
sales<-read.csv(file="./DataCapture-master/SampleDataFiles/bi_salesFact.txt", sep=";", dec=".", header=TRUE)
head(sales)
## ProductID zip Units Revenue Datekey city state latitude
## 1 706 75754 1 283.4475 20100101 Ben Wheeler TX 32.41245
## 2 1730 32317 1 786.9750 20100101 Tallahassee FL 30.47935
## 3 1789 76463 1 944.9475 20100101 Mingus TX 32.49323
## 4 1826 46158 1 220.4475 20100101 Mooresville IN 39.58855
## 5 2332 28128 1 534.9750 20100101 Norwood NC 35.22883
## 6 531 80111 1 629.7375 20100101 Englewood CO 39.61043
## longitude
## 1 -95.66711
## 2 -84.346204
## 3 -98.41805
## 4 -86.37431
## 5 -80.15214
## 6 -104.88139
tally(sales)
## n
## 1 184326
a<-sales%>%filter(Units>=1)%>%
summarise(Units_Sold=n())
print(head(a))
## Units_Sold
## 1 184326
b<-full_join(sales, product, by="ProductID")%>%
select(Category,Revenue)%>%
group_by(Category)%>%
summarize(CategoryRevenue=sum(Revenue, na.rm=TRUE))
print(head(b))
## # A tibble: 4 x 2
## Category CategoryRevenue
## <fct> <dbl>
## 1 Mix 3924610.
## 2 Rural 12973642.
## 3 Urban 88820272.
## 4 Youth 1864729.
c<-full_join(sales, product, by="ProductID")%>%
select(Category,Revenue)%>%
group_by(Category)%>%
summarize(Average=mean(Revenue, na.rm=TRUE))
d<-full_join(sales, product, by="ProductID")%>%
select(Category,Revenue)%>%
group_by(Category)%>%
summarize(Median=median(Revenue, na.rm=TRUE))
cd<-full_join(c,d, by="Category")
print(cd)
## # A tibble: 4 x 3
## Category Average Median
## <fct> <dbl> <dbl>
## 1 Mix 401. 315.
## 2 Rural 268. 231.
## 3 Urban 763. 693.
## 4 Youth 194. 157.
As the mean (or average) is larger than the median in every single category, it meabs that all distributions are positively skewed.To demonstrate this, below you will find the vertical boxplots of the four categories (whereby upward skewed distribution = positive skewed distribution).
library(gridExtra)
d<-full_join(sales, product, by="ProductID")
skewness<-ggplot(d,aes(x=Category,y=Revenue))+
geom_point()+
geom_boxplot()+
coord_cartesian(xlim=NULL, ylim=c(125,960), expand=TRUE)
grid.arrange(skewness, ncol=1, nrow=1)
e<-full_join(sales, product, by="ProductID")%>%
select(ProductID,Category,Segment)%>%
group_by(Category,Segment)%>%
summarize(Sales=n())%>%
arrange(Segment,Category)%>%
spread(Category,Sales)
e[is.na(e)]<-0
print(e)
## # A tibble: 8 x 5
## Segment Mix Rural Urban Youth
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 All Season 9185 0 0 0
## 2 Convenience 0 0 46197 0
## 3 Extreme 0 0 27984 0
## 4 Moderation 0 0 39664 0
## 5 Productivity 732 39093 0 0
## 6 Regular 0 0 3520 0
## 7 Select 0 9726 0 0
## 8 Youth 0 0 0 9701
library(ggmap)
library(ggplot2)
library(mapproj)
mapNA <- get_map(location = 'United States of America', zoom = 4)
f <- sales%>%sample_n(100)%>%select(Revenue,longitude,latitude)
SalesUSA <- ggmap(mapNA) +
geom_point(data = f, aes(x = as.numeric(as.character(longitude)), y = latitude), size = f$Revenue/150, alpha = .5)
SalesUSA