Library:

pacman::p_load(dplyr,tidyr,plyr,ggplot2,caret,reader,  RColorBrewer,corrplot,GGally,reshape, viridis, rpart, rpart.plot,plotly)

Function for removing outliers:

outliers_DF <- function(DF,flexibilidad)
{
  list_return <- list()
  for (i in 1:ncol(DF)) {
    list_outliers<- list()
    vec <- DF[,i]
    posiciones_plus<-which(vec - (flexibilidad * sd(vec)) > median(vec))
    posiciones_minus<-which(vec + (flexibilidad * sd(vec)) < median(vec))
    list_outliers[["posiciones_plus"]] <- posiciones_plus
    list_outliers[["posiciones_minus"]] <- posiciones_minus
    list_return[[names(DF)[i]]] <- list_outliers
  } 
  return <- list_return
}

Importing the data for analyzing:

set.seed("123")
setwd("C:/Users/Alfonso/Desktop/Ubiqum/Bloque_2/Task-3/Data/")
products<-read.csv("existingproductattributes2017.csv")
preducts2 <- products
productsNew<-read.csv("newproductattributes2017.csv")

Preprocessing the data

Let’s have an overview of the dataset:

names(products)
##  [1] "ProductType"           "ProductNum"           
##  [3] "Price"                 "x5StarReviews"        
##  [5] "x4StarReviews"         "x3StarReviews"        
##  [7] "x2StarReviews"         "x1StarReviews"        
##  [9] "PositiveServiceReview" "NegativeServiceReview"
## [11] "Recommendproduct"      "BestSellersRank"      
## [13] "ShippingWeight"        "ProductDepth"         
## [15] "ProductWidth"          "ProductHeight"        
## [17] "ProfitMargin"          "Volume"
summary (products$ProductType)
##      Accessories          Display ExtendedWarranty      GameConsole 
##               26                5               10                2 
##           Laptop          Netbook               PC          Printer 
##                3                2                4               12 
##  PrinterSupplies       Smartphone         Software           Tablet 
##                3                4                6                3

First, we have removed the irrelevant observations for the predictions: We have removed all the products that have sales volume of 0 and all the Extended Warranties of the dataset.

products <- products[-(which (products$ProductType == "ExtendedWarranty")), ] 
products <- products[-(which (products$Volume == 0)), ]

We have also changed the categorical variables into binary variables in order to create a regression model.

newDataFrame <- dummyVars(" ~ .", data = products)
products <- data.frame(predict(newDataFrame, newdata = products))

Removing the missing values

products$BestSellersRank <- NULL
productsNew$BestSellersRank <- NULL
correlationmatrix <- cor(products[,-c(1:13)])

Correlation of the variables: Decision tree for finding relevant variables:

## Warning: Bad 'data' field in model 'call' (expected a data.frame or a matrix).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

What is the importance of each variable:

##         x4StarReviews         x3StarReviews         x2StarReviews 
##            82655846.1            60424872.6            59326548.1 
##         x1StarReviews PositiveServiceReview NegativeServiceReview 
##            45824946.8            43345059.5            21700469.6 
##            ProductNum         ProductHeight 
##             2196648.8              303145.1

We have removed all the variables that have high correlation with another independent variable, as well as, the variables that are not relevant to the Sales Volume. About the x5StarReviews, it has a correlation of 1 with the dependent variable:

remove<-which(names(products) %in% c("NegativeServiceReview","x5StarReviews",
                                     "x3StarReviews","x2StarReviews"))
products<- products[,-remove]

Removing outliers using the function created:

outliers_list <- outliers_DF(DF = products, flexibilidad = 4)
products <- products[-outliers_list$Volume$posiciones_plus,]

Creating the predictive model:

Spliting the dataset into Training and Testing. The training represents the 75% of the total observations.

partition<-createDataPartition(y = products$Volume, times = 1,p = 0.75,
                               list = FALSE )
products_train<- products[partition,]
products_test<- products[-partition,]
products_train2<-products_train[,-which(names(products_train)%in%"ProductNum")]
names(products_train2)
##  [1] "ProductType.Accessories"      "ProductType.Display"         
##  [3] "ProductType.ExtendedWarranty" "ProductType.GameConsole"     
##  [5] "ProductType.Laptop"           "ProductType.Netbook"         
##  [7] "ProductType.PC"               "ProductType.Printer"         
##  [9] "ProductType.PrinterSupplies"  "ProductType.Smartphone"      
## [11] "ProductType.Software"         "ProductType.Tablet"          
## [13] "Price"                        "x4StarReviews"               
## [15] "x1StarReviews"                "PositiveServiceReview"       
## [17] "Recommendproduct"             "ShippingWeight"              
## [19] "ProductDepth"                 "ProductWidth"                
## [21] "ProductHeight"                "ProfitMargin"                
## [23] "Volume"

Features of the train model:

tc1 <- trainControl(method = "repeatedcv",
                           number=10,
                           repeats = 1)

We have first checked which predictive model has the best performance. The models used are the following: Linear model, Random Forest, SVM Lineal, SVM Radial.

Plotting the performance of each predictive model:

colnames(results) <- c("lm","rf","svmLinear","svmRadial")

melted_results <- melt(results)
ggplot(melted_results, aes(x=X2, y = value)) + 
  geom_bar(stat = "identity", aes(fill=X2)) +
  facet_wrap(~X1,scales = "free") + 
  scale_fill_brewer(palette = "Spectral") +
  stat_summary(fun.y = max, colour="black", geom="text",
               vjust=-1,
               # position = position_nudge(x = 0, y = -0.12),
               aes(label=round(..y.., digits=2))) + 
  xlab("Models") + 
  ylab("")

Formulas for the model based on the most relevant variables. Which variables should we use for making predictions?

SVM_funcion0 <- (Volume ~ .)
SVM_funcion1 <- (Volume ~ x4StarReviews)
SVM_funcion2 <- (Volume ~ x4StarReviews + PositiveServiceReview)
SVM_funcion3 <- (Volume ~ x4StarReviews + PositiveServiceReview +
                   ShippingWeight)
SVM_function <- c(SVM_funcion0, SVM_funcion1, SVM_funcion2, SVM_funcion3 )
resultSVM <- c()
for (i in SVM_function) {
  
  modelsvm0 <- train(i,
                     data = products_train2,
                     method = "svmLinear",
                     tuneLength = 5,
                     trControl= tc1)
  
  pr <- postResample(predict(modelsvm0,products_test),products_test$Volume)
  
  resultSVM<- cbind(resultSVM , pr)
  
}
colnames(resultSVM)<- c("SVM_funcion0", "SVM_funcion1",
                        "SVM_funcion2", "SVM_funcion3")
meltsvm <- melt(resultSVM)
ggplot(data = meltsvm, aes(x=X2,y=value, label = round(value))) + 
  geom_bar(stat = "identity", aes(fill=X2)) +
  stat_summary(fun.y = max, colour="black", geom="text",
               vjust=+1,
               # position = position_nudge(x = 0, y = -0.12),
               aes(label=round(..y.., digits=4))) +
  facet_grid(X1~., scales = "free") + 
  scale_fill_brewer(palette = "Spectral") 

Now we plot the errors of the model

First we rename the dummy variables

productstorename <- preducts2$ProductNum   %in%  products$ProductNum
products$ProductType <- preducts2$ProductType[which(productstorename)]

Then we predict the data to plot errors

modelsvm <- train(Volume ~ x4StarReviews + PositiveServiceReview,
                  data = products_train2,
                  method = "svmLinear",
                  tuneLength    = 5,
                  trControl= tc1)

products$Predictions <- round(predict(modelsvm,products))

products$absolute_error <- products$Predictions - products$Volume

products$relative_error <- (products$Predictions - products$Volume)/
  products$Volume

And start making the plots

First sales by product:

ggplot(data = products , aes(x = ProductType, y = Volume)) + 
  geom_bar(stat = "identity", aes(fill = ProductType)) + 
  scale_fill_brewer(palette = "Paired")

Then the absolute error

ggplot(data = products , aes(x = ProductType, y = absolute_error)) + 
  geom_bar(stat = "identity", aes(fill = ProductType)) + 
  scale_fill_brewer(palette = "Paired")

Followed by a Scatter plot whith the comparison of prediction/Sales

ggplot(data = products) + 
  geom_point(aes(x=ProductNum, y = Volume),color = "chartreuse2") + 
  geom_point(aes(x=ProductNum, y = Predictions),color = "firebrick1") + 
  scale_fill_brewer(palette = "Paired")

Then we improve it Absolute error by product sorted by descending Sales and we find a bug with the program it does’t sort correctly.

ggplot(data = products ,
       aes(x =reorder(ProductType,-Volume) , y = Volume )) + 
  geom_bar(stat = "identity", aes(fill = ProductType)) + 
  scale_fill_brewer(palette = "Paired") + 
  xlab("Product Number")

And the next part is the ploting of the predictions.

wich is started with making the preditcions

productsNew$Predictions <- round(predict(modelsvm,productsNew))

predictionsWeCareAbout <- productsNew[which(productsNew$ProductType %in% 
                                              c("PC","Laptop","Netbook",
                                                "Smartphone")), ]

productsNew$Benefit <- productsNew$Predictions * productsNew$Price *
  productsNew$ProfitMargin

predictionsWeCareAbout$Profit <- predictionsWeCareAbout$Predictions * 
  predictionsWeCareAbout$Price *  predictionsWeCareAbout$ProfitMargin

And the plots the first one being predicted profit by category

ggplot(data = productsNew , aes(x = ProductType, y = Benefit)) + 
  geom_bar(stat = "identity", aes(fill = ProductType)) + 
  scale_fill_brewer(palette = "Paired")

And then sorted

ggplot(data = productsNew ,
       aes(x =reorder( ProductNum , -Benefit) , y = Benefit)) + 
  geom_bar(stat = "identity", aes(fill = ProductType)) + 
  scale_fill_brewer(palette = "Paired") + 
  xlab("Product Number")

Next are the predictes sales by product category

ggplot(data = productsNew ,
       aes(x =reorder( ProductNum , -Predictions) , y = Predictions)) + 
  geom_bar(stat = "identity", aes(fill = ProductType)) + 
  scale_fill_brewer(palette = "Paired") + 
  xlab("Product Number") + 
  ylab("Predictes Sales") 

The profit by product

ggplot(data = predictionsWeCareAbout ,
       aes(x =reorder( ProductNum , -Profit) , y = Profit)) + 
  geom_bar(stat = "identity", aes(fill = ProductType)) + 
  scale_fill_brewer(palette = "Paired") + 
  xlab("Product Number")

And the specific profit of the products they asked.

ggplot(data = predictionsWeCareAbout ,
       aes(x =reorder( ProductNum , -Profit) , y = Profit)) + 
  geom_bar(stat = "identity", aes(fill = ProductType)) + 
  scale_fill_brewer(palette = "Paired") + 
  xlab("Product Number")

With its respective sales

predictionsWeCareAbout$ProductNum <- as.factor(predictionsWeCareAbout$ProductNum)
a <- ggplot(data = predictionsWeCareAbout ,
       aes(x =reorder( ProductNum , -Predictions) , y = Predictions)) + 
  geom_bar(stat = "identity", aes(fill = ProductType)) + 
  scale_fill_brewer(palette = "Paired") + 
  xlab("Product Number") + 
  ylab("Predictes Sales") 

  

ggplotly(a)