Library:
pacman::p_load(dplyr,tidyr,plyr,ggplot2,caret,reader, RColorBrewer,corrplot,GGally,reshape, viridis, rpart, rpart.plot,plotly)
Function for removing outliers:
outliers_DF <- function(DF,flexibilidad)
{
list_return <- list()
for (i in 1:ncol(DF)) {
list_outliers<- list()
vec <- DF[,i]
posiciones_plus<-which(vec - (flexibilidad * sd(vec)) > median(vec))
posiciones_minus<-which(vec + (flexibilidad * sd(vec)) < median(vec))
list_outliers[["posiciones_plus"]] <- posiciones_plus
list_outliers[["posiciones_minus"]] <- posiciones_minus
list_return[[names(DF)[i]]] <- list_outliers
}
return <- list_return
}
Importing the data for analyzing:
set.seed("123")
setwd("C:/Users/Alfonso/Desktop/Ubiqum/Bloque_2/Task-3/Data/")
products<-read.csv("existingproductattributes2017.csv")
preducts2 <- products
productsNew<-read.csv("newproductattributes2017.csv")
Let’s have an overview of the dataset:
names(products)
## [1] "ProductType" "ProductNum"
## [3] "Price" "x5StarReviews"
## [5] "x4StarReviews" "x3StarReviews"
## [7] "x2StarReviews" "x1StarReviews"
## [9] "PositiveServiceReview" "NegativeServiceReview"
## [11] "Recommendproduct" "BestSellersRank"
## [13] "ShippingWeight" "ProductDepth"
## [15] "ProductWidth" "ProductHeight"
## [17] "ProfitMargin" "Volume"
summary (products$ProductType)
## Accessories Display ExtendedWarranty GameConsole
## 26 5 10 2
## Laptop Netbook PC Printer
## 3 2 4 12
## PrinterSupplies Smartphone Software Tablet
## 3 4 6 3
First, we have removed the irrelevant observations for the predictions: We have removed all the products that have sales volume of 0 and all the Extended Warranties of the dataset.
products <- products[-(which (products$ProductType == "ExtendedWarranty")), ]
products <- products[-(which (products$Volume == 0)), ]
We have also changed the categorical variables into binary variables in order to create a regression model.
newDataFrame <- dummyVars(" ~ .", data = products)
products <- data.frame(predict(newDataFrame, newdata = products))
Removing the missing values
products$BestSellersRank <- NULL
productsNew$BestSellersRank <- NULL
correlationmatrix <- cor(products[,-c(1:13)])
Correlation of the variables: Decision tree for finding relevant variables:
## Warning: Bad 'data' field in model 'call' (expected a data.frame or a matrix).
## To silence this warning:
## Call rpart.plot with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.
What is the importance of each variable:
## x4StarReviews x3StarReviews x2StarReviews
## 82655846.1 60424872.6 59326548.1
## x1StarReviews PositiveServiceReview NegativeServiceReview
## 45824946.8 43345059.5 21700469.6
## ProductNum ProductHeight
## 2196648.8 303145.1
We have removed all the variables that have high correlation with another independent variable, as well as, the variables that are not relevant to the Sales Volume. About the x5StarReviews, it has a correlation of 1 with the dependent variable:
remove<-which(names(products) %in% c("NegativeServiceReview","x5StarReviews",
"x3StarReviews","x2StarReviews"))
products<- products[,-remove]
Removing outliers using the function created:
outliers_list <- outliers_DF(DF = products, flexibilidad = 4)
products <- products[-outliers_list$Volume$posiciones_plus,]
Spliting the dataset into Training and Testing. The training represents the 75% of the total observations.
partition<-createDataPartition(y = products$Volume, times = 1,p = 0.75,
list = FALSE )
products_train<- products[partition,]
products_test<- products[-partition,]
products_train2<-products_train[,-which(names(products_train)%in%"ProductNum")]
names(products_train2)
## [1] "ProductType.Accessories" "ProductType.Display"
## [3] "ProductType.ExtendedWarranty" "ProductType.GameConsole"
## [5] "ProductType.Laptop" "ProductType.Netbook"
## [7] "ProductType.PC" "ProductType.Printer"
## [9] "ProductType.PrinterSupplies" "ProductType.Smartphone"
## [11] "ProductType.Software" "ProductType.Tablet"
## [13] "Price" "x4StarReviews"
## [15] "x1StarReviews" "PositiveServiceReview"
## [17] "Recommendproduct" "ShippingWeight"
## [19] "ProductDepth" "ProductWidth"
## [21] "ProductHeight" "ProfitMargin"
## [23] "Volume"
Features of the train model:
tc1 <- trainControl(method = "repeatedcv",
number=10,
repeats = 1)
We have first checked which predictive model has the best performance. The models used are the following: Linear model, Random Forest, SVM Lineal, SVM Radial.
Plotting the performance of each predictive model:
colnames(results) <- c("lm","rf","svmLinear","svmRadial")
melted_results <- melt(results)
ggplot(melted_results, aes(x=X2, y = value)) +
geom_bar(stat = "identity", aes(fill=X2)) +
facet_wrap(~X1,scales = "free") +
scale_fill_brewer(palette = "Spectral") +
stat_summary(fun.y = max, colour="black", geom="text",
vjust=-1,
# position = position_nudge(x = 0, y = -0.12),
aes(label=round(..y.., digits=2))) +
xlab("Models") +
ylab("")
Formulas for the model based on the most relevant variables. Which variables should we use for making predictions?
SVM_funcion0 <- (Volume ~ .)
SVM_funcion1 <- (Volume ~ x4StarReviews)
SVM_funcion2 <- (Volume ~ x4StarReviews + PositiveServiceReview)
SVM_funcion3 <- (Volume ~ x4StarReviews + PositiveServiceReview +
ShippingWeight)
SVM_function <- c(SVM_funcion0, SVM_funcion1, SVM_funcion2, SVM_funcion3 )
resultSVM <- c()
for (i in SVM_function) {
modelsvm0 <- train(i,
data = products_train2,
method = "svmLinear",
tuneLength = 5,
trControl= tc1)
pr <- postResample(predict(modelsvm0,products_test),products_test$Volume)
resultSVM<- cbind(resultSVM , pr)
}
colnames(resultSVM)<- c("SVM_funcion0", "SVM_funcion1",
"SVM_funcion2", "SVM_funcion3")
meltsvm <- melt(resultSVM)
ggplot(data = meltsvm, aes(x=X2,y=value, label = round(value))) +
geom_bar(stat = "identity", aes(fill=X2)) +
stat_summary(fun.y = max, colour="black", geom="text",
vjust=+1,
# position = position_nudge(x = 0, y = -0.12),
aes(label=round(..y.., digits=4))) +
facet_grid(X1~., scales = "free") +
scale_fill_brewer(palette = "Spectral")
Now we plot the errors of the model
First we rename the dummy variables
productstorename <- preducts2$ProductNum %in% products$ProductNum
products$ProductType <- preducts2$ProductType[which(productstorename)]
Then we predict the data to plot errors
modelsvm <- train(Volume ~ x4StarReviews + PositiveServiceReview,
data = products_train2,
method = "svmLinear",
tuneLength = 5,
trControl= tc1)
products$Predictions <- round(predict(modelsvm,products))
products$absolute_error <- products$Predictions - products$Volume
products$relative_error <- (products$Predictions - products$Volume)/
products$Volume
And start making the plots
First sales by product:
ggplot(data = products , aes(x = ProductType, y = Volume)) +
geom_bar(stat = "identity", aes(fill = ProductType)) +
scale_fill_brewer(palette = "Paired")
Then the absolute error
ggplot(data = products , aes(x = ProductType, y = absolute_error)) +
geom_bar(stat = "identity", aes(fill = ProductType)) +
scale_fill_brewer(palette = "Paired")
Followed by a Scatter plot whith the comparison of prediction/Sales
ggplot(data = products) +
geom_point(aes(x=ProductNum, y = Volume),color = "chartreuse2") +
geom_point(aes(x=ProductNum, y = Predictions),color = "firebrick1") +
scale_fill_brewer(palette = "Paired")
Then we improve it Absolute error by product sorted by descending Sales and we find a bug with the program it does’t sort correctly.
ggplot(data = products ,
aes(x =reorder(ProductType,-Volume) , y = Volume )) +
geom_bar(stat = "identity", aes(fill = ProductType)) +
scale_fill_brewer(palette = "Paired") +
xlab("Product Number")
And the next part is the ploting of the predictions.
wich is started with making the preditcions
productsNew$Predictions <- round(predict(modelsvm,productsNew))
predictionsWeCareAbout <- productsNew[which(productsNew$ProductType %in%
c("PC","Laptop","Netbook",
"Smartphone")), ]
productsNew$Benefit <- productsNew$Predictions * productsNew$Price *
productsNew$ProfitMargin
predictionsWeCareAbout$Profit <- predictionsWeCareAbout$Predictions *
predictionsWeCareAbout$Price * predictionsWeCareAbout$ProfitMargin
And the plots the first one being predicted profit by category
ggplot(data = productsNew , aes(x = ProductType, y = Benefit)) +
geom_bar(stat = "identity", aes(fill = ProductType)) +
scale_fill_brewer(palette = "Paired")
And then sorted
ggplot(data = productsNew ,
aes(x =reorder( ProductNum , -Benefit) , y = Benefit)) +
geom_bar(stat = "identity", aes(fill = ProductType)) +
scale_fill_brewer(palette = "Paired") +
xlab("Product Number")
Next are the predictes sales by product category
ggplot(data = productsNew ,
aes(x =reorder( ProductNum , -Predictions) , y = Predictions)) +
geom_bar(stat = "identity", aes(fill = ProductType)) +
scale_fill_brewer(palette = "Paired") +
xlab("Product Number") +
ylab("Predictes Sales")
The profit by product
ggplot(data = predictionsWeCareAbout ,
aes(x =reorder( ProductNum , -Profit) , y = Profit)) +
geom_bar(stat = "identity", aes(fill = ProductType)) +
scale_fill_brewer(palette = "Paired") +
xlab("Product Number")
And the specific profit of the products they asked.
ggplot(data = predictionsWeCareAbout ,
aes(x =reorder( ProductNum , -Profit) , y = Profit)) +
geom_bar(stat = "identity", aes(fill = ProductType)) +
scale_fill_brewer(palette = "Paired") +
xlab("Product Number")
With its respective sales
predictionsWeCareAbout$ProductNum <- as.factor(predictionsWeCareAbout$ProductNum)
a <- ggplot(data = predictionsWeCareAbout ,
aes(x =reorder( ProductNum , -Predictions) , y = Predictions)) +
geom_bar(stat = "identity", aes(fill = ProductType)) +
scale_fill_brewer(palette = "Paired") +
xlab("Product Number") +
ylab("Predictes Sales")
ggplotly(a)