Emilio — Jun 22, 2014, 1:26 PM
# ===============================================================================
# topXX.r -> Petrol Station Data Analysis by Emilio González González
# End of Degree Project. Jan-June 2014
# Determinate the best seller products (including not only fuels)
# ===============================================================================
setwd("~/Economic Degree/TrabajoFinGrado")
library("plyr")
# Global Parameters
SampleSize <- 10 # show 'SampleSize' number of records when asking for a sample
topX <- 30 # topX days to show
limiteMax <- 1000000 # split the input file by quantities smaller and bigger than 'limiteMax'
# Data read
datos <- read.table("CSVTodo.csv", header=TRUE, sep=";", na.strings="NA", dec=",", strip.white=TRUE)
# Subset the datafile to transactions of less than limiteMax Qty
smallSales <- datos[which(datos$quantity <= limiteMax),]
# Group data by date and product
#============================================================================
datosFP <-ddply(smallSales, c("date", "product"), summarise, qty=sum(quantity), imp=sum(amount), price=mean(price))
head(datosFP)
date product qty imp price
1 01/01/2013 AGUA 0.5 L. 13 10.40 0.80
2 01/01/2013 AGUA 1.5L 10 12.00 1.20
3 01/01/2013 APERITIVOS 1 1.80 1.80
4 01/01/2013 AQUARIUS 0,5LIMON 3 5.25 1.75
5 01/01/2013 AQUARIUS 1L 2 6.00 3.00
6 01/01/2013 AQUARIUS NARANJA 0.5 3 5.25 1.75
str(datosFP)
'data.frame': 19630 obs. of 5 variables:
$ date : Factor w/ 338 levels "01/01/2013","01/02/2013",..: 1 1 1 1 1 1 1 1 1 1 ...
$ product: Factor w/ 394 levels "3CV 2T 1L.","3CV ATF O.5L",..: 18 19 33 35 36 37 38 43 49 51 ...
$ qty : num 13 10 1 3 2 3 1 3 3 7 ...
$ imp : num 10.4 12 1.8 5.25 6 ...
$ price : num 0.8 1.2 1.8 1.75 3 ...
summary(datosFP)
date product qty
07/09/2012: 95 DIESEL e+ : 337 Min. : 0
09/12/2012: 91 GASOLINA 95 : 336 1st Qu.: 1
14/10/2012: 91 AGUA 1.5L : 335 Median : 2
02/06/2013: 90 DIESELe+10 : 335 Mean : 1604
27/03/2013: 89 REFRESCO 1/2 L.: 332 3rd Qu.: 4
05/08/2012: 88 AGUA 0.5 L. : 327 Max. :380025
(Other) :19086 (Other) :17628
imp price
Min. : 0 Min. : 0
1st Qu.: 2 1st Qu.: 1
Median : 4 Median : 2
Mean : 1710 Mean : 10
3rd Qu.: 11 3rd Qu.: 2
Max. :411487 Max. :18750
# Group only by product
#================================================================================
datosP <-ddply(datosFP, c("product"), summarise, qty=sum(qty), imp=sum(imp), price=mean(price) )
# Pie Chart %Qty sold
numslices <- 8
order.qty <- order(datosP$qty, decreasing=T)
lbls <- round(datosP$qty/sum(datosP$qty)*100, digits=2)
lbls <- paste(lbls,"%",sep="") # ad % to labels
lbls <- lbls[order.qty[1:numslices]]
plot.new()
pie(datosP[order.qty[1:numslices], c("qty")], main="Sales by Product", cex.main=1.7, labels=lbls, col=rainbow(7))
legend(-1.7,0.6, legend = datosP$product[order.qty[1:numslices]], bty="n", fill=rainbow(9))
# Function to show percentajes
printf <- function(...)print(sprintf(...))
percentaje <- function(data, topx=8, variable="") {
order.data <- order(data, decreasing=T)
value.total <- sum(data)
value.topx <- sum(data[order.data[1:topx]])
value.rest <- value.total-value.topx
pct.topx <- 100.0*value.topx / value.total
pct.rest <- 100.0*value.rest/value.total
printf("%s: Top %6.0f products = % 12.2f [%07.4f%%]", variable, topx, value.topx, pct.topx)
printf("%s: Rest %6.0f products = % 12.2f [%07.4f%%]", variable, length(data)-topx, value.rest, pct.rest)
}
percentaje(datosP$qty, 8, "Quantity")
[1] "Quantity: Top 8 products = 31351839.42 [99.5582%]"
[1] "Quantity: Rest 386 products = 139127.33 [00.4418%]"
percentaje(datosP$imp, 8, "Amount")
[1] "Amount: Top 8 products = 33198721.27 [98.9271%]"
[1] "Amount: Rest 386 products = 360039.87 [01.0729%]"
# If 3d
library(plotrix)
pie3D(datosP[order.qty[1:numslices], c("qty")], explode=0.1, main="Sales by Product", labels=lbls, col=rainbow(7))
# Show basic statistic by product
#================================================================================
#statsP <-ddply(smallSales, c("descripcion"), summarise, mean=mean(precio), sd=sd(precio), min=min(precio), max=max(precio), "quantile values (0% 25% 50% 75% 100%)"=list(quantile(precio)))
statsP <-ddply(smallSales, c("product"), summarise, mean=mean(price), sd=sd(price), min=min(price), max=max(price))
write.table(statsP, paste(limiteMax, "statsByProduct.txt", sep=""), row.names=FALSE, sep="\t")
# Sorting data by quantity
orden.qty <- order(datosP$qty, decreasing=T)
topXXQty <- datosP[orden.qty,c("product", "qty", "imp", "price")]
# Show the topX records
head(topXXQty, topX)
product qty imp price
97 DIESEL e+ 13190256 1.601e+07 1.3944
127 GAS B 9559364 8.063e+06 0.9448
128 GAS C 3230354 2.642e+06 0.9212
130 GASOLINA 95 2520720 3.209e+06 1.4687
126 GAS A 1701104 1.914e+06 1.2985
98 DIESELe+10 541433 6.847e+05 1.4648
129 GASOLEO B 376087 3.554e+05 1.0824
131 gasolina98 232521 3.173e+05 1.5845
15 Adblue 88293 3.358e+04 0.7022
34 APORTACIONES PROMOCIONES REPSOL 5285 1.733e+03 0.6549
308 REFRESCO 1/2 L. 3144 5.030e+03 1.6000
19 AGUA 1.5L 3028 3.634e+03 1.2000
18 AGUA 0.5 L. 2301 1.841e+03 0.8000
204 LATA REFRESCO 2232 3.125e+03 1.4000
202 LATA CERVEZA 1627 2.278e+03 1.4000
54 BOLSA DE HIELO 1406 2.742e+03 1.9500
306 RED BULL 250 ML. 1278 3.067e+03 2.4000
284 PRENSA 1225 3.987e+03 3.2001
334 SANWICHES 3.80 1084 4.119e+03 3.8000
269 PATATAS GRANDES 916 2.015e+03 2.2000
158 HARIBO BALLA BALLA 860 1.290e+03 1.5000
84 COCA-COLA 1 LITRO 741 1.556e+03 2.1000
375 TRINDENT SENSES 711 1.066e+03 1.5000
80 CHUPA CHUPS 704 2.112e+02 0.3000
174 KINDER BUENO 701 7.711e+02 1.1000
287 PRINGLES GRANDES 170G 685 2.055e+03 3.0000
75 CERVEZA 1L. 662 1.456e+03 2.2000
35 AQUARIUS 0,5LIMON 624 1.092e+03 1.7500
206 LAVADO 2 554 2.770e+03 5.0000
51 BOLLERIA 533 4.797e+02 0.9000
# Write a table to manipulate data with external tools (excel, etc)
write.table(topX, paste(limiteMax,"topSellers.txt", sep=""), row.names=FALSE, sep="\t")
# Sorting dta by sales volume
orden.amnt <- order(datosP$imp, decreasing=T)
topXXAmnt <- datosP[orden.amnt,c("product", "qty", "imp", "price")]
# Show the topx records
head(topXXAmnt, topX)
product qty imp price
97 DIESEL e+ 13190256 16013723 1.3944
127 GAS B 9559364 8063020 0.9448
130 GASOLINA 95 2520720 3208736 1.4687
128 GAS C 3230354 2641821 0.9212
126 GAS A 1701104 1914054 1.2985
98 DIESELe+10 541433 684663 1.4648
129 GASOLEO B 376087 355379 1.0824
131 gasolina98 232521 317326 1.5845
285 PRESTACION DE SERVICIO SOLRED 45 55794 1248.8361
89 Construcciones en curso 7 55250 6083.3267
15 Adblue 88293 33581 0.7022
348 Suministros Electricidad 18 13326 1211.6025
341 Servicios Exteriores Seguros 16 11452 728.9250
313 Reparaciones y conservaci¢n 13 11219 901.8708
38 Arrendamientos y c nones 13 10959 832.9824
347 Suministros 42 8787 205.2891
256 OTROS SERVICIOS EXTERIORES 59 7606 137.5251
350 Suministros Gas¢leo Calefacci¢n 6 6107 1017.7800
293 RAQUET. BOSCH 65 305 5124 16.8000
308 REFRESCO 1/2 L. 3144 5030 1.6000
336 Servicios bancarios CAJA RURAL 22 4304 226.3505
334 SANWICHES 3.80 1084 4119 3.8000
284 PRENSA 1225 3987 3.2001
339 Servicios Exteriores Asesoria 6 3814 685.2510
86 COMISIONES SOLRED 24 3782 197.4407
19 AGUA 1.5L 3028 3634 1.2000
354 SUMINISTROS VARIOS 9 3545 421.9563
342 Servicios Exteriores Telefonia 48 3175 72.1948
204 LATA REFRESCO 2232 3125 1.4000
306 RED BULL 250 ML. 1278 3067 2.4000