topXX.r

Emilio — Jun 22, 2014, 1:26 PM

# ===============================================================================
# topXX.r -> Petrol Station Data Analysis by Emilio González González
#            End of Degree Project.  Jan-June 2014
#            Determinate the best seller products (including not only fuels)
# ===============================================================================
setwd("~/Economic Degree/TrabajoFinGrado")
library("plyr")

# Global Parameters
SampleSize <- 10        # show 'SampleSize' number of records when asking for a sample
topX       <- 30        # topX days to show
limiteMax  <- 1000000   # split the input file by quantities smaller and bigger than 'limiteMax'

# Data read
datos <- read.table("CSVTodo.csv", header=TRUE, sep=";", na.strings="NA", dec=",", strip.white=TRUE)

# Subset the datafile to transactions of less than limiteMax Qty
smallSales <- datos[which(datos$quantity <= limiteMax),]


# Group data by date and product 
#============================================================================
datosFP <-ddply(smallSales, c("date", "product"), summarise, qty=sum(quantity), imp=sum(amount), price=mean(price))
head(datosFP)
        date              product qty   imp price
1 01/01/2013          AGUA 0.5 L.  13 10.40  0.80
2 01/01/2013            AGUA 1.5L  10 12.00  1.20
3 01/01/2013           APERITIVOS   1  1.80  1.80
4 01/01/2013    AQUARIUS 0,5LIMON   3  5.25  1.75
5 01/01/2013          AQUARIUS 1L   2  6.00  3.00
6 01/01/2013 AQUARIUS NARANJA 0.5   3  5.25  1.75
str(datosFP)
'data.frame':   19630 obs. of  5 variables:
 $ date   : Factor w/ 338 levels "01/01/2013","01/02/2013",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ product: Factor w/ 394 levels "3CV 2T 1L.","3CV ATF O.5L",..: 18 19 33 35 36 37 38 43 49 51 ...
 $ qty    : num  13 10 1 3 2 3 1 3 3 7 ...
 $ imp    : num  10.4 12 1.8 5.25 6 ...
 $ price  : num  0.8 1.2 1.8 1.75 3 ...
summary(datosFP)
         date                  product           qty        
 07/09/2012:   95   DIESEL e+      :  337   Min.   :     0  
 09/12/2012:   91   GASOLINA 95    :  336   1st Qu.:     1  
 14/10/2012:   91   AGUA 1.5L      :  335   Median :     2  
 02/06/2013:   90   DIESELe+10     :  335   Mean   :  1604  
 27/03/2013:   89   REFRESCO 1/2 L.:  332   3rd Qu.:     4  
 05/08/2012:   88   AGUA 0.5 L.    :  327   Max.   :380025  
 (Other)   :19086   (Other)        :17628                   
      imp             price      
 Min.   :     0   Min.   :    0  
 1st Qu.:     2   1st Qu.:    1  
 Median :     4   Median :    2  
 Mean   :  1710   Mean   :   10  
 3rd Qu.:    11   3rd Qu.:    2  
 Max.   :411487   Max.   :18750  

# Group only by product
#================================================================================
datosP <-ddply(datosFP, c("product"), summarise,  qty=sum(qty), imp=sum(imp), price=mean(price) )
# Pie Chart %Qty sold 
numslices <- 8
order.qty <- order(datosP$qty, decreasing=T)
lbls  <- round(datosP$qty/sum(datosP$qty)*100, digits=2)
lbls <- paste(lbls,"%",sep="") # ad % to labels 
lbls <- lbls[order.qty[1:numslices]]
plot.new()
pie(datosP[order.qty[1:numslices], c("qty")],  main="Sales by Product", cex.main=1.7, labels=lbls, col=rainbow(7))
legend(-1.7,0.6, legend = datosP$product[order.qty[1:numslices]],  bty="n", fill=rainbow(9))

plot of chunk unnamed-chunk-1

# Function to show percentajes
printf <- function(...)print(sprintf(...))
percentaje <- function(data, topx=8, variable="") {
        order.data    <- order(data, decreasing=T)
        value.total   <- sum(data)
        value.topx    <- sum(data[order.data[1:topx]])
        value.rest    <- value.total-value.topx
        pct.topx      <- 100.0*value.topx / value.total
        pct.rest      <- 100.0*value.rest/value.total
        printf("%s:  Top %6.0f products = % 12.2f [%07.4f%%]", variable, topx, value.topx, pct.topx)
        printf("%s: Rest %6.0f products = % 12.2f [%07.4f%%]", variable, length(data)-topx, value.rest, pct.rest)
}
percentaje(datosP$qty, 8, "Quantity")
[1] "Quantity:  Top      8 products =  31351839.42 [99.5582%]"
[1] "Quantity: Rest    386 products =    139127.33 [00.4418%]"
percentaje(datosP$imp, 8, "Amount")
[1] "Amount:  Top      8 products =  33198721.27 [98.9271%]"
[1] "Amount: Rest    386 products =    360039.87 [01.0729%]"
# If 3d
library(plotrix)
pie3D(datosP[order.qty[1:numslices], c("qty")], explode=0.1, main="Sales by Product", labels=lbls, col=rainbow(7))

plot of chunk unnamed-chunk-1

# Show basic statistic by product
#================================================================================
#statsP <-ddply(smallSales, c("descripcion"),  summarise, mean=mean(precio), sd=sd(precio), min=min(precio), max=max(precio), "quantile values (0% 25% 50% 75% 100%)"=list(quantile(precio)))
statsP <-ddply(smallSales, c("product"),  summarise, mean=mean(price), sd=sd(price), min=min(price), max=max(price))
write.table(statsP, paste(limiteMax, "statsByProduct.txt", sep=""), row.names=FALSE, sep="\t")

# Sorting data by quantity
orden.qty <- order(datosP$qty, decreasing=T)
topXXQty <- datosP[orden.qty,c("product", "qty", "imp", "price")]
# Show the topX records
head(topXXQty, topX)
                            product      qty       imp  price
97                        DIESEL e+ 13190256 1.601e+07 1.3944
127                           GAS B  9559364 8.063e+06 0.9448
128                           GAS C  3230354 2.642e+06 0.9212
130                     GASOLINA 95  2520720 3.209e+06 1.4687
126                           GAS A  1701104 1.914e+06 1.2985
98                       DIESELe+10   541433 6.847e+05 1.4648
129                       GASOLEO B   376087 3.554e+05 1.0824
131                      gasolina98   232521 3.173e+05 1.5845
15                           Adblue    88293 3.358e+04 0.7022
34  APORTACIONES PROMOCIONES REPSOL     5285 1.733e+03 0.6549
308                 REFRESCO 1/2 L.     3144 5.030e+03 1.6000
19                        AGUA 1.5L     3028 3.634e+03 1.2000
18                      AGUA 0.5 L.     2301 1.841e+03 0.8000
204                   LATA REFRESCO     2232 3.125e+03 1.4000
202                    LATA CERVEZA     1627 2.278e+03 1.4000
54                   BOLSA DE HIELO     1406 2.742e+03 1.9500
306                RED BULL 250 ML.     1278 3.067e+03 2.4000
284                          PRENSA     1225 3.987e+03 3.2001
334                  SANWICHES 3.80     1084 4.119e+03 3.8000
269                 PATATAS GRANDES      916 2.015e+03 2.2000
158              HARIBO BALLA BALLA      860 1.290e+03 1.5000
84               COCA-COLA  1 LITRO      741 1.556e+03 2.1000
375                 TRINDENT SENSES      711 1.066e+03 1.5000
80                      CHUPA CHUPS      704 2.112e+02 0.3000
174                    KINDER BUENO      701 7.711e+02 1.1000
287          PRINGLES GRANDES  170G      685 2.055e+03 3.0000
75                      CERVEZA 1L.      662 1.456e+03 2.2000
35                AQUARIUS 0,5LIMON      624 1.092e+03 1.7500
206                        LAVADO 2      554 2.770e+03 5.0000
51                         BOLLERIA      533 4.797e+02 0.9000
# Write a table to manipulate data with external tools (excel, etc)
write.table(topX, paste(limiteMax,"topSellers.txt", sep=""), row.names=FALSE, sep="\t")

# Sorting dta by sales volume
orden.amnt <- order(datosP$imp, decreasing=T)
topXXAmnt <- datosP[orden.amnt,c("product", "qty", "imp", "price")]
# Show the topx records
head(topXXAmnt, topX)
                            product      qty      imp     price
97                        DIESEL e+ 13190256 16013723    1.3944
127                           GAS B  9559364  8063020    0.9448
130                     GASOLINA 95  2520720  3208736    1.4687
128                           GAS C  3230354  2641821    0.9212
126                           GAS A  1701104  1914054    1.2985
98                       DIESELe+10   541433   684663    1.4648
129                       GASOLEO B   376087   355379    1.0824
131                      gasolina98   232521   317326    1.5845
285   PRESTACION DE SERVICIO SOLRED       45    55794 1248.8361
89          Construcciones en curso        7    55250 6083.3267
15                           Adblue    88293    33581    0.7022
348        Suministros Electricidad       18    13326 1211.6025
341    Servicios Exteriores Seguros       16    11452  728.9250
313     Reparaciones y conservaci¢n       13    11219  901.8708
38         Arrendamientos y c nones       13    10959  832.9824
347                     Suministros       42     8787  205.2891
256      OTROS SERVICIOS EXTERIORES       59     7606  137.5251
350 Suministros Gas¢leo Calefacci¢n        6     6107 1017.7800
293                RAQUET. BOSCH 65      305     5124   16.8000
308                 REFRESCO 1/2 L.     3144     5030    1.6000
336  Servicios bancarios CAJA RURAL       22     4304  226.3505
334                  SANWICHES 3.80     1084     4119    3.8000
284                          PRENSA     1225     3987    3.2001
339   Servicios Exteriores Asesoria        6     3814  685.2510
86                COMISIONES SOLRED       24     3782  197.4407
19                        AGUA 1.5L     3028     3634    1.2000
354              SUMINISTROS VARIOS        9     3545  421.9563
342  Servicios Exteriores Telefonia       48     3175   72.1948
204                   LATA REFRESCO     2232     3125    1.4000
306                RED BULL 250 ML.     1278     3067    2.4000