# tableplot.R

# Attach ggplot2, which provides the diamonds data set used below.
# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and would let the script continue without the data.
library(ggplot2)

## Loading required package: ggplot2

library(tabplot)

## Loading required package: bit

## Attaching package bit

## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)

## creators: bit bitwhich

## coercion: as.logical as.integer as.bit as.bitwhich which

## operator: ! & | xor != ==

## querying: print length any all min max range sum summary

## bit access: length<- [ [<- [[ [[<-

## for more help type ?bit

## 
## Attaching package: 'bit'

## The following object is masked from 'package:base':
## 
##     xor

## Loading required package: ff

## Attaching package ff

## - getOption("fftempdir")=="C:/Users/carlos/AppData/Local/Temp/RtmpsXxt8h"

## - getOption("ffextension")=="ff"

## - getOption("ffdrop")==TRUE

## - getOption("fffinonexit")==TRUE

## - getOption("ffpagesize")==65536

## - getOption("ffcaching")=="mmnoflush"  -- consider "ffeachflush" if your system stalls on large writes

## - getOption("ffbatchbytes")==34309406.72 -- consider a different value for tuning your system

## - getOption("ffmaxbytes")==1715470336 -- consider a different value for tuning your system

## 
## Attaching package: 'ff'

## The following objects are masked from 'package:bit':
## 
##     clone, clone.default, clone.list

## The following objects are masked from 'package:utils':
## 
##     write.csv, write.csv2

## The following objects are masked from 'package:base':
## 
##     is.factor, is.ordered

## Loading required package: ffbase

## 
## Attaching package: 'ffbase'

## The following objects are masked from 'package:ff':
## 
##     [.ff, [.ffdf, [<-.ff, [<-.ffdf

## The following objects are masked from 'package:base':
## 
##     %in%, table

## Standard deviations are plot by default. See argument numMode of plot.tabplot.

# A tableplot is created simply by calling tableplot().
# By default every variable of the data set is shown;
# the select argument restricts which variables are plotted.

data(diamonds)

# Seed the random number generator so the randomly placed NAs below (and any
# later random draws in this session) are the same on every run.
set.seed(42)

## add some NA's
# price becomes NA for every diamond whose cut is "Ideal" ...
is.na(diamonds$price) <- diamonds$cut == "Ideal"
# ... and cut becomes NA wherever a uniform draw exceeds 0.8.
is.na(diamonds$cut) <- runif(nrow(diamonds)) > 0.8

# Plot all variables with the default settings.
tableplot(diamonds)

# Five selected columns, with the rows sorted on price.
# The missing values added earlier end up at the bottom and are drawn
# (by default) in bright red.
tableplot(
  diamonds,
  select = c(carat, price, cut, color, clarity),
  sortCol = price
)

# Zooming
# Setting from = 0 and to = 5 restricts the plot to the 5% most expensive
# diamonds.
tableplot(
  diamonds,
  select = c(carat, price, cut, color, clarity),
  sortCol = price,
  from = 0,
  to = 5
)

# Filtering

# The subset argument acts as a row filter. Here only premium-cut diamonds
# that cost less than $5000 are plotted.
tableplot(
  diamonds,
  subset = price < 5000 & cut == "Premium"
)

# Categorical variables

# pals assigns a colour palette to each categorical variable: a tabplot
# palette name (optionally with the starting colour in brackets) for cut and
# color, and an explicit colour vector for clarity.
tableplot(
  diamonds,
  pals = list(
    cut = "Set1(6)",
    color = "Set5",
    clarity = rainbow(8)
  )
)

# High cardinality data

# To show how tableplots handle categorical variables with many levels, the
# diamonds data set is extended with two factors derived from numeric columns
# through the convenience wrapper num2fac().

diamonds[["carat_class"]] <- num2fac(diamonds[["carat"]], n = 20)

## Loading required namespace: classInt

diamonds[["price_class"]] <- num2fac(diamonds[["price"]], n = 100)

## Warning in classInt::classIntervals(num, n = n, style = method): var has
## missing values, omitted in finding classes

# Plot the two numeric columns next to their derived factors.
tableplot(
  diamonds,
  select = c(carat, price, carat_class, price_class)
)

# Preprocessing big data

# Build a large data set by repeating every row of diamonds ten times.
large_diamonds <- diamonds[rep(seq_len(nrow(diamonds)), times = 10), ]

# Time the one-off preparation step; its result p is reused below.
system.time(p <- tablePrepare(large_diamonds))

##    user  system elapsed 
##    2.10    0.57    3.00

# Time two tableplots computed from the prepared object (nothing is drawn).
system.time(tableplot(p, plot = FALSE))

##    user  system elapsed 
##    0.38    0.14    0.56

system.time(tableplot(p, sortCol = price, nBins = 200, plot = FALSE))

##    user  system elapsed 
##    0.46    0.14    0.68

# The preparation step takes a few seconds on a moderate desktop computer,
# but building a tableplot from the intermediate object p is then much faster
# than the direct approach on the raw data, timed here for comparison:

system.time(tableplot(large_diamonds, plot = FALSE))

##    user  system elapsed 
##    2.09    0.73    3.19

system.time(
  tableplot(large_diamonds, sortCol = price, nBins = 200, plot = FALSE)
)

##    user  system elapsed 
##    2.17    0.70    3.39

# Sampling

# Same prepared object, this time with sample = TRUE.
system.time(tableplot(p, sample = TRUE))

##    user  system elapsed 
##    0.78    0.37    1.23

# Compare tableplots (experimental)

# Two data sets can be compared, for instance two samples, two versions of a
# data set, or data sets from two different time periods.

# Normalised carat (carat divided by the largest carat), used below as
# sampling probabilities. Inside with() the column is referenced directly.
carat.norm <- with(diamonds, carat / max(carat))

# Draw two samples of 10000 rows with replacement. Rows with a high carat are
# more likely to land in exp.diamonds, rows with a low carat in chp.diamonds.
# seq_len() is used instead of 1:nrow() so an empty data set yields an empty
# index rather than c(1, 0).
exp.diamonds <- diamonds[
  sample(
    seq_len(nrow(diamonds)),
    size = 10000, prob = carat.norm, replace = TRUE
  ),
]
chp.diamonds <- diamonds[
  sample(
    seq_len(nrow(diamonds)),
    size = 10000, prob = 1 - carat.norm, replace = TRUE
  ),
]

# Build both tabplot objects without drawing them.
tp1 <- tableplot(exp.diamonds, plot = FALSE)
tp2 <- tableplot(chp.diamonds, plot = FALSE)

# Subtracting the two objects and plotting the result draws the comparison.
plot(tp2 - tp1)

## Independence of the compared tableplots assumed for the calculation of the standard deviations.

# The comparison tableplot shows, per bin, the difference in mean value for
# each numeric variable and, for each categorical variable, a two-sided
# stacked bar chart of the differences in fractions per category: left-side
# bars indicate that more items are contained in tp1, right-side bars indicate
# that more items are contained in tp2, and where no bar is drawn the fraction
# of items is unchanged. Relative mean values can be plotted with
# relative=TRUE.

# Note: tp1 and tp2 are tabplot objects (see below). Subtracting one from the
# other returns a tabplot_compare object.


# Build the tabplot object for the whole data set without drawing it.
tab <- tableplot(diamonds, plot = FALSE)

# Print an overview of the object: general settings plus one entry per variable.
summary(tab)

##               general               variable1      
##  dataset          :diamonds   name       :carat    
##  variables        :12         type       :numeric  
##  sortCol          :1          scale_init :auto     
##  decreasing       :TRUE       scale_final:lin      
##  from             :0%                              
##  to               :100%                            
##  objects.sample   :53940                           
##  objects.full.data:53940                           
##  bins             :100                             
##       variable2                variable3          
##  name      :cut           name      :color        
##  type      :categorical   type      :categorical  
##  categories:6             categories:8            
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##       variable4                 variable5             variable6      
##  name      :clarity       name       :depth     name       :table    
##  type      :categorical   type       :numeric   type       :numeric  
##  categories:9             scale_init :auto      scale_init :auto     
##                           scale_final:lin       scale_final:lin      
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##        variable7             variable8             variable9      
##  name       :price     name       :x         name       :y        
##  type       :numeric   type       :numeric   type       :numeric  
##  scale_init :auto      scale_init :auto      scale_init :auto     
##  scale_final:lin       scale_final:lin       scale_final:lin      
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##        variable10           variable11               variable12         
##  name       :z         name      :carat_class   name      :price_class  
##  type       :numeric   type      :categorical   type      :categorical  
##  scale_init :auto      categories:26            categories:51           
##  scale_final:lin                                                        
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##

# Draw the tabplot object built earlier.
plot(tab)

# Layout options: font sizes, number of legend lines and a title.
tableplot(
  diamonds,
  select = 1:7,
  fontsize = 14,
  legend.lines = 8,
  title = "Shine on you crazy Diamond",
  fontsize.title = 18
)

# Derive a modified object from tab (fewer columns, another palette for cut)
# and draw it.
tab2 <- tableChange(
  tab,
  select_string = c("carat", "price", "cut", "color", "clarity"),
  pals = list(cut = "Set1(2)")
)
plot(tab2)

# Save the tableplot to a file.
tableSave(
  tab,
  filename = "diamonds.png",
  width = 5,
  height = 3,
  fontsize = 6,
  legend.lines = 6
)

# tableplot.R
#
# carlos
#
# Tue Jun 28 10:27:36 2016