Data Visualization with R, Models and relationship between variables

Dhafer Malouche

http://dhafermalouche.net

Outline

2 Qualitative variables

Contingency table

> require(graphics)
> M <- as.table( cbind( c( 18,28,14), c( 20,51,28) , c( 12,25,9))) 
> dimnames( M) <- list( Husband = c(" Tall", "Medium", "Short"), Wife = c(" Tall"," Medium", "Short"))
> M
        Wife
Husband   Tall  Medium Short
   Tall     18      20    12
  Medium    28      51    25
  Short     14      28     9

Mosaic plot

> mosaicplot( M, col = c(" green", "red"),main = "Husband x Wife")

Mosaic plot

Mosaic plot

> library(vcd)
> mosaic(M, shade=T,main = "Husband x Wife")

Mosaic plot

Radial or Radar charts

Method 1

We use the following R function

> source('CreatRadialPlot.R')
> CreateRadialPlot
function (plot.data, axis.labels = colnames(plot.data)[-1], grid.min = -0.5, 
    grid.mid = 0, grid.max = 0.5, centre.y = grid.min - ((1/9) * 
        (grid.max - grid.min)), plot.extent.x.sf = 1.2, plot.extent.y.sf = 1.2, 
    x.centre.range = 0.02 * (grid.max - centre.y), label.centre.y = FALSE, 
    grid.line.width = 0.5, gridline.min.linetype = "longdash", 
    gridline.mid.linetype = "longdash", gridline.max.linetype = "longdash", 
    gridline.min.colour = "grey", gridline.mid.colour = "blue", 
    gridline.max.colour = "grey", grid.label.size = 4, gridline.label.offset = -0.02 * 
        (grid.max - centre.y), label.gridline.min = TRUE, axis.label.offset = 1.15, 
    axis.label.size = 3, axis.line.colour = "grey", group.line.width = 1, 
    group.point.size = 4, background.circle.colour = "yellow", 
    background.circle.transparency = 0.2, plot.legend = if (nrow(plot.data) > 
        1) TRUE else FALSE, legend.title = "Cluster", legend.text.size = grid.label.size) 
{
    var.names <- colnames(plot.data)[-1]
    plot.extent.x = (grid.max + abs(centre.y)) * plot.extent.x.sf
    plot.extent.y = (grid.max + abs(centre.y)) * plot.extent.y.sf
    if (length(axis.labels) != ncol(plot.data) - 1) 
        return("Error: 'axis.labels' contains the wrong number of axis labels")
    if (min(plot.data[, -1]) < centre.y) 
        return("Error: plot.data' contains value(s) < centre.y")
    if (max(plot.data[, -1]) > grid.max) 
        return("Error: 'plot.data' contains value(s) > grid.max")
    CalculateGroupPath <- function(df) {
        path <- as.factor(as.character(df[, 1]))
        angles = seq(from = 0, to = 2 * pi, by = (2 * pi)/(ncol(df) - 
            1))
        graphData = data.frame(seg = "", x = 0, y = 0)
        graphData = graphData[-1, ]
        for (i in levels(path)) {
            pathData = subset(df, df[, 1] == i)
            for (j in c(2:ncol(df))) {
                graphData = rbind(graphData, data.frame(group = i, 
                  x = pathData[, j] * sin(angles[j - 1]), y = pathData[, 
                    j] * cos(angles[j - 1])))
            }
            graphData = rbind(graphData, data.frame(group = i, 
                x = pathData[, 2] * sin(angles[1]), y = pathData[, 
                  2] * cos(angles[1])))
        }
        colnames(graphData)[1] <- colnames(df)[1]
        graphData
    }
    CaclulateAxisPath = function(var.names, min, max) {
        n.vars <- length(var.names)
        angles <- seq(from = 0, to = 2 * pi, by = (2 * pi)/n.vars)
        min.x <- min * sin(angles)
        min.y <- min * cos(angles)
        max.x <- max * sin(angles)
        max.y <- max * cos(angles)
        axisData <- NULL
        for (i in 1:n.vars) {
            a <- c(i, min.x[i], min.y[i])
            b <- c(i, max.x[i], max.y[i])
            axisData <- rbind(axisData, a, b)
        }
        colnames(axisData) <- c("axis.no", "x", "y")
        rownames(axisData) <- seq(1:nrow(axisData))
        as.data.frame(axisData)
    }
    funcCircleCoords <- function(center = c(0, 0), r = 1, npoints = 100) {
        tt <- seq(0, 2 * pi, length.out = npoints)
        xx <- center[1] + r * cos(tt)
        yy <- center[2] + r * sin(tt)
        return(data.frame(x = xx, y = yy))
    }
    plot.data.offset <- plot.data
    plot.data.offset[, 2:ncol(plot.data)] <- plot.data[, 2:ncol(plot.data)] + 
        abs(centre.y)
    group <- NULL
    group$path <- CalculateGroupPath(plot.data.offset)
    axis <- NULL
    axis$path <- CaclulateAxisPath(var.names, grid.min + abs(centre.y), 
        grid.max + abs(centre.y))
    axis$label <- data.frame(text = axis.labels, x = NA, y = NA)
    n.vars <- length(var.names)
    angles = seq(from = 0, to = 2 * pi, by = (2 * pi)/n.vars)
    axis$label$x <- sapply(1:n.vars, function(i, x) {
        ((grid.max + abs(centre.y)) * axis.label.offset) * sin(angles[i])
    })
    axis$label$y <- sapply(1:n.vars, function(i, x) {
        ((grid.max + abs(centre.y)) * axis.label.offset) * cos(angles[i])
    })
    gridline <- NULL
    gridline$min$path <- funcCircleCoords(c(0, 0), grid.min + 
        abs(centre.y), npoints = 360)
    gridline$mid$path <- funcCircleCoords(c(0, 0), grid.mid + 
        abs(centre.y), npoints = 360)
    gridline$max$path <- funcCircleCoords(c(0, 0), grid.max + 
        abs(centre.y), npoints = 360)
    gridline$min$label <- data.frame(x = gridline.label.offset, 
        y = grid.min + abs(centre.y), text = as.character(grid.min))
    gridline$max$label <- data.frame(x = gridline.label.offset, 
        y = grid.max + abs(centre.y), text = as.character(grid.max))
    gridline$mid$label <- data.frame(x = gridline.label.offset, 
        y = grid.mid + abs(centre.y), text = as.character(grid.mid))
    theme_clear <- theme_bw() + theme(legend.position = "bottom", 
        axis.text.y = element_blank(), axis.text.x = element_blank(), 
        axis.ticks = element_blank(), panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), panel.border = element_blank(), 
        legend.key = element_rect(linetype = "blank"))
    if (plot.legend == FALSE) 
        theme_clear <- theme_clear + theme(legend.position = "none")
    base <- ggplot(axis$label) + xlab(NULL) + ylab(NULL) + coord_equal() + 
        geom_text(data = subset(axis$label, axis$label$x < (-x.centre.range)), 
            aes(x = x, y = y, label = text), size = axis.label.size, 
            hjust = 1) + scale_x_continuous(limits = c(-plot.extent.x, 
        plot.extent.x)) + scale_y_continuous(limits = c(-plot.extent.y, 
        plot.extent.y))
    base <- base + geom_text(data = subset(axis$label, abs(axis$label$x) <= 
        x.centre.range), aes(x = x, y = y, label = text), size = axis.label.size, 
        hjust = 0.5)
    base <- base + geom_text(data = subset(axis$label, axis$label$x > 
        x.centre.range), aes(x = x, y = y, label = text), size = axis.label.size, 
        hjust = 0)
    base <- base + theme_clear
    base <- base + geom_polygon(data = gridline$max$path, aes(x, 
        y), fill = background.circle.colour, alpha = background.circle.transparency)
    base <- base + geom_path(data = axis$path, aes(x = x, y = y, 
        group = axis.no), colour = axis.line.colour)
    base <- base + geom_path(data = group$path, aes(x = x, y = y, 
        group = group, colour = group), size = group.line.width)
    base <- base + geom_point(data = group$path, aes(x = x, y = y, 
        group = group, colour = group), size = group.point.size)
    if (plot.legend == TRUE) 
        base <- base + labs(colour = legend.title, size = legend.text.size)
    base <- base + geom_path(data = gridline$min$path, aes(x = x, 
        y = y), lty = gridline.min.linetype, colour = gridline.min.colour, 
        size = grid.line.width)
    base <- base + geom_path(data = gridline$mid$path, aes(x = x, 
        y = y), lty = gridline.mid.linetype, colour = gridline.mid.colour, 
        size = grid.line.width)
    base <- base + geom_path(data = gridline$max$path, aes(x = x, 
        y = y), lty = gridline.max.linetype, colour = gridline.max.colour, 
        size = grid.line.width)
    if (label.gridline.min == TRUE) {
        base <- base + geom_text(aes(x = x, y = y, label = text), 
            data = gridline$min$label, fontface = "bold", size = grid.label.size, 
            hjust = 1)
    }
    base <- base + geom_text(aes(x = x, y = y, label = text), 
        data = gridline$mid$label, fontface = "bold", size = grid.label.size, 
        hjust = 1)
    base <- base + geom_text(aes(x = x, y = y, label = text), 
        data = gridline$max$label, fontface = "bold", size = grid.label.size, 
        hjust = 1)
    if (label.centre.y == TRUE) {
        centre.y.label <- data.frame(x = 0, y = 0, text = as.character(centre.y))
        base <- base + geom_text(aes(x = x, y = y, label = text), 
            data = centre.y.label, fontface = "bold", size = grid.label.size, 
            hjust = 0.5)
    }
    return(base)
}

Example

School dropout in Tunisia

> library(ggplot2)
>  source('CreatRadialPlot.R')
> df <- read.csv("drop_out_school_tunisia.csv")
> df
        X  group   bizerte  siliana monastir   mahdia     tunis     sfax  National
1 xFemale Female  7.575758 5.952381 11.83432 6.569343 12.328767 3.960396  8.253968
2   xmale   Male 14.285714 6.363636 23.00000 6.206897  6.024096 5.343511 11.473272
3    xAll    All 11.585366 6.185567 17.88618 6.382979  8.974359 4.741379 10.021475
> df <- df[,-1]

Example

> CreateRadialPlot(df,grid.label.size = 5,
+                  axis.label.size = 4,group.line.width = 2,
+                  plot.extent.x.sf = 1.5,
+                  background.circle.colour = 'gray', 
+                  grid.max = 26,
+                  grid.mid = round(df[3,8],1),
+                  grid.min = 4.5,
+                  axis.line.colour = 'black',
+                  legend.title = '')

Example

Method 2 with fmsb package

> library(fmsb)
>  
> # Create data: note in High school for several students
> set.seed(99)
> data=as.data.frame(matrix( sample( 0:20 , 15 , replace=F) , ncol=5))
> colnames(data)=c("math" , "english" , "biology" ,
+                  "music" , "R-coding" )
> rownames(data)=paste("mister" , letters[1:3] , sep="-")
> # We add 2 lines to the dataframe: the max and min of each 
> # topic to show on the plot!
> data=rbind(rep(20,5) , rep(0,5) , data)
> data
         math english biology music R-coding
1          20      20      20    20       20
2           0       0       0     0        0
mister-a   12      17      10    19        1
mister-b    2       9       4     6       16
mister-c   13      15      18     5       20

Radar

> colors_border=c( rgb(0.2,0.5,0.5,0.9), 
+                  rgb(0.8,0.2,0.5,0.9) , 
+                  rgb(0.7,0.5,0.1,0.9) )
> colors_in=c( rgb(0.2,0.5,0.5,0.4), 
+              rgb(0.8,0.2,0.5,0.4) , 
+              rgb(0.7,0.5,0.1,0.4) )
> radarchart( data  , axistype=1 , 
+     #custom polygon
+     pcol=colors_border , pfcol=colors_in , plwd=4 , plty=1,
+     #custom the grid
+     cglcol="grey", cglty=1, axislabcol="grey",
+     caxislabels=seq(0,20,5), cglwd=0.8,
+     #custom labels
+     vlcex=0.8 
+     )
> legend(x=0.7, y=1, 
+        legend = rownames(data[-c(1,2),]),
+        bty = "n", pch=20 , 
+        col=colors_in , text.col = "grey", cex=1.2, pt.cex=3)

Radar

googleVis package

Preparing data

> library(googleVis)
> op <- options(gvis.plot.tag="chart")
> df1=t(df[,-1])
> df1=cbind.data.frame(colnames(df[,-1]),df1)
> 
> colnames(df1)=c("Gouvernorat","Female","Male","All")
> df1[,-1]=round(df1[,-1],2)
> df1
         Gouvernorat Female  Male   All
bizerte      bizerte   7.58 14.29 11.59
siliana      siliana   5.95  6.36  6.19
monastir    monastir  11.83 23.00 17.89
mahdia        mahdia   6.57  6.21  6.38
tunis          tunis  12.33  6.02  8.97
sfax            sfax   3.96  5.34  4.74
National    National   8.25 11.47 10.02

Preparing data

> Bar <- gvisBarChart(df1,xvar="Gouvernorat",
+                     yvar=c("Female","Male","All"),
+                     options=list(width=1250, height=700,
+                                  title="Drop out School Rate",
+                                  titleTextStyle="{color:'red',fontName:'Courier',fontSize:16}",
+                                  bar="{groupWidth:'100%'}",
+                                  hAxis="{format:'#,#%'}"))
> plot(Bar)

Preparing data

With some options

> Bar <- gvisLineChart(df1[,-4],xvar="Gouvernorat",
+                      options=list(width=1250, height=700,
+                                   title="Drop out School Rate",
+                                   titleTextStyle="{color:'red',fontName:'Courier',fontSize:16}",
+                                   bar="{groupWidth:'100%'}",
+                                   vAxis="{format:'#,#%'}"))
> plot(Bar)

With some options

2 Quantitative variables, corrplot package

With circles

> library(corrplot)
> data(mtcars)
> head(mtcars)
> M <- cor(mtcars)
> corrplot(M, method = "circle")

With circles

With squares

> corrplot(M, method = "square")

With squares

With ellipses

> corrplot(M, method = "ellipse")

With ellipses

With numbers

> corrplot(M, method = "number")

With numbers

With pies

> corrplot(M, method = "pie")

With pies

Only upper matrix

> corrplot(M, type = "upper")

Only upper matrix

Ellipses and numbers

> corrplot.mixed(M, lower = "ellipse", upper = "number")

Ellipses and numbers

Reordering variables

Character, the ordering method of the correlation matrix.

h-clust

> corrplot(M, order = "hclust")

h-clust

Showing clusters with rectangles

> corrplot(M, order = "hclust",addrect = 3)

Showing clusters with rectangles

Customizing the plot

Colors

> mycol <- colorRampPalette(c("red", "white", "blue"))
> corrplot(M, order = "hclust",addrect = 2,col=mycol(50))

Colors

Background

> wb <- c("white", "black")
> corrplot(M, order = "hclust",
+          addrect = 2, 
+          col = wb, bg = "gold2")

Background

An R code for Independence hypothesis testing

> cor.mtest <- function(mat, conf.level = 0.95) {
+     mat <- as.matrix(mat)
+     n <- ncol(mat)
+     p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n)
+     diag(p.mat) <- 0
+     diag(lowCI.mat) <- diag(uppCI.mat) <- 1
+     for (i in 1:(n - 1)) {
+         for (j in (i + 1):n) {
+             tmp <- cor.test(mat[, i], mat[, j], conf.level = conf.level)
+             p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
+             lowCI.mat[i, j] <- lowCI.mat[j, i] <- tmp$conf.int[1]
+             uppCI.mat[i, j] <- uppCI.mat[j, i] <- tmp$conf.int[2]
+         }
+     }
+     return(list(p.mat, lowCI.mat, uppCI.mat))
+ }

Independence hypothesis testing

> res1 <- cor.mtest(mtcars, 0.95)
> res1[[1]][1:3,1:3]
             [,1]         [,2]         [,3]
[1,] 0.000000e+00 6.112687e-10 9.380327e-10
[2,] 6.112687e-10 0.000000e+00 1.802838e-12
[3,] 9.380327e-10 1.802838e-12 0.000000e+00
> res1[[2]][1:3,1:3]
           [,1]       [,2]       [,3]
[1,]  1.0000000 -0.9257694 -0.9233594
[2,] -0.9257694  1.0000000  0.8072442
[3,] -0.9233594  0.8072442  1.0000000

Adding p-values

> corrplot(M, p.mat = res1[[1]], sig.level = 0.1)

Non-significant independence correlations with a X

Or

> corrplot(M, p.mat = res1[[1]], sig.level = 0.01)

Or

Writing p-values

> corrplot(M, p.mat = res1[[1]], sig.level = 0.01,insig = "p-value")

Writing p-values

Displaying white squares instead of p-values

> corrplot(M, p.mat = res1[[1]], sig.level = 0.01,insig = "blank")

Displaying squares instead of p-values

Use xtable R package to display nice correlation table in html format

> library(xtable)
> mcor<-round(cor(mtcars),2)
> upper<-mcor
> upper[upper.tri(mcor)]<-""
> upper<-as.data.frame(upper)

Use xtable R package to display nice correlation table in html format

> print(xtable(upper), type="html")

Use xtable R package to display nice correlation table in html format

mpg cyl disp hp drat wt qsec vs am gear carb
mpg 1
cyl -0.85 1
disp -0.85 0.9 1
hp -0.78 0.83 0.79 1
drat 0.68 -0.7 -0.71 -0.45 1
wt -0.87 0.78 0.89 0.66 -0.71 1
qsec 0.42 -0.59 -0.43 -0.71 0.09 -0.17 1
vs 0.66 -0.81 -0.71 -0.72 0.44 -0.55 0.74 1
am 0.6 -0.52 -0.59 -0.24 0.71 -0.69 -0.23 0.17 1
gear 0.48 -0.49 -0.56 -0.13 0.7 -0.58 -0.21 0.21 0.79 1
carb -0.55 0.53 0.39 0.75 -0.09 0.43 -0.66 -0.57 0.06 0.27 1

Combine matrix of correlation coefficients and significance levels

We use corstar function

> # x is a matrix containing the data
> # method : correlation method. "pearson"" or "spearman"" is supported
> # removeTriangle : remove upper or lower triangle
> # results :  if "html" or "latex"
>   # the results will be displayed in html or latex format
> corstars <-function(x, method=c("pearson", "spearman"), 
+                     removeTriangle=c("upper", "lower"),
+                      result=c("none", "html", "latex")){
+     #Compute correlation matrix
+     require(Hmisc)
+     x <- as.matrix(x)
+     correlation_matrix<-rcorr(x, type=method[1])
+     R <- correlation_matrix$r # Matrix of correlation coeficients
+     p <- correlation_matrix$P # Matrix of p-value 
+     
+     # Define notions for significance levels; spacing is important.
+     mystars <- ifelse(p < .0001, "****", ifelse(p < .001, "*** ", ifelse(p < .01, "**  ", ifelse(p < .05, "*   ", "    "))))
+     
+     # trunctuate the correlation matrix to two decimal
+     R <- format(round(cbind(rep(-1.11, ncol(x)), R), 2))[,-1]
+     
+     # build a new matrix that includes the correlations with their apropriate stars
+     Rnew <- matrix(paste(R, mystars, sep=""), ncol=ncol(x))
+     diag(Rnew) <- paste(diag(R), " ", sep="")
+     rownames(Rnew) <- colnames(x)
+     colnames(Rnew) <- paste(colnames(x), "", sep="")
+     
+     # remove upper triangle of correlation matrix
+     if(removeTriangle[1]=="upper"){
+       Rnew <- as.matrix(Rnew)
+       Rnew[upper.tri(Rnew, diag = TRUE)] <- ""
+       Rnew <- as.data.frame(Rnew)
+     }
+     
+     # remove lower triangle of correlation matrix
+     else if(removeTriangle[1]=="lower"){
+       Rnew <- as.matrix(Rnew)
+       Rnew[lower.tri(Rnew, diag = TRUE)] <- ""
+       Rnew <- as.data.frame(Rnew)
+     }
+     
+     # remove last column and return the correlation matrix
+     Rnew <- cbind(Rnew[1:length(Rnew)-1])
+     if (result[1]=="none") return(Rnew)
+     else{
+       if(result[1]=="html") print(xtable(Rnew), type="html")
+       else print(xtable(Rnew), type="latex") 
+     }
+ } 
> 

Combine matrix of correlation coefficients and significance levels

> corstars(mtcars[,1:7], 
+          result="html")

Combine matrix of correlation coefficients and significance levels

mpg cyl disp hp drat wt
mpg
cyl -0.85****
disp -0.85**** 0.90****
hp -0.78**** 0.83**** 0.79****
drat 0.68**** -0.70**** -0.71**** -0.45**
wt -0.87**** 0.78**** 0.89**** 0.66**** -0.71****
qsec 0.42* -0.59*** -0.43* -0.71**** 0.09 -0.17

My Shiny app: Visulazing Correlation matrix

https://dhafer.shinyapps.io/CorrMatrixViz

ggfortify package, visualizing models

Time series

> library(ggfortify)
> head(AirPassengers)
[1] 112 118 132 129 121 135
> class(AirPassengers)
[1] "ts"
> autoplot(AirPassengers)

Time serie

Times series, customizing

> p <- autoplot(AirPassengers)
> p + ggtitle('AirPassengers') + xlab('Year') + ylab('Passengers')

Times series, customizing

Clustering

> set.seed(1)
> head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
> p <- autoplot(kmeans(iris[-5], 3), data = iris)
> p

Clustering

PCA

> df <- iris[c(1, 2, 3, 4)]
> autoplot(prcomp(df))

PCA

PCA, by showing groups! Convexes

> autoplot(prcomp(df), 
+          data = iris, 
+          colour = 'Species', 
+          shape = FALSE, 
+          label.size = 3, frame=T)

PCA, by showing groups! Convexes

Biplot for a PCA

> autoplot(prcomp(df), data = iris, colour = 'Species',
+          loadings = TRUE, loadings.colour = 'blue',
+          loadings.label = TRUE, loadings.label.size = 3)

Biplot for a PCA

Regression diagnostic

> m <- lm(Petal.Width ~ Petal.Length, data = iris)
> autoplot(m, which = 1:6, colour = 'dodgerblue3',
+          smooth.colour = 'black', 
+          smooth.linetype = 'dashed',
+          ad.colour = 'blue',
+          label.size = 3, label.n = 5, label.colour = 'blue',
+          ncol = 3)

Regression diagnostic

Local Fisher Discriminant Analysis

> library(lfda)
> model <- lfda(x = iris[-5], y = iris[, 5], r = 3, metric="plain")
> autoplot(model, 
+          data = iris, 
+          frame = TRUE, 
+          frame.colour = 'Species')

Local Fisher Discriminant Analysis

tabplot package

Data

> require(ggplot2)
> data(diamonds)
> head(diamonds)
# A tibble: 6 x 10
  carat cut       color clarity depth table price     x     y     z
  <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.230 Ideal     E     SI2      61.5  55.0   326  3.95  3.98  2.43
2 0.210 Premium   E     SI1      59.8  61.0   326  3.89  3.84  2.31
3 0.230 Good      E     VS1      56.9  65.0   327  4.05  4.07  2.31
4 0.290 Premium   I     VS2      62.4  58.0   334  4.20  4.23  2.63
5 0.310 Good      J     SI2      63.3  58.0   335  4.34  4.35  2.75
6 0.240 Very Good J     VVS2     62.8  57.0   336  3.94  3.96  2.48
> summary(diamonds)
     carat               cut        color        clarity          depth           table           price             x                y                z         
 Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
 1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720   1st Qu.: 2.910  
 Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80   Median :57.00   Median : 2401   Median : 5.700   Median : 5.710   Median : 3.530  
 Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735   Mean   : 3.539  
 3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540   3rd Qu.: 4.040  
 Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900   Max.   :31.800  
                                    J: 2808   (Other): 2531                                                                                                     

Exploring Data

> require(tabplot)
> tableplot(diamonds)

Exploring Data

Exploring Data, how it works?

> tableplot(diamonds, nBins=2,select =c(carat,color))

Exploring Data, how it works?

Exploring Data, how it works?

> x=tableplot(diamonds, nBins=2,select =c(carat,color),decreasing = T)
> names(x)
>  x$columns$carat$mean
>  z=sort(diamonds$carat,d=T)
>  dim(diamonds)
>  mean(z[1:26970])
>  mean(z[26971:53940])
>  x$columns$color$widths
>  y=diamonds$color[order(diamonds$carat,decreasing = T)]
>  prop.table(table(y[1:26940]))
>  prop.table(table(y[26971:53940]))

Exploring Data, how it works?

 [1] "dataset"    "select"     "subset"     "nBins"      "binSizes"   "sortCol"    "decreasing" "from"       "to"         "n"          "N"          "m"          "isNumber"   "rows"       "columns"    "numMode"   
[1] 1.1723063 0.4235732
[1] 53940    10
[1] 1.172306
[1] 0.4235732
           [,1]      [,2]      [,3]      [,4]      [,5]       [,6]       [,7] [,8]
[1,] 0.09492028 0.1389692 0.1665925 0.2012236 0.1852799 0.13388951 0.07912495    0
[2,] 0.15628476 0.2242862 0.1872080 0.2174638 0.1226177 0.06714868 0.02499073    0

         D          E          F          G          H          I          J 
0.09561990 0.13938382 0.16674091 0.20100223 0.18511507 0.13333333 0.07880475 

         D          E          F          G          H          I          J 
0.15550612 0.22376715 0.18694846 0.21757508 0.12298851 0.06781609 0.02539859 

Missing values

> # add some NA's
> diamonds2=diamonds
> diamonds2$price[which(diamonds2$cut == "Ideal")]<-NA
> diamonds2$cut[diamonds2$depth>65]=NA
> tableplot(diamonds2,colorNA = "black")

Missing values

Zooming on data,

> tableplot(diamonds, nBins=5, select = c(carat, price, cut, color, clarity), sortCol = price, 
+     from = 0, to = 5)

Zooming on data,

Filtering data

> tableplot(diamonds, subset = price < 5000 & cut == "Premium")

Filtering data

Change colors

> tableplot(diamonds, pals = list(cut="Set1(6)", color="Set5", clarity=rainbow(8)))

Change colors

Preprocessing of Large data

> # create large dataset
> large_diamonds <- diamonds[rep(seq.int(nrow(diamonds)), 10),]
> 
> system.time({
+     p <- tablePrepare(large_diamonds)
+ })
   user  system elapsed 
  1.287   0.758   2.301