CS 424 Big Data Analytics

Session 10: Visual Analysis

Instructor: Dr. Bob Batzinger
Academic year: 2021/2022
Semester: 1

Begins June 2021

RStudio Interface

Starting up

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)

# Draw a random sample of roughly a quarter of the diamonds rows; the plots
# below use this smaller set.
#
# The row count must come from nrow(): `diamonds` is a tibble, so
# diamonds[, 1] is still a one-column tibble and length() of it is 1 (the
# number of columns). A single random logical would then be recycled over all
# rows, keeping either every row or none instead of a sample.
l <- nrow(diamonds)
# runif(l, 1, 5) > 4 is TRUE with probability 1/4, independently for each row.
diamonds2 <- diamonds[runif(l, 1, 5) > 4, ]

Regression

# Linear regression of price on carat, cut, color, clarity, depth and table,
# fitted on the full diamonds data (not the diamonds2 sample).
# NOTE(review): the fitted model is stored under the name `lm`, the same name
# as the fitting function; the function call still resolves correctly, but a
# distinct name would be clearer if nothing else relies on this one.
lm <- lm(
  price ~ carat + cut + color + clarity + depth + table,
  data = diamonds
)
# Print the coefficient table and fit statistics.
summary(lm)
## 
## Call:
## lm(formula = price ~ carat + cut + color + clarity + depth + 
##     table, data = diamonds)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16828.8   -678.7   -199.4    464.6  10341.2 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  -969.661    360.432   -2.690  0.00714 ** 
## carat        8895.194     12.079  736.390  < 2e-16 ***
## cut.L         615.613     22.985   26.784  < 2e-16 ***
## cut.Q        -326.638     18.390  -17.762  < 2e-16 ***
## cut.C         156.333     15.814    9.886  < 2e-16 ***
## cut^4         -15.975     12.648   -1.263  0.20657    
## color.L     -1908.010     17.718 -107.689  < 2e-16 ***
## color.Q      -626.087     16.112  -38.858  < 2e-16 ***
## color.C      -172.056     15.063  -11.423  < 2e-16 ***
## color^4        20.319     13.833    1.469  0.14187    
## color^5       -85.245     13.068   -6.523 6.95e-11 ***
## color^6       -50.085     11.881   -4.216 2.50e-05 ***
## clarity.L    4206.854     30.867  136.290  < 2e-16 ***
## clarity.Q   -1831.804     28.811  -63.580  < 2e-16 ***
## clarity.C     919.725     24.672   37.278  < 2e-16 ***
## clarity^4    -361.609     19.728  -18.330  < 2e-16 ***
## clarity^5     213.910     16.108   13.280  < 2e-16 ***
## clarity^6       2.986     14.030    0.213  0.83148    
## clarity^7     110.147     12.375    8.901  < 2e-16 ***
## depth         -21.024      4.079   -5.154 2.56e-07 ***
## table         -24.803      2.978   -8.329  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1156 on 53919 degrees of freedom
## Multiple R-squared:  0.9161, Adjusted R-squared:  0.916 
## F-statistic: 2.942e+04 on 20 and 53919 DF,  p-value: < 2.2e-16

Contour

# 2D density contour lines of depth against table for the diamonds2 sample.
ggplot(data = diamonds2, mapping = aes(x = table, y = depth)) +
  geom_density_2d()

Contour (Full set vs Sample)

Filled Contour

# Rectangular-bin counts of depth against table for the diamonds2 sample.
# The axis limits restrict the view to table 40-70 and depth 55-70.
ggplot(data = diamonds2, mapping = aes(x = table, y = depth)) +
  geom_bin2d() +
  xlim(40, 70) +
  ylim(55, 70)

Rectangular plot

Hexagon plotting

Color Filled Contour

# Hexagonal-bin counts of depth against table for the diamonds2 sample,
# shaded from light green (low) to dark green (high).
ggplot(data = diamonds2, mapping = aes(x = table, y = depth)) +
  geom_hex() +
  xlim(40, 70) +
  ylim(55, 70) +
  # The gradient is applied to the fill aesthetic of the hexagons.
  scale_fill_gradient(
    low = "#99ff99",
    high = "#003300",
    space = "Lab",
    na.value = "grey50",
    guide = "colourbar",
    aesthetics = "fill"
  )
## Warning: Removed 51 rows containing non-finite values (stat_binhex).
## Warning: Removed 11 rows containing missing values (geom_hex).

Plotted

# Jittered scatter of carat against table, coloured by price on a
# red-to-green gradient, with one facet row per cut.
#
# ggplot() has no `xlim` argument, so a limit passed there is silently
# ignored; the axis ranges are set only by xlim()/ylim() below. The aesthetic
# mapping is declared once in ggplot() and inherited by geom_jitter().
diamonds2 %>%
  ggplot(aes(x = table, y = carat, colour = price)) +
  scale_colour_gradient(
    low = "red", high = "green",
    space = "Lab", na.value = "grey50",
    guide = "colourbar", aesthetics = "colour"
  ) +
  # Small, semi-transparent points with random positional noise.
  geom_jitter(size = 0.5, alpha = 0.5) +
  xlim(45, 75) +
  ylim(0, 4) +
  facet_grid(rows = vars(cut))
## Warning: Removed 11 rows containing missing values (geom_point).

Preliminary analysis

library(GGally)

# Pairwise plot matrix of columns 1, 5-6 and 8-10 of diamonds2,
# with colour mapped to cut.
diamonds2 %>%
  ggpairs(
    mapping = ggplot2::aes(colour = cut),
    columns = c(1, 5:6, 8:10)
  )

# Correlation matrix plot for the same column subset (1, 5-6, 8-10) of diamonds2.
diamonds2[, c(1, 5:6, 8:10)] %>%
  ggcorr()

Pairs

Correlation

Correlations

Distribution

Evolution

Maps

Parts of a whole

Ranking

Evolution

Graphic Techniques