CS 424 Big Data Analytics

Session 7: Basics of Visualizations

Instructor: Dr. Bob Batzinger
Academic year: 2021/2022
Semester: 1

Begins June 2021

RStudio Interface

Starting up

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Load Data

# Load the Titanic passenger and staff records from the CSV one directory up.
titpass <- read_csv(file = "../titanicpass.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   lastname = col_character(),
##   fname = col_character(),
##   age = col_double(),
##   status = col_character(),
##   section = col_character()
## )
# Preview the first five rows of the table.
titpass %>% head(n = 5)
## # A tibble: 5 x 5
##   lastname fname                 age status section    
##   <chr>    <chr>               <dbl> <chr>  <chr>      
## 1 Abbing   Mr. Anthony            40 pass   thirdClass 
## 2 Abbott   Mr. Ernest Owen        21 staff  Victualling
## 3 Abbott   Mr. Eugene Joseph      14 pass   thirdClass 
## 4 Abbott   Mr. Rossmore Edward    16 pass   thirdClass 
## 5 Abbott*  Mrs. Rhoda Mary        39 pass   thirdClass

Create Graph frame

# Start from an empty plot frame bound to the data; no layers are drawn yet.
a <- ggplot(data = titpass)
a

Add objects to your plot

# Layer a box plot of age for each status onto the empty frame.
b <- a +
  geom_boxplot(
    mapping = aes(x = status, y = age),
    fill = "gray",
    color = "slategray"
  )
b

Add other objects

# Overlay the individual observations as small dots, binned along the y axis
# and stacked symmetrically around the centre of each box.
# NOTE: the variable name `c` shadows base::c(); it is kept because the
# multi-plot examples later in this handout refer to it.
c <- b +
  geom_dotplot(
    mapping = aes(x = status, y = age, color = status),
    binaxis = "y",
    stackdir = "center",
    dotsize = 0.12
  )
c
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Try other objects

  # Go back to the empty frame `a` and draw a violin plot filled by status.
  d <- a +
    geom_violin(mapping = aes(x = status, y = age, fill = status))
  d

Adjust the coordinates

# Flip the coordinate system so the violins lie horizontally.
e <- d +
  coord_flip()
e

Add labels

# Attach a title and a figure caption to the flipped plot.
f <- e +
  labs(
    title = "Age distribution",
    caption = "\nFig. 1. Ages of staff and passengers of the RMS Titanic"
  )
f

Add Annotation

# Place a text marker ("S" / "P") on each group at the given age position.
g <- f +
  annotate(
    geom = "text",
    x = c("staff", "pass"),
    y = c(31, 23),
    label = c("S", "P")
  )
g

Combined into a single workflow of commands

# The same figure built in a single expression: violin layer, flipped axes,
# title and caption, then the two text annotations.
ggplot(data = titpass) +
  geom_violin(mapping = aes(x = status, y = age, fill = status)) +
  coord_flip() +
  labs(
    title = "Age distribution",
    caption = "\nFig. 1. Ages of staff and passengers of the RMS Titanic"
  ) +
  annotate(
    geom = "text",
    x = c("staff", "pass"),
    y = c(31, 23),
    label = c("S", "P")
  )

Combining graphs

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Show the four intermediate plots side by side in a 2 x 2 grid.
grid.arrange(a, b, c, d, nrow = 2, ncol = 2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Labelling multiplots

# Same 2 x 2 grid, with a per-panel title plus a heading above and a caption
# below the whole arrangement.
grid.arrange(
  a + labs(title = "Step 1"),
  b + labs(title = "Step 2"),
  c + labs(title = "Step 3"),
  d + labs(title = "Step 4"),
  nrow = 2,
  ncol = 2,
  top = "Building graphics incrementally",
  bottom = "Fig. 2: Development of ggplot graphic"
)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Other examples

# Price against carat, coloured by clarity, with one facet row per cut.
ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, color = clarity)
) +
  geom_point() +
  facet_grid(rows = vars(cut))

Smoothing

# Same aesthetics as the scatter plot, but drawing a loess-smoothed trend per
# clarity level instead of the raw points.
ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, color = clarity)
) +
  geom_smooth(method = "loess") +
  facet_grid(rows = vars(cut))
## `geom_smooth()` using formula 'y ~ x'

Color

# Price against carat again, this time coloured by the diamond's `color`
# column, faceted by cut.
ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, color = color)
) +
  geom_point() +
  facet_grid(rows = vars(cut))

Regression

# Fit a linear model of price on carat, color, clarity and cut. The `- 1`
# term removes the intercept from the formula.
# The fit is stored as `price_model` rather than `lm` so that the object does
# not mask the name of the stats::lm() function it was created with.
price_model <- lm(price ~ carat + color + clarity + cut - 1, data = diamonds)
summary(price_model)
## 
## Call:
## lm(formula = price ~ carat + color + clarity + cut - 1, data = diamonds)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16813.5   -680.4   -197.6    466.4  10394.9 
## 
## Coefficients:
##            Estimate Std. Error  t value Pr(>|t|)    
## carat      8886.129     12.034  738.437   <2e-16 ***
## colorD    -2886.973     18.211 -158.526   <2e-16 ***
## colorE    -3098.655     16.328 -189.776   <2e-16 ***
## colorF    -3190.283     16.691 -191.138   <2e-16 ***
## colorG    -3393.172     16.448 -206.299   <2e-16 ***
## colorH    -3865.670     18.370 -210.438   <2e-16 ***
## colorI    -4327.275     21.457 -201.670   <2e-16 ***
## colorJ    -5212.195     26.973 -193.239   <2e-16 ***
## clarity.L  4217.535     30.831  136.794   <2e-16 ***
## clarity.Q -1832.406     28.827  -63.565   <2e-16 ***
## clarity.C   923.273     24.679   37.411   <2e-16 ***
## clarity^4  -361.995     19.739  -18.339   <2e-16 ***
## clarity^5   216.616     16.109   13.447   <2e-16 ***
## clarity^6     2.105     14.037    0.150    0.881    
## clarity^7   110.340     12.383    8.910   <2e-16 ***
## cut.L       698.907     20.335   34.369   <2e-16 ***
## cut.Q      -327.686     17.911  -18.295   <2e-16 ***
## cut.C       180.565     15.557   11.607   <2e-16 ***
## cut^4        -1.207     12.458   -0.097    0.923    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1157 on 53921 degrees of freedom
## Multiple R-squared:  0.9574, Adjusted R-squared:  0.9574 
## F-statistic: 6.373e+04 on 19 and 53921 DF,  p-value: < 2.2e-16

Contour

# 2-D density contours of depth against table, one facet row per cut.
ggplot(data = diamonds, mapping = aes(x = table, y = depth)) +
  geom_density2d() +
  facet_grid(rows = vars(cut))

Jittered scatter plot with a colour gradient

# Jittered scatter of carat against table, coloured by price on a red-to-green
# gradient, with one facet row per cut.
# The aesthetic mapping is declared once in ggplot() and inherited by
# geom_jitter(). No `xlim` argument is passed to ggplot(): it is not a
# ggplot() parameter and would be silently ignored; the x range is set by the
# xlim() call further down.
diamonds %>%
  ggplot(aes(x = table, y = carat, color = price)) +
  scale_colour_gradient(
    low = "red",
    high = "green",
    space = "Lab",
    na.value = "grey50",
    guide = "colourbar",
    aesthetics = "colour"
  ) +
  geom_jitter(size = 0.5, alpha = 0.5) +
  # Restrict the x scale to 45-75; rows outside that range are removed with a
  # warning rather than merely hidden.
  xlim(45, 75) +
  facet_grid(rows = vars(cut))
## Warning: Removed 5 rows containing missing values (geom_point).

```