CS 424 Big Data Analytics

Session 9: Visualizations

Instructor: Dr. Bob Batzinger
Academic year: 2021/2022
Semester: 1

Begins June 2021

R Studio Interface

Starting up

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Challenge 1

Challenge 1 coding

diamonds %>% 
  ggplot(aes(x=carat, y=price, color=color)) +
  geom_point(cex=0.5)+
  facet_grid(rows=vars(cut)) +
   labs(title="Challenge 1")

Challenge 2

## Warning: Removed 32 rows containing non-finite values (stat_smooth).

Challenge 2 coding

diamonds %>% 
  ggplot(aes(x=carat, y=price, color=clarity)) +
  geom_smooth(method="loess",formula="y~x")+
  facet_grid(rows=vars(cut))+
  labs(title="Challenge 2")

Challenge 3

## Warning: Removed 10 rows containing missing values (geom_point).

Challenge 3 coding

diamonds %>% 
  ggplot(aes(x=table,y=carat,color=price))+
    scale_colour_gradient(low = "red",
       high = "green", space = "Lab", 
       na.value = "grey50",
       guide = "colourbar", 
       aesthetics = "colour") +
  geom_jitter(mapping=
       aes(x=table,y=carat,color=price),
              size=0.5,alpha=0.5) +
  xlim(45,75)+ 
  ylim(0,4)+
  facet_grid(rows=vars(cut)) + 
  labs("Challenge 3")
## Warning: Removed 11 rows containing missing values (geom_point).

Challenge 4

Strains vs Time

$$ \[\begin{matrix} yr & 2020 & 2020 & 2020 & 2020 & 2020 & 2021 & 2021 & 2021 & 2021 & 2021 & 2021 & 2021 \\ month & 08 & 09 & 10 & 11 & 12 & 01 & 02 & 03 & 04 & 05 & 06 & 07 \\ other & 100 & 100 & 95 & 90 & 90 & 70 & 50 & 20 & 2 & 2 & 1 & 1\\ alpha & 0 & 0 & 1 & 6 & 5 & 10 & 20 & 10 & 3 & 1 & 0 & 0\\ kappa & 0 & 0 & 2 & 3 & 3 & 15 & 20 & 20 & 5 & 2 & 1 & 0\\ delta & 0 & 0 & 2 & 1 & 2 & 5 & 10 & 50 & 90 & 95 & 98 & 99\\ \end{matrix}\]

$$

Challenge 4 coding

time = c("2020-08-01", "2020-09-01", "2020-10-01",
         "2020-11-01","2020-12-01", "2021-01-01",
         "2021-02-01", "2021-03-01","2021-04-01",
         "2021-05-01", "2021-06-01", "2021-07-01")
other = c(100,100,95,90,90,70, 50,20,2,2,1,1)
alpha = c(0,0,1,6,5,10, 20,10,3,1,0,0)
kappa = c(0,0,2,3,3,15, 20,20,5,2,1,0)
delta = c(0,0,2,1,2,5, 10,50,90,95,98,99)
coviddat = data.frame(as.Date(rep(time,4)),
        c(rep("other",12),rep("alpha",12),
          rep("kappa",12),rep("delta",12)),
        c(other, alpha,kappa,delta))
colnames(coviddat) = c("time","strain","percent")

Challenge 4 Tidy Dataframe

time strain percent
2020-08-01 other 100
2020-09-01 other 100
2020-10-01 other 95
2020-11-01 other 90
2020-12-01 other 90
2021-01-01 other 70
2021-02-01 other 50
2021-03-01 other 20
2021-04-01 other 2
2021-05-01 other 2
2021-06-01 other 1
2021-07-01 other 1
2020-08-01 alpha 0
2020-09-01 alpha 0
2020-10-01 alpha 1

Challenge 5

Challenge 5 coding

coviddat %>% ggplot() +
  geom_col(mapping=
       aes(x=time,y=percent,fill=strain))+
  labs(title="Challenge 5",
       subtitle="Emerging Strains of COVID-19")

Challenge 6

Challenge 6 coding

coviddat %>% 
  ggplot() +
  geom_area(mapping=
        aes(x=time,y=percent,fill=strain)) +
  labs(title="Challenge 6",
         subtitle="Emerging Strains of COVID-19")

Challenge 7

Challenge 7 coding

coviddat %>% 
  ggplot() +
  geom_area(mapping =
       aes(x=time,y=percent,fill=strain)) +
  coord_polar() +
  labs(title="Challenge 7",
       subtitle="Emerging Strains of COVID-19")

Regression

lm = lm(price ~ carat + color + clarity + cut, data=diamonds)
summary(lm)
## 
## Call:
## lm(formula = price ~ carat + color + clarity + cut, data = diamonds)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16813.5   -680.4   -197.6    466.4  10394.9 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) -3710.603     13.980 -265.414  < 2e-16 ***
## carat        8886.129     12.034  738.437  < 2e-16 ***
## color.L     -1910.288     17.712 -107.853  < 2e-16 ***
## color.Q      -627.954     16.121  -38.952  < 2e-16 ***
## color.C      -171.960     15.070  -11.410  < 2e-16 ***
## color^4        21.678     13.840    1.566    0.117    
## color^5       -85.943     13.076   -6.572 5.00e-11 ***
## color^6       -49.986     11.889   -4.205 2.62e-05 ***
## clarity.L    4217.535     30.831  136.794  < 2e-16 ***
## clarity.Q   -1832.406     28.827  -63.565  < 2e-16 ***
## clarity.C     923.273     24.679   37.411  < 2e-16 ***
## clarity^4    -361.995     19.739  -18.339  < 2e-16 ***
## clarity^5     216.616     16.109   13.447  < 2e-16 ***
## clarity^6       2.105     14.037    0.150    0.881    
## clarity^7     110.340     12.383    8.910  < 2e-16 ***
## cut.L         698.907     20.335   34.369  < 2e-16 ***
## cut.Q        -327.686     17.911  -18.295  < 2e-16 ***
## cut.C         180.565     15.557   11.607  < 2e-16 ***
## cut^4          -1.207     12.458   -0.097    0.923    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1157 on 53921 degrees of freedom
## Multiple R-squared:  0.9159, Adjusted R-squared:  0.9159 
## F-statistic: 3.264e+04 on 18 and 53921 DF,  p-value: < 2.2e-16

Contour

diamonds %>% 
  ggplot(aes(x=table,y=depth)) + 
  geom_density2d()

Filled Contour

diamonds %>% 
  ggplot(aes(x=table,y=depth)) + 
  geom_bin2d() +
  xlim(40,70)+
    ylim(55,70)

Rectanglar plot

Hexagon plotting

Color Filled Contour

diamonds %>% 
  ggplot(aes(x=table,y=depth)) + 
  geom_hex()+
  xlim(40,70)+
    ylim(55,70)+
    scale_color_gradient(low = "#99ff99", 
       high = "#003300", space = "Lab", 
       na.value = "grey50",
       guide = "colourbar", 
       aesthetics = "fill")
## Warning: Removed 51 rows containing non-finite values (stat_binhex).
## Warning: Removed 11 rows containing missing values (geom_hex).

Plotted

diamonds %>% 
  ggplot(aes(x=table,y=carat,color=price),xlim=c(0,1000)) + 
    scale_colour_gradient(low = "red",  high = "green", 
       space = "Lab", na.value = "grey50",
       guide = "colourbar", aesthetics = "colour") +
  geom_jitter(mapping=aes(x=table,y=carat,color=price),size=0.5,alpha=0.5) +
  xlim(45,75)+
  ylim(0,4)+
  facet_grid(rows=vars(cut))
## Warning: Removed 10 rows containing missing values (geom_point).

Preliminary analysis

library(GGally)

ggpairs(diamonds,columns=c(1,5:6,8:10),
    ggplot2::aes(colour=cut))

ggcorr(diamonds)

Pairs

Correlation

## Warning in ggcorr(diamonds): data in column(s) 'cut', 'color', 'clarity' are not
## numeric and were ignored

Correlations

Distribution

Evolution

Maps

Parts of a whole

Ranking =============

Evolution

Graphic Techniques