From Planets to Zombies

Visualization methods in R

Rafael S. de Souza

4/28/2017

Information Visualization

Information visualization or information visualisation is the study of (interactive) visual representations of abstract data to reinforce human cognition.

A statistical computing environment

Some topics covered by R packages

Bayesian Inference Machine Learning Social Sciences
Computational Physics Medical Image Analysis Spatial Data
Cluster Analysis Multivariate Statistics Statistical Geneticss
Differential Equations Natural Language Survival Analysi
Econometrics Numerical Mathematic Time Series Analysis
Environmetrics Optimization Visualization
Environmetrics Pharmacokinetic Web Technologies
Extreme Value Analysis Phylogenetics Astronomy
Empirical Finance Probability Distributions
Functional Data Analysis Psychometric

Required packages for today’s lesson

require(ggplot2);require(reshape2);require(circlize);require(ggdendro);
require(PerformanceAnalytics);require(ggthemes);require(RColorBrewer);
require(psych)

If not available

install.packages(c("ggplot2","reshape2","circlize","ggdendro","ggthemes",
"RColorBrewer","reshape2","PerformanceAnalytics","psych"),dependencies = T)

Basic commands

1+1
## [1] 2
x <- 2
for (i in 1:5){
print(x+i)  
}
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
print("Life, the Universe and Everything")
## [1] "Life, the Universe and Everything"

Basic commands

x <- rnorm(5000) # Generate 5000 random numbers from a Normal distribution
hist(x,col="green") # Plot a histogram

Simple regression model

set.seed(1056)              # set seed to replicate example
nobs= 150                   # number of obs in model 
x1 <- runif(nobs,0,5)       # random uniform variable
mu <- 1 + 5 * x1            # linear predictor, xb
y <- rnorm(nobs, mu, sd=1)  # create y as adjusted random normal variate 
fit <- lm(y ~ x1)           # Normal Fit 
summary(fit) # Display results from the fit
Fitting linear model: y ~ x1
  Estimate Std. Error t value Pr(>|t|)
x1 5.025 0.05288 95.03 1.494e-134
(Intercept) 1.006 0.1541 6.528 9.971e-10

Display results-base R

xx <- seq(0,5,length=200)
ypred <- predict(fit,newdata=list(x1=xx),type="response")     # Prediction from the model 

plot(x1,y,pch=19,col="red")                                   # Plot regression line 
lines(xx,ypred,col='cyan',lwd=4,lty=2)

segments(x1,fitted(fit),x1,y,lwd=2,col="gray")                # Add the residuals

Display results-ggplot

dat <- data.frame(x1,y)
ggplot(data=dat,aes(x =x1, y = y)) + 
  geom_point() +
  stat_smooth(method = "lm", col = "red")

Data-mining-Exoplanets catalogue

Data from http://www.openexoplanetcatalogue.com

Read and display data in tabular format

d <- read.csv("data/exoplanets.csv",header = T)
d <- d[complete.cases(d),]
head(d)

Visualization-Scatter plot

p1 <- ggplot(data = d, aes(x = Period, y = star_mass, group=Discovery_method,
  fill = Discovery_method, text=name, size=Radius)) + geom_point(alpha=0.5) +
  scale_x_log10() + ylab("Stellar Mass") + xlab("Period")+
  theme(legend.position = "none")
p1

Visualization-Scatter plot

p2 <- ggplot(data = d, aes(x = star_mass, y = star_radius, group=Discovery_method, text=name,size=Radius,colour=Discovery_method)) +theme_bw()+
geom_point(alpha=0.6) +scale_x_log10() + xlab("Stellar Mass") + ylab("Stellar Radius")+
theme(legend.position = "right")+scale_size(range=c(2,10))+ scale_color_tableau(name="")
p2

Visualization-Linear Fit

p3 <- ggplot(data = d, aes(x = star_mass,y =star_radius,size=Radius)) +
  geom_point(size=2.5,alpha=0.6,colour="red") + geom_smooth(size=2,method = "loess")+
  xlab("Stellar Mass") + ylab("Stellar Radius")+scale_size(range=c(2,10))
p3

Visualization-Boxplot

xcut <- cut(log10(d$Period),breaks=5)
d$xcut <- xcut

p4 <- ggplot(data = d, aes(x = log10(Period), y = star_mass)) +
   geom_boxplot(data = d, aes(x = xcut,y = star_mass),fill="red3")+  
   geom_point(aes(group=Discovery_method,  fill = Discovery_method,
   text=name,size=Radius), alpha=0.3) +
   xlab("Stellar Mass") + ylab("Stellar Radius") +
  theme(legend.position = "none")
p4

Visualization-Boxplot

xcut <- cut(log10(d$Period),breaks=5)
d$xcut <- xcut
p5 <- ggplot(data = d, aes(x = log10(Period), y = star_mass)) +
   geom_boxplot(data = d, aes(x = xcut,y = star_mass),fill="red3")+  
   xlab("Stellar Mass") + ylab("Stellar Radius") +
  theme(legend.position = "none")
p5

Visualization-Violin plot

xcut <- cut(log10(d$Period),breaks=5)
d$xcut <- xcut
p6 <- ggplot(data = d, aes(x = log10(Period), y = star_mass)) +
   geom_violin(data = d, aes(x = xcut,y = star_mass),fill="red3")+  
   xlab("Stellar Mass") + ylab("Stellar Radius")+
  theme(legend.position = "none")
p6

Visualization-Histograms

p7 <- ggplot(data = d, aes(x = Radius)) + geom_histogram(bins = 45,fill="red")
p7

Visualization-Kernel Density

p8 <- ggplot(data = d, aes(x = Radius)) + geom_density(fill="red")
p8

Visualization-Kernel Density

By groups

p9 <- ggplot(data = d, aes(x = Radius,group=Discovery_method,fill=Discovery_method)) +
      geom_density()+ scale_fill_discrete(name="")
p9

Visualization-Kernel Density

By groups

p10 <- ggplot(data = d, aes(x = Radius,group=Discovery_method,fill=Discovery_method)) +
      geom_density(adjust=1.5, position="fill")+ scale_fill_discrete(name="")
p10

The Big Picture

Correlation matrix

dcor <- cor(d[,c(2,3,5,6,7)])
print(dcor)
Table continues below
  Radius Period star_mass star_radius
Radius 1 0.03116 0.3415 0.3532
Period 0.03116 1 0.06246 0.02269
star_mass 0.3415 0.06246 1 0.7933
star_radius 0.3532 0.02269 0.7933 1
star_temperature 0.157 0.04783 0.6342 0.4354
  star_temperature
Radius 0.157
Period 0.04783
star_mass 0.6342
star_radius 0.4354
star_temperature 1

Visualization-correlation matrix

c0 <- scale(log10(d[,c(2,3,5,6,7)]))
chart.Correlation(c0, histogram=TRUE, pch=19)

Visualization-correlation matrix

pairs.panels(log10(d[,c(2,3,5,6,7)]), scale=TRUE)

Visualization-Heatmap

Visualization-Heatmap with Dendrograms

Visualization-Dendrograms

Visualization-Chord Diagram

See de Souza, Dantas, et al 2017 arXiv:1703.07607

nc <- cor(d[,c(2,3,5,6,7)])
chordDiagram(nc)

What about Beers?

beer <- read.csv("data/beer2.csv",header = T)
beer <- beer[complete.cases(beer),]
head(beer)

Beers-scatter plot

Beers-boxplot

Data from https://www.kaggle.com

Beers-Density

Beers-Hexagon Binning

Another case-Titanic

titan <- as.data.frame(Titanic)
head(titan)
##   Class    Sex   Age Survived Freq
## 1   1st   Male Child       No    0
## 2   2nd   Male Child       No    0
## 3   3rd   Male Child       No   35
## 4  Crew   Male Child       No    0
## 5   1st Female Child       No    0
## 6   2nd Female Child       No    0

Bar-plot

t1 <- ggplot(titan,aes(x=Class,y=Freq,fill=Survived))+geom_bar(stat="identity")+facet_wrap(Sex~Age)+
  theme_dark()+scale_fill_pander()+ylab("Passengers")
t1

Zombies

The system is given as:

\(\frac{dS}{dt} = P - BSZ - dS\)

\(\frac{dZ}{dt} = BSZ + GR - ASZ\)

\(\frac{dR}{dt} = dS + ASZ - GR\)

Notations

S: the number of susceptible victims d: the chance of a natural death
Z: the number of zombies B: the chance the “zombie disease” is transmitted
R: the number of people killed G: the chance a dead person is resurrected into a zombie
P: the population birth rate A: the chance a zombie is totally destroyed

Modeling Zombie apocalypse in R

See Munz et al. 2009 and Modeling a Zombie Apocalypse in Python

library(deSolve)

Initial Conditions

parameters <- c(P = 15,      # birth rate
                d = 0.0000005,  # natural death percent (per day)
                B = 0.00010,  # transmission percent  (per day)
                G = 0.0005,  # ressurect percent (per day)
                A = 0.000025)  # destroy percent  (per day))


state <- c(S=50000, # initial population
           Z=0,  # initial zombie population
           R=0   # initial death population
           )  

Solve the ODE

zoombie <- function(t, state, parameters){
with(as.list(c(state, parameters)),{
  #the model equations (see Munz et al. 2009)
  dS <- P - B*S*Z - d*S
  dZ <- B*S*Z + G*R - A*S*Z
  dR <- d*S + A*S*Z - G*R
  output <- list(c(dS,dZ,dR))
  return(output)  
})
}
times <- seq(0, 20, by = 0.025)
out <- ode(y = state, times = times, func = zoombie, parms = parameters)
head(out)
##       time        S            Z            R
## [1,] 0.000 50000.00 0.000000e+00 0.0000000000
## [2,] 0.025 50000.37 4.097939e-09 0.0006249985
## [3,] 0.050 50000.75 1.679475e-08 0.0012499941
## [4,] 0.075 50001.12 3.893657e-08 0.0018749872
## [5,] 0.100 50001.50 7.145279e-08 0.0024999779
## [6,] 0.125 50001.87 1.153639e-07 0.0031249667

Plot results with ggplot2

dat <- as.data.frame(out)
dat2 <- melt(dat,id="time")
z1 <-ggplot(data=dat2,aes(x=time,y=value,group=variable,colour=variable,linetype=variable))+
  geom_line(size=2)+theme_hc()+ylab("Population")+xlab("Days after break")+
  scale_color_stata(name="",labels = c("Population", "Zombies","Victims"))+
  scale_linetype_stata(name="",labels = c("Population", "Zombies","Victims"))
z1

Exercise

Choose a dataset and write a brief 1-2 page report with

Potential data sources

Wanna know more about Statistics?

Available at Amazon and Cambridge websites.