Rafael S. de Souza
4/28/2017
Information visualization or information visualisation is the study of (interactive) visual representations of abstract data to reinforce human cognition.
R is an open source programming language and software environment for statistical computing and graphics
Available from http://www.r-project.org for Windows, Mac or linux.
\(>\) 10500 user-provided add-on CRAN packages, tens of thousands of statistical functions. On-the-fly installation of CRAN packages
Learning curve: trade-off between finding what you want, and understanding what you find.
| Bayesian Inference | Machine Learning | Social Sciences |
| Computational Physics | Medical Image Analysis | Spatial Data |
| Cluster Analysis | Multivariate Statistics | Statistical Geneticss |
| Differential Equations | Natural Language | Survival Analysi |
| Econometrics | Numerical Mathematic | Time Series Analysis |
| Environmetrics | Optimization | Visualization |
| Environmetrics | Pharmacokinetic | Web Technologies |
| Extreme Value Analysis | Phylogenetics | Astronomy |
| Empirical Finance | Probability Distributions | |
| Functional Data Analysis | Psychometric |
require(ggplot2);require(reshape2);require(circlize);require(ggdendro);
require(PerformanceAnalytics);require(ggthemes);require(RColorBrewer);
require(psych)If not available
install.packages(c("ggplot2","reshape2","circlize","ggdendro","ggthemes",
"RColorBrewer","reshape2","PerformanceAnalytics","psych"),dependencies = T)1+1## [1] 2
x <- 2
for (i in 1:5){
print(x+i)
}## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
print("Life, the Universe and Everything")## [1] "Life, the Universe and Everything"
x <- rnorm(5000) # Generate 5000 random numbers from a Normal distribution
hist(x,col="green") # Plot a histogramset.seed(1056) # set seed to replicate example
nobs= 150 # number of obs in model
x1 <- runif(nobs,0,5) # random uniform variable
mu <- 1 + 5 * x1 # linear predictor, xb
y <- rnorm(nobs, mu, sd=1) # create y as adjusted random normal variate
fit <- lm(y ~ x1) # Normal Fit summary(fit) # Display results from the fit| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| x1 | 5.025 | 0.05288 | 95.03 | 1.494e-134 |
| (Intercept) | 1.006 | 0.1541 | 6.528 | 9.971e-10 |
xx <- seq(0,5,length=200)
ypred <- predict(fit,newdata=list(x1=xx),type="response") # Prediction from the model
plot(x1,y,pch=19,col="red") # Plot regression line
lines(xx,ypred,col='cyan',lwd=4,lty=2)
segments(x1,fitted(fit),x1,y,lwd=2,col="gray") # Add the residualsdat <- data.frame(x1,y)
ggplot(data=dat,aes(x =x1, y = y)) +
geom_point() +
stat_smooth(method = "lm", col = "red")Data from http://www.openexoplanetcatalogue.com
Read and display data in tabular format
d <- read.csv("data/exoplanets.csv",header = T)
d <- d[complete.cases(d),]
head(d)p1 <- ggplot(data = d, aes(x = Period, y = star_mass, group=Discovery_method,
fill = Discovery_method, text=name, size=Radius)) + geom_point(alpha=0.5) +
scale_x_log10() + ylab("Stellar Mass") + xlab("Period")+
theme(legend.position = "none")p1p2 <- ggplot(data = d, aes(x = star_mass, y = star_radius, group=Discovery_method, text=name,size=Radius,colour=Discovery_method)) +theme_bw()+
geom_point(alpha=0.6) +scale_x_log10() + xlab("Stellar Mass") + ylab("Stellar Radius")+
theme(legend.position = "right")+scale_size(range=c(2,10))+ scale_color_tableau(name="")p2p3 <- ggplot(data = d, aes(x = star_mass,y =star_radius,size=Radius)) +
geom_point(size=2.5,alpha=0.6,colour="red") + geom_smooth(size=2,method = "loess")+
xlab("Stellar Mass") + ylab("Stellar Radius")+scale_size(range=c(2,10))p3xcut <- cut(log10(d$Period),breaks=5)
d$xcut <- xcut
p4 <- ggplot(data = d, aes(x = log10(Period), y = star_mass)) +
geom_boxplot(data = d, aes(x = xcut,y = star_mass),fill="red3")+
geom_point(aes(group=Discovery_method, fill = Discovery_method,
text=name,size=Radius), alpha=0.3) +
xlab("Stellar Mass") + ylab("Stellar Radius") +
theme(legend.position = "none")p4xcut <- cut(log10(d$Period),breaks=5)
d$xcut <- xcut
p5 <- ggplot(data = d, aes(x = log10(Period), y = star_mass)) +
geom_boxplot(data = d, aes(x = xcut,y = star_mass),fill="red3")+
xlab("Stellar Mass") + ylab("Stellar Radius") +
theme(legend.position = "none")p5xcut <- cut(log10(d$Period),breaks=5)
d$xcut <- xcut
p6 <- ggplot(data = d, aes(x = log10(Period), y = star_mass)) +
geom_violin(data = d, aes(x = xcut,y = star_mass),fill="red3")+
xlab("Stellar Mass") + ylab("Stellar Radius")+
theme(legend.position = "none")p6p7 <- ggplot(data = d, aes(x = Radius)) + geom_histogram(bins = 45,fill="red")p7p8 <- ggplot(data = d, aes(x = Radius)) + geom_density(fill="red")p8By groups
p9 <- ggplot(data = d, aes(x = Radius,group=Discovery_method,fill=Discovery_method)) +
geom_density()+ scale_fill_discrete(name="")p9By groups
p10 <- ggplot(data = d, aes(x = Radius,group=Discovery_method,fill=Discovery_method)) +
geom_density(adjust=1.5, position="fill")+ scale_fill_discrete(name="")p10dcor <- cor(d[,c(2,3,5,6,7)])
print(dcor)| Radius | Period | star_mass | star_radius | |
|---|---|---|---|---|
| Radius | 1 | 0.03116 | 0.3415 | 0.3532 |
| Period | 0.03116 | 1 | 0.06246 | 0.02269 |
| star_mass | 0.3415 | 0.06246 | 1 | 0.7933 |
| star_radius | 0.3532 | 0.02269 | 0.7933 | 1 |
| star_temperature | 0.157 | 0.04783 | 0.6342 | 0.4354 |
| star_temperature | |
|---|---|
| Radius | 0.157 |
| Period | 0.04783 |
| star_mass | 0.6342 |
| star_radius | 0.4354 |
| star_temperature | 1 |
c0 <- scale(log10(d[,c(2,3,5,6,7)]))
chart.Correlation(c0, histogram=TRUE, pch=19)pairs.panels(log10(d[,c(2,3,5,6,7)]), scale=TRUE)See de Souza, Dantas, et al 2017 arXiv:1703.07607
nc <- cor(d[,c(2,3,5,6,7)])
chordDiagram(nc)beer <- read.csv("data/beer2.csv",header = T)
beer <- beer[complete.cases(beer),]head(beer)Data from https://www.kaggle.com
titan <- as.data.frame(Titanic)
head(titan)## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
t1 <- ggplot(titan,aes(x=Class,y=Freq,fill=Survived))+geom_bar(stat="identity")+facet_wrap(Sex~Age)+
theme_dark()+scale_fill_pander()+ylab("Passengers")t1\(\frac{dS}{dt} = P - BSZ - dS\)
\(\frac{dZ}{dt} = BSZ + GR - ASZ\)
\(\frac{dR}{dt} = dS + ASZ - GR\)
| S: the number of susceptible victims | d: the chance of a natural death |
| Z: the number of zombies | B: the chance the “zombie disease” is transmitted |
| R: the number of people killed | G: the chance a dead person is resurrected into a zombie |
| P: the population birth rate | A: the chance a zombie is totally destroyed |
See Munz et al. 2009 and Modeling a Zombie Apocalypse in Python
library(deSolve)parameters <- c(P = 15, # birth rate
d = 0.0000005, # natural death percent (per day)
B = 0.00010, # transmission percent (per day)
G = 0.0005, # ressurect percent (per day)
A = 0.000025) # destroy percent (per day))
state <- c(S=50000, # initial population
Z=0, # initial zombie population
R=0 # initial death population
) zoombie <- function(t, state, parameters){
with(as.list(c(state, parameters)),{
#the model equations (see Munz et al. 2009)
dS <- P - B*S*Z - d*S
dZ <- B*S*Z + G*R - A*S*Z
dR <- d*S + A*S*Z - G*R
output <- list(c(dS,dZ,dR))
return(output)
})
}times <- seq(0, 20, by = 0.025)
out <- ode(y = state, times = times, func = zoombie, parms = parameters)
head(out)## time S Z R
## [1,] 0.000 50000.00 0.000000e+00 0.0000000000
## [2,] 0.025 50000.37 4.097939e-09 0.0006249985
## [3,] 0.050 50000.75 1.679475e-08 0.0012499941
## [4,] 0.075 50001.12 3.893657e-08 0.0018749872
## [5,] 0.100 50001.50 7.145279e-08 0.0024999779
## [6,] 0.125 50001.87 1.153639e-07 0.0031249667
dat <- as.data.frame(out)
dat2 <- melt(dat,id="time")
z1 <-ggplot(data=dat2,aes(x=time,y=value,group=variable,colour=variable,linetype=variable))+
geom_line(size=2)+theme_hc()+ylab("Population")+xlab("Days after break")+
scale_color_stata(name="",labels = c("Population", "Zombies","Victims"))+
scale_linetype_stata(name="",labels = c("Population", "Zombies","Victims"))z1Choose a dataset and write a brief 1-2 page report with
Potential data sources