Basic introduction to the dataset:
# Load the built-in iris data, then inspect its size, class and contents.
data(iris)
dim(iris)
## [1] 150 5
class(iris)
## [1] "data.frame"
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.30 Min. :2.00 Min. :1.00 Min. :0.1
## 1st Qu.:5.10 1st Qu.:2.80 1st Qu.:1.60 1st Qu.:0.3
## Median :5.80 Median :3.00 Median :4.35 Median :1.3
## Mean :5.84 Mean :3.06 Mean :3.76 Mean :1.2
## 3rd Qu.:6.40 3rd Qu.:3.30 3rd Qu.:5.10 3rd Qu.:1.8
## Max. :7.90 Max. :4.40 Max. :6.90 Max. :2.5
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Class of every column. vapply() pins the result to exactly one string per
# column, so a column with several classes fails loudly instead of silently
# changing the shape of the result.
vapply(iris, class, FUN.VALUE = character(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## "numeric" "numeric" "numeric" "numeric" "factor"
Default plot for a numeric variable:
# With a single numeric vector, the values are plotted against their position
# (index) in the vector.
plot(iris$Sepal.Length)
For two numeric variables:
# Scatter plot: sepal length on the x-axis, sepal width on the y-axis.
plot(x = iris$Sepal.Length, y = iris$Sepal.Width)
Equivalently using the formula interface:
# Formula form reads "y ~ x": Sepal.Width is the response (y-axis) and
# Sepal.Length the predictor (x-axis).
plot(iris$Sepal.Width ~ iris$Sepal.Length)
Or (and try doing this with lm instead of plot):
# Send the next plot to a PDF file instead of the screen device.
pdf(file = "iris1.pdf")
# Variables are looked up in `iris`, so the formula needs no `iris$` prefix.
plot(Sepal.Width ~ Sepal.Length, data = iris)
# Close the PDF device so the file is completed.
dev.off()
Interactive: make it look a bit better. Look at ?plot and ?plot.default. Play with type, ylim, log, main, sub, xlab, ylab, pch, cex. Explore different output devices, and RStudio “Export.”
Exercise: do it again, using different shapes (pch) or colors (col) for each species. Add a legend.
# Show the symbol behind each plotting-character code from 1 to 20.
plot(1:20, pch = seq_len(20))
# pch also accepts character strings; the three values are recycled across
# the 20 points. NOTE(review): only the first character of "dog" is expected
# to be drawn -- confirm against ?points.
plot(1:20, pch = c(".", "a", "dog"))
Explore color palettes.
library(RColorBrewer)
# Preview the available palettes with three colours each.
display.brewer.all(n = 3)
# One colour per species, defined once so the points and the legend cannot
# drift apart.
species_cols <- brewer.pal(3, "Set2")
# Indexing the palette by the Species factor picks each flower's colour.
plot(Sepal.Width ~ Sepal.Length, data = iris, col = species_cols[iris$Species])
# A keyword position keeps the legend inside the plot region whatever the
# axis ranges turn out to be.
legend("topright", legend = levels(iris$Species), col = species_cols, pch = 1)
# Draw the histogram and keep the returned object, which holds the bin
# breaks, midpoints and counts.
x <- hist(iris$Sepal.Length)
# Mark the top of every bar with a thick red line. The segment ends are taken
# from the bin breaks (left edges, right edges), so they stay aligned with
# the bars whatever bin width hist() chooses.
segments(
  x0 = x$breaks[-length(x$breaks)], x1 = x$breaks[-1],
  y0 = x$counts, y1 = x$counts,
  lwd = 4, col = "red"
)
How about a boxplot with histogram together? (see http://rgraphgallery.blogspot.com/search/label/boxplot)
# Two stacked panels: panel 1 on top for the boxplot, panel 2 below for the
# histogram, with a 1:3 height ratio.
layout(mat = matrix(c(1, 2), nrow = 2, ncol = 1, byrow = TRUE), heights = c(1, 3))
# Small margins so the two panels sit close together.
par(mar = c(2, 2, 0.5, 1))
# Horizontal boxplot drawn over the same 4-8 value range as the histogram.
boxplot(iris$Sepal.Length, horizontal = TRUE, outline = TRUE, ylim = c(4, 8),
        frame = FALSE, col = "green1")
# freq = FALSE plots densities rather than counts, putting the bars on the
# same scale as the kernel density curve added next.
hist(iris$Sepal.Length, xlim = c(4, 8), col = "pink", freq = FALSE)
lines(density(iris$Sepal.Length))
Boxplot of Sepal width vs. species
# One box per level of Species, each summarising that species' sepal widths.
boxplot(Sepal.Width ~ Species, data=iris)
Use dev.off() to clear layout.
library(gplots)
# Heatmap of the four measurements: transposing puts measurements in rows and
# flowers in columns. Values are scaled per row, and the bar above the
# columns is coloured by species.
heatmap.2(
  t(iris[, 1:4]),
  trace = "none", scale = "row", key = TRUE,
  margins = c(2, 8), cexRow = 1,
  ColSideColors = c("grey", "black", "yellow")[iris$Species]
)
Exploratory pairs plots:
# Scatter-plot matrix: one panel for every pair of columns in iris.
pairs(iris)
Google maps, from example at http://rgraphgallery.blogspot.com/2013/04/rg68-get-google-map-and-plot-data-in-it.html:
# Attach ggmap; the recorded output shows that it loads ggplot2 as well.
library(ggmap)
## Loading required package: ggplot2
# List the objects made available by the attached package.
ls("package:ggmap")
## [1] "crime" "distQueryCheck" "geocode"
## [4] "geocodeQueryCheck" "geom_leg" "get_cloudmademap"
## [7] "get_googlemap" "get_map" "get_openstreetmap"
## [10] "get_stamenmap" "ggimage" "gglocator"
## [13] "ggmap" "ggmapplot" "hadley"
## [16] "inset" "inset_raster" "legs2route"
## [19] "LonLat2XY" "make_bbox" "mapdist"
## [22] "OSM_scale_lookup" "qmap" "qmplot"
## [25] "revgeocode" "route" "routeQueryCheck"
## [28] "theme_inset" "theme_nothing" "wind"
## [31] "XY2LonLat" "zips"
# Example: a map of the area around Hunter College.
# Step 1: look up the coordinates for the place name.
hc <- geocode("Hunter College")
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Hunter+College&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
# Step 2: fetch a Google road map centred on those coordinates.
hcmap1 <- get_map(
  location = unlist(hc),
  zoom = 15,
  maptype = "roadmap",
  source = "google"
)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.768702,-73.964876&zoom=15&size=%20640x640&scale=%202&maptype=roadmap&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
# Step 3: wrap the downloaded map in a plot object, then display it.
huntermap1 <- ggmap(hcmap1)
huntermap1
Let's add the iris data to random locations:
library(grid)
# One random location per iris flower, drawn uniformly from a box around the
# mapped area, carrying that flower's sepal measurements.
df <- data.frame(
  lat = runif(nrow(iris), min = 40.76, max = 40.777),
  lon = runif(nrow(iris), min = -73.975, max = -73.955),
  length = iris$Sepal.Length,
  width = iris$Sepal.Width
)
# Colour and size are mapped to data columns inside aes(). alpha is a fixed
# value, so it is set outside aes(); inside aes() it would be treated as a
# variable and add a meaningless "0.9" entry to the legend.
huntermap1 +
  geom_point(aes(x = lon, y = lat, colour = width, size = length),
             data = df, alpha = 0.9) +
  scale_colour_gradient(low = "yellow", high = "red")
Datasets are available at http://goo.gl/x87GY6
Plot petal width vs. petal length for the iris dataset, identifying species using both color and datapoint type.
# Reload iris so this answer runs on its own.
data(iris)
# Petal width against petal length, with a distinct colour and symbol for
# each species; both vectors are indexed by the Species factor.
plot(
  Petal.Width ~ Petal.Length,
  data = iris,
  col = c("black", "red", "blue")[Species],
  pch = (15:17)[Species]
)
# The legend repeats the same colour/symbol pairs in factor-level order.
legend(
  "topleft",
  legend = levels(iris$Species),
  col = c("black", "red", "blue"),
  pch = 15:17
)
Do exploratory analysis of the GSE12945 dataset, taken from the following paper:
Staub E, Groene J, Heinze M, Mennerich D et al. An expression module of WIPF1-coexpressed genes identifies patients with favorable prognosis in three tumor types. J Mol Med (Berl) 2009 Jun;87(6):633-44. PMID: 19399471
Dataset is available from http://goo.gl/x87GY6 or by using the following command:
# Save the GSE12945 CSV into the working directory as "GSE12945.csv".
# method="wget" hands the transfer to the external wget program, which must
# be installed and on the PATH.
download.file("https://www.dropbox.com/sh/pukanjaahmonmcp/AADWX-vKk70CuGgYWBqqxWjfa/datasets/GSE12945.csv", destfile="GSE12945.csv", method="wget")
Create appropriate pairwise plots for the first five variables in the NYC Community Health Survey dataset provided at https://goo.gl/qD2Wzm. On the diagonal, put the name of the variable shown in that row+column. Think about how you could customize the plot type using if/else statements.
# Download the survey file and read it in. method="wget" hands the transfer
# to the external wget program, which must be installed and on the PATH.
download.file("https://goo.gl/qD2Wzm", destfile="chs2012_codedforR.csv", method="wget")
x <- read.csv("chs2012_codedforR.csv")

# The grid is built from the first n_vars columns of the data.
n_vars <- 5
stopifnot(ncol(x) >= n_vars)
var_names <- colnames(x)[seq_len(n_vars)]

# n_vars x n_vars grid of small panels. mgp moves the axis titles in to
# margin line 2 so they fit inside the 3-line margins (the default, line 3,
# falls outside them). par() returns the previous settings, kept for the
# restore at the end.
old_par <- par(mfrow = c(n_vars, n_vars), mar = c(3, 3, 1, 1), mgp = c(2, 0.7, 0))
for (i in seq_len(n_vars)) {
  for (j in seq_len(n_vars)) {
    if (j != i) {
      # Off-diagonal panel: variable i on the x-axis, variable j on the y-axis.
      plot(x[[i]], x[[j]], xlab = var_names[i], ylab = var_names[j])
    } else {
      # Diagonal panel: an empty plot showing only the variable name.
      plot.new()
      text(x = 0.5, y = 0.5, labels = var_names[j])
    }
  }
}
# Restore the earlier graphics settings so later plots are not drawn into
# the grid layout.
par(old_par)