Basic introduction to the dataset:

# Load the built-in iris data frame into the workspace.
data(iris)
# Dimensions: number of rows, then number of columns.
dim(iris)
## [1] 150   5
class(iris)
## [1] "data.frame"
# Per-column summary: quartiles and mean for the numeric columns,
# counts per level for the Species factor.
summary(iris)
##   Sepal.Length   Sepal.Width    Petal.Length   Petal.Width 
##  Min.   :4.30   Min.   :2.00   Min.   :1.00   Min.   :0.1  
##  1st Qu.:5.10   1st Qu.:2.80   1st Qu.:1.60   1st Qu.:0.3  
##  Median :5.80   Median :3.00   Median :4.35   Median :1.3  
##  Mean   :5.84   Mean   :3.06   Mean   :3.76   Mean   :1.2  
##  3rd Qu.:6.40   3rd Qu.:3.30   3rd Qu.:5.10   3rd Qu.:1.8  
##  Max.   :7.90   Max.   :4.40   Max.   :6.90   Max.   :2.5  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
# First six rows of the data frame.
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Class of every column, returned as a named character vector.
sapply(iris, class)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##    "numeric"    "numeric"    "numeric"    "numeric"     "factor"

Default plot for a numeric variable:

# With a single numeric vector, the values go on the y-axis and are plotted
# against their position in the vector (1, 2, ..., n) on the x-axis.
plot(iris$Sepal.Length)

plot of chunk unnamed-chunk-2

For two numeric variables:

# Scatterplot: the first argument goes on the x-axis, the second on the y-axis.
plot(iris$Sepal.Length, iris$Sepal.Width)

plot of chunk unnamed-chunk-3

Equivalently using the formula interface:

# A formula reads y ~ x, so Sepal.Width is on the y-axis and Sepal.Length on
# the x-axis.
plot(iris$Sepal.Width ~ iris$Sepal.Length)

Or (and try doing this with lm instead of plot):

# Send the scatterplot to the file iris1.pdf instead of the screen. The
# column names in the formula are looked up in `data`. The file is only
# finished once the device has been closed with dev.off().
pdf(file = "iris1.pdf")
plot(Sepal.Width ~ Sepal.Length, data = iris)
dev.off()

Interactive: make it look better. Look at ?plot and ?plot.default. Play with type, ylim, log, main, sub, xlab, ylab, pch, cex. Explore different output devices, and RStudio “Export.”

Exercise: do again, using different shapes (pch) or colors (col) for each species. Add a legend.

# One point per value of 1:20; point k is drawn with plotting symbol number k.
plot(1:20, pch=1:20)

plot of chunk unnamed-chunk-6

# Character plotting symbols; the three values are recycled over the 20 points.
# NOTE(review): a multi-character string such as "dog" is expected to be drawn
# as its first character only -- confirm in ?points.
plot(1:20, pch=c(".", "a", "dog"))

plot of chunk unnamed-chunk-6

Explore color palettes.

library(RColorBrewer)
# Preview the RColorBrewer palettes, showing three colors from each.
display.brewer.all(n=3)

plot of chunk unnamed-chunk-7

# Color the points by species with a three-color qualitative palette.
# The palette is built once and shared by the plot and the legend, so the two
# always agree. Indexing it by the Species factor picks a color by the
# factor's integer code (1, 2, 3 in the order of levels(iris$Species)).
species_cols <- brewer.pal(3, "Set2")
plot(Sepal.Width ~ Sepal.Length, data = iris, col = species_cols[iris$Species])
# Anchor the legend by keyword so it stays inside the plot region whatever
# the axis limits are.
legend("topright", legend = levels(iris$Species), col = species_cols, pch = 1)

plot of chunk unnamed-chunk-8

# hist() draws the histogram and also returns its bins (breaks, mids, counts).
x <- hist(iris$Sepal.Length)
# Cap every bar with a thick red line spanning its own bin. The bin edges come
# from the returned breaks, so the overlay follows whatever binning hist()
# chose rather than assuming a fixed bar width.
bin_left <- x$breaks[-length(x$breaks)]
bin_right <- x$breaks[-1]
segments(
  x0 = bin_left, x1 = bin_right,
  y0 = x$counts, y1 = x$counts,
  lwd = 4, col = "red"
)

plot of chunk unnamed-chunk-9

How about a boxplot with histogram together? (see http://rgraphgallery.blogspot.com/search/label/boxplot)

# Two stacked panels in one column: a short strip for the boxplot (relative
# height 1) above a taller panel for the histogram (relative height 3).
layout(mat = matrix(c(1, 2), nrow = 2, ncol = 1, byrow = TRUE), heights = c(1, 3))
# Tight margins so the two panels sit close together.
par(mar = c(2, 2, 0.5, 1))
# Both panels are given the same 4-8 range so the box is drawn over the same
# span of values as the bars below it.
boxplot(iris$Sepal.Length, horizontal = TRUE, outline = TRUE, ylim = c(4, 8),
        frame = FALSE, col = "green1")
# freq = FALSE puts the bars on the density scale, so the kernel density
# curve added next shares their y-axis.
hist(iris$Sepal.Length, xlim = c(4, 8), col = "pink", freq = FALSE)
lines(density(iris$Sepal.Length))

plot of chunk unnamed-chunk-10

Boxplot of Sepal width vs. species

# One box per level of Species, the grouping variable on the right-hand side
# of the formula; the column names are looked up in `data`.
boxplot(Sepal.Width ~ Species, data=iris)

plot of chunk unnamed-chunk-11

Use dev.off() to clear layout.

library(gplots)
# Clustered heatmap of the four measurements (rows, after transposing) across
# the 150 flowers (columns). Values are scaled within each row, and the color
# bar above the columns marks each flower's species.
heatmap.2(
  t(iris[, 1:4]),
  trace = "none", scale = "row", key = TRUE,
  margins = c(2, 8), cexRow = 1,
  ColSideColors = c("grey", "black", "yellow")[iris$Species]
)

plot of chunk unnamed-chunk-12

Exploratory pairs plots:

# Scatterplot matrix of every pair of columns. The Species factor is converted
# to its integer codes so it can be plotted alongside the numeric columns.
pairs(iris)

plot of chunk unnamed-chunk-13

Google maps, from example at http://rgraphgallery.blogspot.com/2013/04/rg68-get-google-map-and-plot-data-in-it.html:

library(ggmap)
## Loading required package: ggplot2
# List the objects made available by the attached ggmap package.
ls("package:ggmap")
##  [1] "crime"             "distQueryCheck"    "geocode"          
##  [4] "geocodeQueryCheck" "geom_leg"          "get_cloudmademap" 
##  [7] "get_googlemap"     "get_map"           "get_openstreetmap"
## [10] "get_stamenmap"     "ggimage"           "gglocator"        
## [13] "ggmap"             "ggmapplot"         "hadley"           
## [16] "inset"             "inset_raster"      "legs2route"       
## [19] "LonLat2XY"         "make_bbox"         "mapdist"          
## [22] "OSM_scale_lookup"  "qmap"              "qmplot"           
## [25] "revgeocode"        "route"             "routeQueryCheck"  
## [28] "theme_inset"       "theme_nothing"     "wind"             
## [31] "XY2LonLat"         "zips"
# example of map of Hunter College area
# geocode() looks the place name up online, so this step needs network access.
# NOTE(review): recent ggmap releases are understood to require a Google API
# key registered via register_google() -- confirm for the installed version.
hc <- geocode("Hunter College")
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Hunter+College&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
# Download a Google road map centered on the geocoded coordinates.
hcmap1 <- get_map(location = unlist(hc), zoom = 15, maptype = "roadmap", source = "google")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.768702,-73.964876&zoom=15&size=%20640x640&scale=%202&maptype=roadmap&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
# Turn the downloaded map into a plot object that further layers can be
# added to, then print it.
huntermap1 <- ggmap(hcmap1)
huntermap1

plot of chunk unnamed-chunk-14

Let's add the iris data to random locations:

library(grid)
# Give each of the 150 iris observations a uniformly random position inside a
# small latitude/longitude box, and carry along its sepal length and width.
df <- data.frame(
  lat = runif(nrow(iris), min = 40.76, max = 40.777),
  lon = runif(nrow(iris), min = -73.975, max = -73.955),
  length = iris$Sepal.Length,
  width = iris$Sepal.Width
)
# Sepal width drives the point color and sepal length the point size.
# alpha is a fixed setting rather than a data mapping, so it is passed outside
# aes(); inside aes() it would be treated as a variable and get its own legend.
huntermap1 +
  geom_point(
    aes(x = lon, y = lat, colour = width, size = length),
    data = df,
    alpha = 0.9
  ) +
  scale_colour_gradient(low = "yellow", high = "red")

plot of chunk unnamed-chunk-15

Exercises

Datasets are available at http://goo.gl/x87GY6

Exercise #1

Plot petal width vs. petal length for the iris dataset, identifying species using both color and datapoint type.

# Petal width against petal length, with species encoded twice: by color and
# by plotting symbol. Indexing each three-element vector by the Species factor
# selects the entry for that observation's species.
data("iris")
plot(
  Petal.Width ~ Petal.Length,
  data = iris,
  pch = (15:17)[iris$Species],
  col = c("black", "red", "blue")[iris$Species]
)
# The legend lists the species in factor-level order, matching the indexing.
legend(
  "topleft",
  pch = 15:17,
  col = c("black", "red", "blue"),
  legend = levels(iris$Species)
)

plot of chunk unnamed-chunk-16

Exercise #2: GSE12945

Do exploratory analysis of the GSE12945 dataset, taken from the following paper:

Staub E, Groene J, Heinze M, Mennerich D et al. An expression module of WIPF1-coexpressed genes identifies patients with favorable prognosis in three tumor types. J Mol Med (Berl) 2009 Jun;87(6):633-44. PMID: 19399471

Dataset is available from http://goo.gl/x87GY6 or by using the following command:

# Save the dataset as GSE12945.csv in the current working directory.
# method="wget" hands the transfer to the external wget program, which must be
# installed and on the PATH for this call to work.
download.file("https://www.dropbox.com/sh/pukanjaahmonmcp/AADWX-vKk70CuGgYWBqqxWjfa/datasets/GSE12945.csv", destfile="GSE12945.csv", method="wget")

Exercise #3 (Advanced)

Create appropriate pairwise plots for the first five variables in the NYC Community Health Survey dataset provided at https://goo.gl/qD2Wzm. On the diagonal, put the name of the variable shown in that row+column. Think about how you could customize the plot type using if/else statements.

# Fetch the survey file and read it. method = "wget" hands the transfer to the
# external wget program, which must be installed.
download.file("https://goo.gl/qD2Wzm", destfile = "chs2012_codedforR.csv", method = "wget")
x <- read.csv("chs2012_codedforR.csv")

# 5 x 5 grid of panels, filled row by row, with tight margins. par() returns
# the previous values of the settings it changes; they are kept so the
# graphics state can be put back once the grid is complete.
old_par <- par(mfrow = c(5, 5), mar = c(3, 3, 1, 1))
for (i in seq_len(5)) {
  for (j in seq_len(5)) {
    if (j != i) {
      # Off-diagonal panel: column i on the x-axis, column j on the y-axis.
      plot(x[[i]], x[[j]], xlab = colnames(x)[i], ylab = colnames(x)[j])
    } else {
      # Diagonal panel: an empty plot holding only the variable's name,
      # centered in the default 0-1 coordinate system of plot.new().
      plot.new()
      text(x = 0.5, y = 0.5, labels = colnames(x)[j])
    }
  }
}
par(old_par)

plot of chunk unnamed-chunk-18