Class 5 Visualization

Packages used: ggplot2, reshape2, ggvis, maps, ggmap, maptools, mapproj

Basic Plotting

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
# install.packages("maps")
                
library(ggvis)
## Warning: package 'ggvis' was built under R version 3.1.1
## 
## Attaching package: 'ggvis'
## 
## The following object is masked from 'package:ggplot2':
## 
##     resolution
library(reshape2)
library(maps)
## Warning: package 'maps' was built under R version 3.1.1
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.1.1
library(maps)

library(maptools)
## Warning: package 'maptools' was built under R version 3.1.1
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.1.1
## Checking rgeos availability: FALSE
##      Note: when rgeos is not available, polygon geometry     computations in maptools depend on gpclib,
##      which has a restricted licence. It is disabled by default;
##      to enable gpclib, type gpclibPermit()
library(mapproj)
## Warning: package 'mapproj' was built under R version 3.1.1
# Safai video: ggplot2
head(diamonds)
##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
tail(diamonds)
##       carat       cut color clarity depth table price    x    y    z
## 53935  0.72   Premium     D     SI1  62.7    59  2757 5.69 5.73 3.58
## 53936  0.72     Ideal     D     SI1  60.8    57  2757 5.75 5.76 3.50
## 53937  0.72      Good     D     SI1  63.1    55  2757 5.69 5.75 3.61
## 53938  0.70 Very Good     D     SI1  62.8    60  2757 5.66 5.68 3.56
## 53939  0.86   Premium     H     SI2  61.0    58  2757 6.15 6.12 3.74
## 53940  0.75     Ideal     D     SI2  62.2    55  2757 5.83 5.87 3.64
# Fix the RNG seed so the same 1000-row subsample (and therefore the same
# plots) is produced on every run.
set.seed(1000)
small <- diamonds[sample(nrow(diamonds), 1000),]


ggplot(small) + geom_point(aes(x=carat, y=price)) + ggtitle('example 01')

plot of chunk unnamed-chunk-1

ggplot(small) + geom_point(aes(x=carat, y=price, color = cut )) + ggtitle('example 02')

plot of chunk unnamed-chunk-1

ggplot(small) + geom_point(aes(x=carat, y=price, color = cut )) + ggtitle('example 02')

plot of chunk unnamed-chunk-1

# x and y are mapped once in ggplot() and inherited by the layer; only the
# colour mapping is specific to geom_point(). (Stray trailing comma removed.)
ggplot(small, aes(x=carat, y=price)) + geom_point(aes(color=cut)) + ggtitle('example 01')

plot of chunk unnamed-chunk-1

ggplot(small, aes(x=carat, y=price, color=cut)) + geom_point() + ggtitle('example 02')

plot of chunk unnamed-chunk-1

##########################################
### PLOTTING WITH BASE R
##########################################


#EXAMPLE 5.1
# FORMAT: plot(x, y, type="l")   # type="l" draws a line, type="p" draws points
plot(faithful, type='p')

plot of chunk unnamed-chunk-1

# My check with the data set, in order to know why
names(faithful)
## [1] "eruptions" "waiting"
head(faithful)
##   eruptions waiting
## 1     3.600      79
## 2     1.800      54
## 3     3.333      74
## 4     2.283      62
## 5     4.533      85
## 6     2.883      55
dim(faithful)
## [1] 272   2
#Connect each x,y point by a line.
plot(faithful, type='l') #This plot doesn't make much sense, just for illustration 

plot of chunk unnamed-chunk-1

# EXAMPLE 5.2
# PLOTTING CATEGORICAL VARIABLES
#First, let's try:
names(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
dim(iris)
## [1] 150   5
plot(iris$Species)

plot of chunk unnamed-chunk-1

#' Notice that it became a bar plot

#' Any 'factor' when plotted becomes a bar chart
library(ggplot2)
str(movies$mpaa)
##  Factor w/ 5 levels "","NC-17","PG",..: 1 1 1 1 1 1 5 1 1 1 ...
plot(movies[, "mpaa"])

plot of chunk unnamed-chunk-1

# Excluding the movies without ratings
# subset() removes the unrated rows but leaves the now-empty "" level on the
# factor; drop unused levels so plot() does not draw a zero-height bar for it.
mpaa.movies <- droplevels(subset(movies, mpaa != ""))
plot(mpaa.movies[, "mpaa"])

plot of chunk unnamed-chunk-1

# Check dataset of "mtcars"
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
dim(mtcars)
## [1] 32 11
# Plot
barplot(mtcars[ , "mpg"])

plot of chunk unnamed-chunk-1

plot(mtcars[ , "mpg"])

plot of chunk unnamed-chunk-1

# EXAMPLE 5.3
#Adding title/labels
plot(faithful, ann=FALSE) #get rid of the current labels
title(main="The Title", xlab="X Axis Label", ylab="Y Axis Label")

plot of chunk unnamed-chunk-1

# EXAMPLE 5.4
#You can take any numerical vector and plot a histogram very quickly.
#The Y-axis is the frequency
names(faithful)
## [1] "eruptions" "waiting"
hist(faithful$waiting)

plot of chunk unnamed-chunk-1

hist(faithful$eruptions)

plot of chunk unnamed-chunk-1

pairs(faithful)

plot of chunk unnamed-chunk-1

#  P A I R S 
#' @description Use pairs() to quickly produce a complete matrix of scatterplots
#' 
# EXAMPLE 5.5
pairs(iris)

plot of chunk unnamed-chunk-1

pairs(iris[,-5]) # everything except the 5th column

plot of chunk unnamed-chunk-1

GGPLOT

#Example 5.6
#Let's jump right in and create a few plots
names(movies)
##  [1] "title"       "year"        "length"      "budget"      "rating"     
##  [6] "votes"       "r1"          "r2"          "r3"          "r4"         
## [11] "r5"          "r6"          "r7"          "r8"          "r9"         
## [16] "r10"         "mpaa"        "Action"      "Animation"   "Comedy"     
## [21] "Drama"       "Documentary" "Romance"     "Short"
head(movies)
##                      title year length budget rating votes   r1   r2  r3
## 1                        $ 1971    121     NA    6.4   348  4.5  4.5 4.5
## 2        $1000 a Touchdown 1939     71     NA    6.0    20  0.0 14.5 4.5
## 3   $21 a Day Once a Month 1941      7     NA    8.2     5  0.0  0.0 0.0
## 4                  $40,000 1996     70     NA    8.2     6 14.5  0.0 0.0
## 5 $50,000 Climax Show, The 1975     71     NA    3.4    17 24.5  4.5 0.0
## 6                    $pent 2000     91     NA    4.3    45  4.5  4.5 4.5
##     r4   r5   r6   r7   r8   r9  r10 mpaa Action Animation Comedy Drama
## 1  4.5 14.5 24.5 24.5 14.5  4.5  4.5           0         0      1     1
## 2 24.5 14.5 14.5 14.5  4.5  4.5 14.5           0         0      1     0
## 3  0.0  0.0 24.5  0.0 44.5 24.5 24.5           0         1      0     0
## 4  0.0  0.0  0.0  0.0  0.0 34.5 45.5           0         0      1     0
## 5 14.5 14.5  4.5  0.0  0.0  0.0 24.5           0         0      0     0
## 6 14.5 14.5 14.5  4.5  4.5 14.5 14.5           0         0      0     1
##   Documentary Romance Short
## 1           0       0     0
## 2           0       0     0
## 3           0       0     1
## 4           0       0     0
## 5           0       0     0
## 6           0       0     0
dim(movies)
## [1] 58788    24
str(movies)
## 'data.frame':    58788 obs. of  24 variables:
##  $ title      : chr  "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
##  $ year       : int  1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
##  $ length     : int  121 71 7 70 71 91 93 25 97 61 ...
##  $ budget     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ rating     : num  6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
##  $ votes      : int  348 20 5 6 17 45 200 24 18 51 ...
##  $ r1         : num  4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
##  $ r2         : num  4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
##  $ r3         : num  4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
##  $ r4         : num  4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
##  $ r5         : num  14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
##  $ r6         : num  24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
##  $ r7         : num  24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
##  $ r8         : num  14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
##  $ r9         : num  4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
##  $ r10        : num  4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
##  $ mpaa       : Factor w/ 5 levels "","NC-17","PG",..: 1 1 1 1 1 1 5 1 1 1 ...
##  $ Action     : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Animation  : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Comedy     : int  1 1 0 1 0 0 0 0 0 0 ...
##  $ Drama      : int  1 0 0 0 0 1 1 0 1 0 ...
##  $ Documentary: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Romance    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Short      : int  0 0 1 0 0 0 0 1 0 0 ...
# Let's try different Y-axis variables
ggplot(movies, aes(x=year, y=budget)) + geom_point()
## Warning: Removed 53573 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-2

ggplot(movies, aes(x=year, y=rating)) + geom_point()

plot of chunk unnamed-chunk-2

ggplot(movies, aes(x=year, y=length)) + geom_point()

plot of chunk unnamed-chunk-2

ggplot(movies, aes(x=year, y=votes)) + geom_point()

plot of chunk unnamed-chunk-2

# Now, we are switching to Bar Plots. Notice the use of geom_bar()
ggplot(movies, aes(x=year)) + geom_bar()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: position_stack requires constant width: output may be incorrect

plot of chunk unnamed-chunk-2

#It is usually a good idea to control the binwidth ourselves
# Let's do it by decades...
# Binning a continuous variable is a histogram, so use geom_histogram(), which
# takes binwidth directly (10 years per bin = one bar per decade).
ggplot(movies, aes(x=year)) + geom_histogram(binwidth=10)

plot of chunk unnamed-chunk-2

#We can color the bars by any variable.
#Here we are coloring by movies$mpaa. Notice the use of fill
# Same decade-wide bins as above; fill=mpaa stacks the counts by rating class.
ggplot(movies, aes(x=year, fill=mpaa)) + geom_histogram(binwidth=10)

plot of chunk unnamed-chunk-2

#----------------------------------------------------------
library(ggvis)

mtcars %>% ggvis(x = ~wt, y = ~mpg) %>% layer_points()

mtcars %>% ggvis(x = ~wt, y = ~mpg, stroke := "red") %>% layer_points()

#----------------------------------------------------------

#Example 5.7
## BUILDING A PLOT ITERATIVELY
## AVG MOVIE RATINGS BY YEAR
ggplot(movies, aes(x=year, y=rating, color=rating)) + geom_point()

plot of chunk unnamed-chunk-2

p <- ggplot(movies, aes(x=year, y=rating, color=factor(rating))) + geom_point()
p

plot of chunk unnamed-chunk-2

#Let's try it with dplyr
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.1
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dplot <- movies %>%
  group_by(year) %>%
  summarize(yearly_rating = mean(rating)) %>%
  ggplot(aes(x=year, y=yearly_rating, color=yearly_rating)) + geom_point()

dplot

plot of chunk unnamed-chunk-2

#Let's fix the Y-axis scale so that it goes from 0 to 10.
dp2 <- dplot + scale_y_continuous(limits=c(0,10))
dp2

plot of chunk unnamed-chunk-2

#' Example 5.7A
#' How to create the same plot using ggVis
#' Using ggVis
library(ggvis)
library(dplyr)

# Mean rating per year drawn with ggvis. The y domain is 0-10 so the axis
# matches the limits=c(0,10) used for the ggplot2 version of this plot.
movies %>%
  group_by(year) %>%
  summarize(yearly_rating = mean(rating)) %>%
  ggvis(x= ~year, y= ~yearly_rating, stroke :='red') %>%
  layer_points() %>%
  scale_numeric("y", domain= c(0,10))

#Example 5.7B -Alternative Way to summarize using the very flexible stat_summary option
#' This is a powerful way to transform and plot.
#' Note that we didn't have to create a new data frame
ggplot(movies, aes(x=year, y=rating)) + stat_summary(fun.y=mean, geom="point")

plot of chunk unnamed-chunk-2

UNDERSTANDING AESTHETICS

#Example 5.8
p <- ggplot(movies,  aes(x=year, y=rating, color=mpaa)) + geom_point() # instance of “mapping” (inside aes)
p

plot of chunk unnamed-chunk-3

# "Setting" an aesthetic means giving the geom a constant value outside aes().
# Passing color=mpaa to ggplot() itself (outside aes) has no effect on the
# points, so the constant colour is given to geom_point() instead.
q <- ggplot(movies,  aes(x=year, y=rating)) + geom_point(color="blue") #an instance of "setting" (outside aes)
q

plot of chunk unnamed-chunk-3

##########
#IN CLASS EXERCISE
#Using mtcars
##########

#Example 5.9.

names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
base <- ggplot(data = mtcars)

#Try executing these commands and watch how the plot changes...
#Layer by layer, we are making the plot more complex.
base +  geom_point(aes(x=cyl, mpg))

plot of chunk unnamed-chunk-3

base + geom_point(aes(x=cyl, y=mpg, color=gear))

plot of chunk unnamed-chunk-3

base + geom_point(aes(x=cyl, y=mpg, color=factor(gear)))

plot of chunk unnamed-chunk-3

base + geom_point(aes(x=cyl, y=mpg, color=factor(gear), size=hp))

plot of chunk unnamed-chunk-3

base + geom_point(aes(x=cyl, y=mpg, color=factor(gear), size=hp, shape=factor(carb)))

plot of chunk unnamed-chunk-3

#Example 5.9a
#Subsetting the data and plotting only what we are interested in
#Say we are only interested in Mercedes cars in the dataset
# Logical mask over the rows of mtcars: TRUE where the car's row name
# contains "Merc".
tf <- grepl(pattern = "Merc", x = rownames(mtcars))
mtcars[tf,]
##              mpg cyl  disp  hp drat   wt qsec vs am gear carb
## Merc 240D   24.4   4 146.7  62 3.69 3.19 20.0  1  0    4    2
## Merc 230    22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2
## Merc 280    19.2   6 167.6 123 3.92 3.44 18.3  1  0    4    4
## Merc 280C   17.8   6 167.6 123 3.92 3.44 18.9  1  0    4    4
## Merc 450SE  16.4   8 275.8 180 3.07 4.07 17.4  0  0    3    3
## Merc 450SL  17.3   8 275.8 180 3.07 3.73 17.6  0  0    3    3
## Merc 450SLC 15.2   8 275.8 180 3.07 3.78 18.0  0  0    3    3
ggplot(mtcars[tf,])+ geom_point(aes(x=cyl, y=mpg, color=factor(gear), size=hp, shape=factor(carb)))

plot of chunk unnamed-chunk-3

##### Barplot vs Histograms

#Example 5.10
library(ggplot2)
str(quakes)
## 'data.frame':    1000 obs. of  5 variables:
##  $ lat     : num  -20.4 -20.6 -26 -18 -20.4 ...
##  $ long    : num  182 181 184 182 182 ...
##  $ depth   : int  562 650 42 626 649 195 82 194 211 622 ...
##  $ mag     : num  4.8 4.2 5.4 4.1 4 4 4.8 4.4 4.7 4.3 ...
##  $ stations: int  41 15 43 19 11 12 43 15 35 19 ...
# depth is continuous, so bin it with geom_histogram() (default: 30 bins)
# rather than counting each distinct depth value as its own bar.
ggplot(quakes, aes(x=depth)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-4

# Histogram of quake depth with explicit 50-unit bins; binwidth is a
# geom_histogram() parameter.
hgram <- ggplot(quakes, aes(x=depth)) + geom_histogram(binwidth=50)

hgram

plot of chunk unnamed-chunk-4

#Example 5.11
#Let's plot a BAR PLOT instead
#' We should use stat="identity"
quakes_agg <- aggregate(mag ~ round(depth, -2), data=quakes, FUN=length) #rounding to 100s
str(quakes_agg) #only 8 rows
## 'data.frame':    8 obs. of  2 variables:
##  $ round(depth, -2): num  0 100 200 300 400 500 600 700
##  $ mag             : int  77 271 159 66 55 142 220 10
names(quakes_agg) <- c("depth", "mag")
bplot <- ggplot(quakes_agg) + geom_bar(aes(x=depth, y=mag), stat="identity")
bplot

plot of chunk unnamed-chunk-4

######################################
#Titles
#Example 5.12
names(faithful)
## [1] "eruptions" "waiting"
p <- ggplot(faithful,aes(x=waiting, y=eruptions)) + geom_point()
p

plot of chunk unnamed-chunk-4

#add a title 
p + labs(title="Waiting Time Between Eruptions")

plot of chunk unnamed-chunk-4

#change the Y-axis title
p + ylab("Time between eruptions")

plot of chunk unnamed-chunk-4

#Hide (suppress) the title for the X-axis
p + xlab("")

plot of chunk unnamed-chunk-4

#Say we want the x-axis text every 5 minutes (instead of 10)
p + scale_x_continuous(limits=c(30, 110), breaks=seq(40,100,by=5))

plot of chunk unnamed-chunk-4

Setting own Colors

#Example 5.13
p <- ggplot(movies, aes(x=year, y=rating, color=rating)) + geom_point()
p <- p + scale_color_continuous(low="red", high="green")
p

plot of chunk unnamed-chunk-5

Legends

#Example 5.14
plot <- ggplot(mtcars, aes(x=cyl, y=mpg, fill=factor(gear))) + geom_bar(stat="identity") 
plot + scale_fill_discrete(name="")

plot of chunk unnamed-chunk-6

plot + scale_fill_discrete(name="Num Gears")

plot of chunk unnamed-chunk-6

# Hide the fill legend. "none" is the supported way to switch a guide off;
# passing FALSE is deprecated in current ggplot2.
plot + guides(fill="none")

plot of chunk unnamed-chunk-6

CONDITIONAL PLOTTING
# DO Quakes with higher Depth come with higher Magnitude?
#Example 5.15
head(quakes)
##      lat  long depth mag stations
## 1 -20.42 181.6   562 4.8       41
## 2 -20.62 181.0   650 4.2       15
## 3 -26.00 184.1    42 5.4       43
## 4 -17.97 181.7   626 4.1       19
## 5 -20.42 182.0   649 4.0       11
## 6 -19.68 184.3   195 4.0       12
head(cut(quakes$depth, c(0, 200, 400, 800)) )
## [1] (400,800] (400,800] (0,200]   (400,800] (400,800] (0,200]  
## Levels: (0,200] (200,400] (400,800]
quakes$depth.group
## NULL
ggplot(quakes, aes(x=depth, y=mag)) + geom_boxplot()

plot of chunk unnamed-chunk-7

#Let's create a new group, based on quake depth
quakes$depth.group <- cut(quakes$depth, c(0, 200, 400, 800))
# Put the discrete group (not the raw continuous depth) on the x axis so that
# one box is drawn per depth band; mapping depth itself to x is what triggers
# the "position_dodge requires constant width" warning.
ggplot(quakes, aes(x=depth.group, y=mag, fill=depth.group)) + geom_boxplot()
## Warning: position_dodge requires constant width: output may be incorrect

plot of chunk unnamed-chunk-7

# USING dplyr (the same example, 5.15)
# mutate() adds the depth band on the fly; the band is mapped to both x and
# fill so each band gets its own box.
quakes %>%
  mutate(depth_group = cut(depth, c(0,200,400,800))) %>%
  ggplot(., aes(x=depth_group, y=mag, fill=depth_group)) + geom_boxplot()
## Warning: position_dodge requires constant width: output may be incorrect

plot of chunk unnamed-chunk-7

##--------------------------------------------------------------------------##
#Example 5.16 (Slide is at the end of the deck)
head(diamonds)
##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
h <- ggplot(diamonds[1:1000,], aes(x=cut)) + geom_bar(); h #simple histogram

plot of chunk unnamed-chunk-7

h <- ggplot(diamonds[1:1000,], aes(x=cut, fill=color)) + geom_bar(); h # stacked by default

plot of chunk unnamed-chunk-7

h <- ggplot(diamonds[1:1000,], aes(x=cut, fill=color)) + geom_bar(position="dodge") #unstack 
h

plot of chunk unnamed-chunk-7

#Example 5.17
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
#add a new column to the data.frame
iris2 <- ddply(iris, "Species", transform,  avg.petal = mean(Petal.Length))
#use the new column to show height
# iris2 repeats each species' mean on every one of its rows, so collapse it to
# one row per species first. stat="identity" tells geom_bar() to use the y
# value as the bar height instead of counting rows (the default stat rejects a
# y aesthetic, which is why the original call failed).
iris_avg <- unique(iris2[, c("Species", "avg.petal")])
ggplot(iris_avg) + geom_bar(aes(x=Species, y=avg.petal), stat="identity")

plot of chunk unnamed-chunk-7

### FACETS

#Example 5.18
names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
base_g <- ggplot(mtcars, aes(x=hp, y=mpg)) + geom_point()
base_g

plot of chunk unnamed-chunk-8

base_g + facet_grid(. ~ cyl) #side-by-side comparisons

plot of chunk unnamed-chunk-8

base_g + facet_grid(gear ~ .) # top-to-bottom comparisons

plot of chunk unnamed-chunk-8

base_g + facet_grid(gear ~ cyl)

plot of chunk unnamed-chunk-8

##SECOND EXAMPLE TO UNDERSTAND FACETS
# This time we use the diamonds data set
#Example 5.19 -------------------------------------
unique(diamonds$cut)
## [1] Ideal     Premium   Good      Very Good Fair     
## Levels: Fair < Good < Very Good < Premium < Ideal
unique(diamonds$clarity)
## [1] SI2  SI1  VS1  VS2  VVS2 VVS1 I1   IF  
## Levels: I1 < SI2 < SI1 < VS2 < VS1 < VVS2 < VVS1 < IF
names(diamonds)
##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"
dim(diamonds)
## [1] 53940    10
base_g <- ggplot(diamonds[1:1000,]) + geom_point(aes(x=carat, y=price))
base_g

plot of chunk unnamed-chunk-8

g1 <- base_g + facet_grid(. ~ cut) #horizontal stacking: one column per cut
g2 <- base_g + facet_grid(cut ~ .) #vertical stacking: one row per cut
g3 <- base_g + facet_grid(clarity ~ cut) #full grid: clarity in rows, cut in columns

#We did not discuss wrap in class, but good to know
# Build on base_g, not g1: a plot holds a single facet specification, so
# adding facet_wrap() to g1 would only replace g1's facet_grid().
g_wrap <- base_g + facet_wrap(clarity ~ cut) #good for exploring, to search for interesting cases
g_wrap

plot of chunk unnamed-chunk-8

SPECIAL TYPES OF GRAPHS - Heat Maps, Maps & Choropleths

library(ggplot2)
movieGenres <- movies[c(18:23)] #subset to 6 genres. We are only interested in genres
head(movies[c(18:23)])
##   Action Animation Comedy Drama Documentary Romance
## 1      0         0      1     1           0       0
## 2      0         0      1     0           0       0
## 3      0         1      0     0           0       0
## 4      0         0      1     0           0       0
## 5      0         0      0     0           0       0
## 6      0         0      0     1           0       0
head(movieGenres)
##   Action Animation Comedy Drama Documentary Romance
## 1      0         0      1     1           0       0
## 2      0         0      1     0           0       0
## 3      0         1      0     0           0       0
## 4      0         0      1     0           0       0
## 5      0         0      0     0           0       0
## 6      0         0      0     1           0       0
tail(movies, 20)
##                         title year length   budget rating votes   r1   r2
## 58769   Zzikhimyeon jukneunda 2000     94       NA    4.7    45 14.5  4.5
## 58770                    Zzim 1998    101       NA    4.5    14 14.5  4.5
## 58771                   Zzyzx 2005     90  1000000    8.0    10 24.5  0.0
## 58772               alaska.de 2000     89       NA    6.2   186  4.5  4.5
## 58773             barefootin' 1987      3       NA    5.7     6  0.0  0.0
## 58774             deadend.com 2002    120       NA    6.9    53 64.5  4.5
## 58775                e-Dreams 2001     94       NA    6.8    86  4.5  0.0
## 58776                   eMale 2001     17       NA    7.3    15  0.0  0.0
## 58777               eRATicate 2003      9       NA    6.0     5  0.0  0.0
## 58778            eXXXorcismos 2002     78       NA    4.2    11 34.5  0.0
## 58779                eXistenZ 1999     97       NA    6.7 14742  4.5  4.5
## 58780                f2point8 2002     20       NA    5.0    13  4.5  0.0
## 58781                      f8 2001     13       NA    7.6     7  0.0  0.0
## 58782            pURe kILLjoy 1998     87       NA    5.2     6  0.0 14.5
## 58783                  sIDney 2002     15       NA    7.0     8 14.5  0.0
## 58784               tom thumb 1958     98       NA    6.5   274  4.5  4.5
## 58785             www.XXX.com 2003    105       NA    1.1    12 45.5  0.0
## 58786  www.hellssoapopera.com 1999    100       NA    6.6     5 24.5  0.0
## 58787                     xXx 2002    132 85000000    5.5 18514  4.5  4.5
## 58788 xXx: State of the Union 2005    101 87000000    3.9  1584 24.5  4.5
##         r3   r4   r5   r6   r7   r8   r9  r10  mpaa Action Animation
## 58769  4.5 24.5 24.5 14.5  4.5  4.5  0.0  4.5            0         0
## 58770 24.5  4.5 14.5 24.5  0.0  4.5  4.5  0.0            0         0
## 58771  0.0  0.0  0.0  0.0  0.0  0.0 24.5 64.5            0         0
## 58772  4.5  4.5  4.5 14.5 14.5 24.5 14.5  4.5            0         0
## 58773  0.0 14.5 45.5  0.0 34.5  0.0  0.0  0.0            0         1
## 58774  0.0  4.5  4.5  0.0  4.5  4.5  4.5  4.5            0         0
## 58775  0.0  4.5  4.5 14.5 14.5 34.5  4.5 14.5            0         0
## 58776  0.0  4.5  4.5  4.5  0.0 24.5 24.5 24.5            0         0
## 58777  0.0  0.0  0.0 44.5  0.0  0.0 24.5 44.5            0         0
## 58778  0.0  0.0  0.0  4.5  0.0 14.5  4.5 24.5            0         0
## 58779  4.5  4.5  4.5 14.5 24.5 24.5 14.5  4.5     R      0         0
## 58780  4.5  4.5 14.5  0.0 14.5  4.5 14.5 24.5            0         0
## 58781  0.0  0.0  0.0 24.5 14.5 24.5 24.5  0.0            0         1
## 58782 14.5 14.5  0.0 34.5  0.0  0.0  0.0 14.5            0         0
## 58783  0.0 14.5  0.0  0.0 24.5 14.5 14.5 24.5            1         0
## 58784  4.5  4.5 14.5 14.5 24.5 14.5  4.5  4.5            0         1
## 58785  0.0  0.0  0.0  0.0 24.5  0.0  0.0 24.5            0         0
## 58786 24.5  0.0  0.0  0.0  0.0  0.0 24.5 44.5            0         0
## 58787  4.5  4.5 14.5 14.5 14.5 14.5  4.5  4.5 PG-13      1         0
## 58788  4.5  4.5  4.5 14.5  4.5  4.5  4.5 14.5 PG-13      1         0
##       Comedy Drama Documentary Romance Short
## 58769      0     0           0       0     0
## 58770      0     0           0       0     0
## 58771      0     0           0       0     0
## 58772      0     1           0       0     0
## 58773      0     0           0       0     1
## 58774      0     1           0       0     0
## 58775      0     0           1       0     0
## 58776      1     0           0       0     1
## 58777      1     0           0       0     1
## 58778      0     1           0       0     0
## 58779      0     1           0       0     0
## 58780      0     0           0       0     1
## 58781      0     0           0       0     1
## 58782      0     0           0       0     0
## 58783      0     0           0       0     1
## 58784      0     0           0       0     0
## 58785      0     1           0       1     0
## 58786      0     0           0       0     0
## 58787      0     0           0       0     0
## 58788      0     0           0       0     0
cor(movieGenres) # 6x6 cor matrix
##               Action Animation   Comedy    Drama Documentary  Romance
## Action       1.00000  -0.05443 -0.08289  0.00776    -0.06949 -0.02336
## Animation   -0.05443   1.00000  0.17967 -0.17916    -0.05204 -0.06637
## Comedy      -0.08289   0.17967  1.00000 -0.25578    -0.14084  0.10986
## Drama        0.00776  -0.17916 -0.25578  1.00000    -0.17344  0.10355
## Documentary -0.06949  -0.05204 -0.14084 -0.17344     1.00000 -0.07158
## Romance     -0.02336  -0.06637  0.10986  0.10355    -0.07158  1.00000
#It is much easier to plot when there is one data point per row
library(reshape2)
#for that, we melt the data.
mdf <-melt(cor(movieGenres)) 
names(mdf) # 'melt' automatically creates columns Var1, Var2, value
## [1] "Var1"  "Var2"  "value"
head(mdf)
##          Var1   Var2    value
## 1      Action Action  1.00000
## 2   Animation Action -0.05443
## 3      Comedy Action -0.08289
## 4       Drama Action  0.00776
## 5 Documentary Action -0.06949
## 6     Romance Action -0.02336
# Book: Intro Stat with R, P140
library(ISwR)
## Warning: package 'ISwR' was built under R version 3.1.2
attach(heart.rate)
heart.rate
##     hr subj time
## 1   96    1    0
## 2  110    2    0
## 3   89    3    0
## 4   95    4    0
## 5  128    5    0
## 6  100    6    0
## 7   72    7    0
## 8   79    8    0
## 9  100    9    0
## 10  92    1   30
## 11 106    2   30
## 12  86    3   30
## 13  78    4   30
## 14 124    5   30
## 15  98    6   30
## 16  68    7   30
## 17  75    8   30
## 18 106    9   30
## 19  86    1   60
## 20 108    2   60
## 21  85    3   60
## 22  78    4   60
## 23 118    5   60
## 24 100    6   60
## 25  67    7   60
## 26  74    8   60
## 27 104    9   60
## 28  92    1  120
## 29 114    2  120
## 30  83    3  120
## 31  83    4  120
## 32 118    5  120
## 33  94    6  120
## 34  71    7  120
## 35  74    8  120
## 36 102    9  120
# Same as:
# Rebuild the heart.rate data by hand: 9 subjects x 4 time points = 36
# readings, stored time-major (all subjects at time 0, then 30, 60, 120).
heart.rate <- data.frame(
  hr = c(96, 110, 89, 95, 128, 100, 72, 79, 100,
         92, 106, 86, 78, 124, 98, 68, 75, 106,
         86, 108, 85, 78, 118, 100, 67, 74, 104,
         92, 114, 83, 83, 118, 94, 71, 74, 102),
  subj = gl(9, 1, 36),                             # subject id cycles 1..9
  time = gl(4, 9, 36, labels = c(0, 30, 60, 120))  # each time level repeated 9 times
)

gl(9,1,36)
##  [1] 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8
## [36] 9
## Levels: 1 2 3 4 5 6 7 8 9
# Two-way ANOVA of heart rate on subject and time. The data frame is passed
# explicitly so the model does not depend on attach(); without it a bare
# `time` would resolve to the stats::time function.
anova(lm(hr ~ subj + time, data = heart.rate))
## Analysis of Variance Table
## 
## Response: hr
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## subj       8   8967    1121   90.64 4.9e-16 ***
## time       3    151      50    4.07   0.018 *  
## Residuals 24    297      12                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One trace per subject of heart rate against time. with() looks the columns
# up in heart.rate directly, so the call does not depend on attach().
with(heart.rate, interaction.plot(time, subj, hr))

plot of chunk unnamed-chunk-9

## Back to Ram's

hm <- ggplot(data=mdf, aes(x=Var1, y=Var2, fill=value)) + geom_tile()
hm

plot of chunk unnamed-chunk-9

#Good, but the colors are very difficult to interpret.

#Example 5.20b better colors --------------

#'
#Side Example showing the use of colorRampPalette
# colorRampPalette() returns a function; calling that function with n gives n
# hex colours running from red to white.
Red_to_White_range<-colorRampPalette(c("red", "white" ) ) 
Red_to_White_range(2) # creates a vector of 2 Hex Color values
## [1] "#FF0000" "#FFFFFF"
Red_to_White_range(15) # creates a vector of 15 Hex Color values
##  [1] "#FF0000" "#FF1212" "#FF2424" "#FF3636" "#FF4848" "#FF5B5B" "#FF6D6D"
##  [8] "#FF7F7F" "#FF9191" "#FFA3A3" "#FFB6B6" "#FFC8C8" "#FFDADA" "#FFECEC"
## [15] "#FFFFFF"
#' End of Side Example. Back to main Heat Map

#set up a coloring scheme using colorRampPalette
red=rgb(1,0,0); green=rgb(0,1,0); blue=rgb(0,0,1); white=rgb(1,1,1)
red_to_White_range<-colorRampPalette(c(red, white ) ) # We can divide this into as many colorvalues as we like
white_to_Green_range<-colorRampPalette(c(white, green) ) 
########colors are ready

#Let's say that we want "red" colors for negative correlations and "green" for positives.
#(We can gray out the 1 along the diagonal.)
# scale_fill_gradient2() documents low/mid/high as single colours, so the
# multi-colour ramp is given to scale_fill_gradientn() instead. The colours are
# spread evenly over the limits: -1 is red, 0 falls on white (the join of the
# two ramps), positive values run white -> green, and exactly 1 (the diagonal)
# lands on the final "gray" entry.
hm <- hm + scale_fill_gradientn(colours=c(red_to_White_range(100),
                                          white_to_Green_range(100),
                                          "gray"),
                                limits=c(-1, 1))
#Why fill_gradientn? Because it accepts an arbitrary vector of colours,
#not just one colour each for low, mid and high.

hm

plot of chunk unnamed-chunk-9

PLOTTING LOCATIONS ON MAPS

#Example 5.21 ----MAPS
library("ggmap")
locations <- c("PVG", "sFO", "Chennai", "London", "Melbourne", "Johannesburg, SA")
coord_visited <- geocode(locations) #geocode is part of ggmap
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PVG&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=sFO&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Chennai&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=London&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Melbourne&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Johannesburg,+SA&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
locations_df <- data.frame(locations, coord_visited)

mp <- NULL #clean start
#Using GGPLOT, plot the Base World Map
world_map <- borders("world", colour="gray50", fill="gray50") # create a layer of borders
# ?borders # This is part of ggplot2

#align the two with lat and lon as the common coordinates
mp <- ggplot(locations_df, aes(x=lon, y=lat)) + world_map
mp

plot of chunk unnamed-chunk-10

#Now Layer the cities on top
mp <- mp+ geom_point(color="blue", size=5)
mp

plot of chunk unnamed-chunk-10

# Any column can drive the colour aesthetic; longitude is used here.
# This adds one more point layer to whatever mp already contains.
mp <- mp +
  geom_point(aes(color = lon), size = 3)
mp

plot of chunk unnamed-chunk-10

# Likewise, any column can drive the point size; latitude is used here,
# with colour still mapped to longitude.
mp <- mp +
  geom_point(aes(color = lon, size = lat))
mp

plot of chunk unnamed-chunk-10

# Example 5.22 ---- maps package

# Extra: the same kind of plot drawn with the maps package and base
# graphics, i.e. without ggplot2.
library(maps)

# World map: white land on a light-blue background, latitudes -60 to 90,
# no margins.
map(
  "world",
  fill = TRUE,
  col = "white",
  bg = "lightblue",
  ylim = c(-60, 90),
  mar = c(0, 0, 0, 0)
)

# Mark each geocoded location with a solid red dot.
points(locations_df$lon, locations_df$lat, col = "red", pch = 16)

plot of chunk unnamed-chunk-10

Plotting CHOROPLETHS

From: http://dict.baidu.com/s?wd=choropleth+ "The character of expressing information of choropleth maps in internet is analyzed, and the main idea and technology of publishing interactive choropleth maps are expatiated." [The Chinese text that followed this example sentence was lost to a character-encoding error.]

For a choropleth to work, we need TWO data frames. The first is a “map” data frame: the lat/long points plus a way to identify the shapes. The second data frame holds the values that we want to base our color on, keyed by the same IDs. Important: the two data frames are linked by that common “ID” column; that is how ggplot knows how to match them.

Choropleth map: [the Chinese definition that appeared here was lost to a character-encoding error]. In short, a choropleth is a thematic map in which each region is shaded according to the value of a variable measured for that region.

# Example 5.23 -------------------------------------
# Step 1: build the map data frame with the coordinates of the US states.
# See ?map_data for details.
states_map_coords <- map_data("state")

# First rows: one row per coordinate, labelled with its region (state).
head(states_map_coords)
##     long   lat group order  region subregion
## 1 -87.46 30.39     1     1 alabama      <NA>
## 2 -87.48 30.37     1     2 alabama      <NA>
## 3 -87.53 30.37     1     3 alabama      <NA>
## 4 -87.53 30.33     1     4 alabama      <NA>
## 5 -87.57 30.33     1     5 alabama      <NA>
## 6 -87.59 30.33     1     6 alabama      <NA>
# Last rows of the same data frame.
tail(states_map_coords)
##         long   lat group order  region subregion
## 15594 -106.3 41.01    63 15594 wyoming      <NA>
## 15595 -106.9 41.01    63 15595 wyoming      <NA>
## 15596 -107.3 41.02    63 15596 wyoming      <NA>
## 15597 -107.9 41.02    63 15597 wyoming      <NA>
## 15598 -109.1 40.99    63 15598 wyoming      <NA>
## 15599 -109.1 41.00    63 15599 wyoming      <NA>
# Overall size: rows x columns.
dim(states_map_coords)
## [1] 15537     6
head(states_map_coords)
##     long   lat group order  region subregion
## 1 -87.46 30.39     1     1 alabama      <NA>
## 2 -87.48 30.37     1     2 alabama      <NA>
## 3 -87.53 30.37     1     3 alabama      <NA>
## 4 -87.53 30.33     1     4 alabama      <NA>
## 5 -87.57 30.33     1     5 alabama      <NA>
## 6 -87.59 30.33     1     6 alabama      <NA>
# Step 2: build the values data frame; the fill intensity is based on it.
# state.name ships with the built-in datasets package; lower-case it so it
# matches the lower-case region names in the map data.
stateName <- tolower(state.name)
stateName
##  [1] "alabama"        "alaska"         "arizona"        "arkansas"      
##  [5] "california"     "colorado"       "connecticut"    "delaware"      
##  [9] "florida"        "georgia"        "hawaii"         "idaho"         
## [13] "illinois"       "indiana"        "iowa"           "kansas"        
## [17] "kentucky"       "louisiana"      "maine"          "maryland"      
## [21] "massachusetts"  "michigan"       "minnesota"      "mississippi"   
## [25] "missouri"       "montana"        "nebraska"       "nevada"        
## [29] "new hampshire"  "new jersey"     "new mexico"     "new york"      
## [33] "north carolina" "north dakota"   "ohio"           "oklahoma"      
## [37] "oregon"         "pennsylvania"   "rhode island"   "south carolina"
## [41] "south dakota"   "tennessee"      "texas"          "utah"          
## [45] "vermont"        "virginia"       "washington"     "west virginia" 
## [49] "wisconsin"      "wyoming"
# Number of characters in each name; spaces count too
# (e.g. "new hampshire" gives 13).
name_length <- nchar(stateName)
name_length
##  [1]  7  6  7  8 10  8 11  8  7  7  6  5  8  7  4  6  8  9  5  8 13  8  9
## [24] 11  8  7  8  6 13 10 10  8 14 12  4  8  6 12 12 14 12  9  5  4  7  8
## [47] 10 13  9  7
# One row per state, two columns: a 50 x 2 data frame.
values_df <- data.frame(stateName = stateName, name_length = name_length)
head(values_df)
##    stateName name_length
## 1    alabama           7
## 2     alaska           6
## 3    arizona           7
## 4   arkansas           8
## 5 california          10
## 6   colorado           8
# Step 3: draw the choropleth.
# The name_length values are painted onto the state map. map_id = stateName
# supplies the key for each row of values_df, which geom_map() matches
# against the region names in states_map_coords.
# expand_limits() stretches the axes to cover the full extent of the map.
mp <- ggplot(values_df, aes(map_id = stateName)) +
  geom_map(aes(fill = name_length), map = states_map_coords) +
  expand_limits(x = states_map_coords$long, y = states_map_coords$lat)

# Colour gradient from low (light grey) to high (dark green) over 0..14.
mp <- mp +
  scale_fill_gradient(low = "grey90", high = "darkgreen", limits = c(0, 14))
mp

plot of chunk unnamed-chunk-11

# Drop the fill legend. "none" is the supported way to suppress a guide;
# passing FALSE to guides() is deprecated in current ggplot2 releases.
mp <- mp + guides(fill = "none")

# Optional step for better-looking maps: draw with a map projection.
# mapproj is loaded here because coord_map() relies on it for projections.
library(mapproj)
mp <- mp + coord_map("polyconic")
mp

plot of chunk unnamed-chunk-11

#and that's the choropleth we want
########################

For Homework 4:

Question 4 of 10:

# A. (kept commented out)
# ggplot(mtcars, aes(x = factor(cyl), y = mpg, fill = cyl)) +
#   geom_boxplot() +
#   guides(fill = guide_legend(title = "Cylinders"))

# B. x uses the raw cyl column; only the fill is converted to a factor.
ggplot(mtcars, aes(x = cyl, y = mpg, fill = factor(cyl))) +
  geom_boxplot() +
  guides(fill = guide_legend(title = "Cylinders"))

plot of chunk unnamed-chunk-13

# C. x is a factor, fill is the raw cyl column, plus a discrete colour scale.
ggplot(mtcars, aes(x = factor(cyl), y = mpg, fill = cyl)) +
  geom_boxplot() +
  scale_color_discrete() +
  guides(fill = guide_legend(title = "Cylinders"))

plot of chunk unnamed-chunk-13

# D. Both x and fill use cyl converted to a factor.
ggplot(mtcars, aes(x = factor(cyl), y = mpg, fill = factor(cyl))) +
  geom_boxplot() +
  guides(fill = guide_legend(title = "Cylinders"))

plot of chunk unnamed-chunk-13

Question 5 of 10

# Starting plot for this question: mpg box plots per cylinder count.
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot()

plot of chunk unnamed-chunk-14

# Variants of the plot with a y-axis scale added:
# A. Set the y-axis limits to 0..50.
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 50))

plot of chunk unnamed-chunk-14

# B. Request y-axis breaks at 0, 10, ..., 50.
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(0, 50, 10))

plot of chunk unnamed-chunk-14

# C. Request a y-axis break at every integer from 0 to 50.

ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot() +
  scale_y_continuous(breaks = 0:50)

plot of chunk unnamed-chunk-14

# D. Request y-axis breaks at just the two values 0 and 50.
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot() +
  scale_y_continuous(breaks = c(0, 50))

plot of chunk unnamed-chunk-14

Question 6 of 10

# Original plot: bar chart of diamond counts per color, filled by cut.
ggplot(diamonds, aes(x = color, fill = cut)) +
  geom_bar()

plot of chunk unnamed-chunk-15

# A. (kept commented out; noted as not showing anything)
# ggplot(diamonds, aes(x = color, fill = cut)) +
#   geom_bar(stat = "identity")

# B. Same bars, with position = "dodge".

ggplot(diamonds, aes(x = color, fill = cut)) +
  geom_bar(position = "dodge")

plot of chunk unnamed-chunk-15

# C. Same as the original, with color wrapped in factor().
ggplot(diamonds, aes(x = factor(color), fill = cut)) +
  geom_bar()

plot of chunk unnamed-chunk-15

# D
# ggplot(diamonds, aes(x=color, fill=cut)) + geom_bar(color+cut ~ .)
# This one raises an error

Question 8 of 10

# Data frame 1: the USArrests columns plus a lower-case state key taken
# from its row names.
crimes <- data.frame(state = tolower(rownames(USArrests)), USArrests)

# Data frame 2: coordinates of the US state map.
states_map <- map_data("state")
head(states_map)
##     long   lat group order  region subregion
## 1 -87.46 30.39     1     1 alabama      <NA>
## 2 -87.48 30.37     1     2 alabama      <NA>
## 3 -87.53 30.37     1     3 alabama      <NA>
## 4 -87.53 30.33     1     4 alabama      <NA>
## 5 -87.57 30.33     1     5 alabama      <NA>
## 6 -87.59 30.33     1     6 alabama      <NA>
# Choropleth: fill each state by its Murder value, using the state column
# as the map id and stretching the axes to the extent of the map.
ggplot(crimes, aes(map_id = state)) +
  geom_map(aes(fill = Murder), map = states_map) +
  expand_limits(x = states_map$long, y = states_map$lat)

plot of chunk unnamed-chunk-16

# A.


# B

# C

# D

Question 10 of 10 1.0 Points

Facet Grid question:

The following graph was created using ggplot’s facet_grid option. It used the mtcars dataset.

The relevant variables are:

mpg - miles per gallon

am - automatic or manual: 0 or 1 in the dataset

gear - 3,4 or 5 gears by automobile

cyl - 4, 6 or 8 cylinder engines.

# Base scatter plot (mpg against am) reused by every facet option below.
base_g <- ggplot(mtcars, aes(x = am, y = mpg)) +
  geom_point()
base_g

plot of chunk unnamed-chunk-17

# Facet rows by gear, columns by cyl.
base_g +
  facet_grid(gear ~ cyl)

plot of chunk unnamed-chunk-17

# Facet rows by am, columns by cyl. Marked "This one" in the original notes.
base_g +
  facet_grid(am ~ cyl)

plot of chunk unnamed-chunk-17

# Facet rows by am and gear, columns by mpg and cyl.
base_g +
  facet_grid(am + gear ~ mpg + cyl)

plot of chunk unnamed-chunk-17

# Facet rows by am and cyl, columns by gear.
base_g +
  facet_grid(am + cyl ~ gear)

plot of chunk unnamed-chunk-17