ggplot2_tutorial.R

frankdavenport — May 23, 2013, 9:55 AM

#---ggplot2 Tutorial

#Author: Frank Davenport



#-------Set up----------------------------
#NOTE: no rm(list=ls()) here. Wiping the workspace destroys whatever the caller
#had loaded and still does not reset attached packages or options; start a
#fresh R session if a clean slate is needed.

library(ggplot2) #the ggplot2 package
library(scales) #complements ggplot2 and also works with other graphics packages

#---Two packages that are great for data manipulation (also by the ggplot2 author)
library(reshape2)
library(plyr)


#=========================================

#------Take a Sample from the Diamond Data-----
head(diamonds, n = 6) #preview the first six rows
  carat       cut color clarity depth table price    x    y    z
1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48

set.seed(1410) #fixed seed so the draw is reproducible--follows pg. 11 from original ggplot2 book
pd <- diamonds[sample(x = nrow(diamonds), size = 1000), ] #pd stands for 'plot data': 1000 randomly chosen rows

#============================================


#----Lets make some basic plots----------

#Base plot object: the data plus the x/y mappings, no layers yet
p <- ggplot(data = pd, mapping = aes(x = carat, y = price))
#p  # nothing useful is drawn at this stage, because no layers or geoms have been supplied

#--Draw the data as points
p1 <- p + geom_point()
p1

plot of chunk unnamed-chunk-1


#-Draw the same data as a line
p2 <- p + geom_line()
p2

plot of chunk unnamed-chunk-1


#--Points plus a fit line
#geom_smooth() fits the line; a method can be specified, otherwise it picks one itself (here a gam)
p3 <- p +
  geom_point() +
  geom_smooth()
p3
geom_smooth: method="auto" and size of largest group is >=1000, so using
gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the
smoothing method.

plot of chunk unnamed-chunk-1


#=============================================================


#---------Lets Specify x and y in the geom, not the call to ggplot-------

#--In the examples above the geoms inherited their aesthetics from the ggplot() call
#---The x,y mapping can be handed to the geom itself instead
p <- ggplot(data = pd) #no aesthetics (aes()) specified here
p0 <- p + geom_point(mapping = aes(x = carat, y = price)) #mapping given in geom_point() instead; the result is the same
p0

plot of chunk unnamed-chunk-1


#--This is useful, as not all geoms take the same aesthetics
#The plot is printed without assigning it back to 'p': the density plots built
#from 'p' further down add their own density layer, so 'p' has to stay free of
#layers or each of them would carry this extra unfilled density as well.
p+geom_density(aes(x=price)) #looks weird

plot of chunk unnamed-chunk-1


p1 <- p +
  geom_density(mapping = aes(x = price), fill = 'red') #a fixed aesthetic value is set outside aes()
p1 + labs(title = 'Here we specify the fill color')

plot of chunk unnamed-chunk-1



p2 <- p +
  geom_density(mapping = aes(x = price, fill = cut)) #fill sits inside aes() here, so it is mapped to the 'cut' column
p2 + labs(title = 'Here we Map the fill color \nto an attribute of the data')

plot of chunk unnamed-chunk-1


#The help file for each geom specifies which aesthetics it takes

#==============================================================

#-----Lets Quickly explore some common geoms
p <- ggplot(data = pd) #bare plot object; every geom below supplies its own aes()

p + geom_histogram(mapping = aes(x = price, fill = cut)) #histogram
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
this.

plot of chunk unnamed-chunk-1


p + geom_violin(mapping = aes(x = cut, y = price)) #violin plot of price for each cut

plot of chunk unnamed-chunk-1


p + geom_boxplot(mapping = aes(x = cut, y = price)) #box plot of price for each cut

plot of chunk unnamed-chunk-1


p + geom_text(mapping = aes(x = carat, y = price, label = cut)) #the text geom: each observation is drawn as its 'cut' label

plot of chunk unnamed-chunk-1


#---The hex geom is useful for visualizing large datasets
ggplot(data = diamonds) +
  geom_hex(mapping = aes(x = carat, y = price), bins = 10) #the carat/price plot again, this time on the full diamonds dataset

plot of chunk unnamed-chunk-1


#============================================================


#---Different Variations on bar graphs, by playing with the 'position' argument
p <- ggplot(data = pd, mapping = aes(x = color))

p +
  geom_bar() +
  labs(title = 'Standard Bar Graph') #plain bar chart

plot of chunk unnamed-chunk-1


p +
  geom_bar(mapping = aes(fill = cut)) +
  labs(title = 'Filled by Cut') #bars coloured by a second variable

plot of chunk unnamed-chunk-1


p +
  geom_bar(mapping = aes(fill = cut), position = 'fill') +
  labs(title = 'A stacked Ratio using "position=fill"') #stacked ratio

plot of chunk unnamed-chunk-1


p +
  geom_bar(mapping = aes(fill = cut), position = 'dodge') +
  labs(title = 'Side by Side, using "position=dodge"') #bars placed next to each other

plot of chunk unnamed-chunk-1





#=============================================================


#----------NOW LETS LOOK AT SCALES AND FACETS----------------

#---Lets Explore a Few More Aesthetics, and Facets------------------
p <- ggplot(data = pd, mapping = aes(x = carat, y = price))
p1 <- p +
  geom_point(mapping = aes(shape = cut)) +
  labs(title = 'Different Shapes for Different Cuts')
p1

plot of chunk unnamed-chunk-1


p2 <- p +
  geom_point(mapping = aes(size = depth)) +
  labs(title = 'Different Sizes for Different Depths')
p2

plot of chunk unnamed-chunk-1


p2a <- p +
  geom_point(mapping = aes(size = depth), alpha = 0.25) + #alpha is a fixed value, so it stays outside aes()
  labs(title = 'Different Sizes with Transparency')
p2a

plot of chunk unnamed-chunk-1


p3 <- p2 +
  facet_wrap(facets = ~cut) +
  labs(title = 'Different Facets for Each Cut')
p3

plot of chunk unnamed-chunk-1


p4 <- p2 +
  facet_grid(cut ~ color) +
  labs(title = 'A Facet Grid, based on Cut and Color')
p4

plot of chunk unnamed-chunk-1


p4a <- p4 +
  geom_smooth(method = 'lm', size = 1) +
  labs(title = 'We added a Linear Regression Line to each Facet')
p4a

plot of chunk unnamed-chunk-1


#margins=TRUE adds the marginal facets. TRUE is spelled out because T is an
#ordinary variable that can be reassigned, and the title now names the real argument.
p4b<-p4a+facet_grid(cut~color,margins=TRUE)+labs(title='With margins=TRUE we can view the marginal facets\n(last column and last row)')
p4b

plot of chunk unnamed-chunk-1


#======================================================================



#-----Taking Care of Details using theme() and scales----------------
#--Make a New baseline Plot
#The title has to be passed as 'title=': an unnamed string handed to labs() is
#not used as the plot title, so the plot would be drawn without one.
p<-ggplot(data=pd,aes(x=carat,y=price))+geom_point(aes(size=depth,color=cut))+labs(title='Our Basic Plot\nWith Scales for X,Y,Color,and Size')
p

plot of chunk unnamed-chunk-1


#--Format and relabel the y axis
p <- p +
  scale_y_continuous(name = '$$$', labels = dollar) + #'dollar' is a label formatter from the scales package
  labs(title = 'Changed the name and formatting on the Y axis\n(using scale_y_continuous)')
p

plot of chunk unnamed-chunk-1

p <- p +
  scale_x_continuous(name = 'Natural Log of Carat', trans = 'log') +
  labs(title = 'Applied a Natural Log Transformation \nto the X-Axis Using the "trans" Argument')
p

plot of chunk unnamed-chunk-1


#---Play Around with the Color and Size Scales
p <- p +
  scale_color_discrete(name = 'Diamond\nCut', h = c(0, 180)) +
  labs(title = 'Changed the Name and Hues of the Color Scale')
p

plot of chunk unnamed-chunk-1


p <- p +
  scale_size_continuous(
    name = 'Diamond Depth',
    breaks = c(55, 60, 65, 69),
    range = c(1, 4) #range controls the relative size of the smallest vs. the largest points
  ) +
  labs(title = 'Changed the Name, Break Points, and Relative Sizes on the Size Scale')
p

plot of chunk unnamed-chunk-1

#=============================================================

#---Make More Adjustments with Theme------
#The title text is supplied through the 'title' argument of labs(); passed
#unnamed it would not replace the plot title.
p<-p+theme(legend.position='bottom')+labs(title='Changed the Position of the Legend Using the "theme()" Function')
p

plot of chunk unnamed-chunk-1


p <- p +
  theme_bw() +
  labs(title = 'Changed the Basic Layout using a default Theme Element')
p

plot of chunk unnamed-chunk-1


pbig <- p +
  theme(text = element_text(size = 24)) +
  labs(title = 'All text big')
pbig

plot of chunk unnamed-chunk-1


ptbig <- p +
  theme(
    text = element_text(color = 'blue'),
    plot.title = element_text(size = 24)
  ) +
  labs(title = 'Just the Title text is big\nBut all text is blue')
ptbig

plot of chunk unnamed-chunk-1



#see ?theme for more options.

#--str() lets us explore the plot object, which helps when making manual changes
str(ptbig, max.level = 1)
List of 9
 $ data       :'data.frame':    1000 obs. of  10 variables:
 $ layers     :List of 1
 $ scales     :Reference class 'Scales' [package "ggplot2"] with 1 fields
  ..and 20 methods, of which 9 are possibly relevant
 $ mapping    :List of 2
 $ theme      :List of 38
  ..- attr(*, "class")= chr [1:2] "theme" "gg"
  ..- attr(*, "complete")= logi TRUE
 $ coordinates:List of 1
  ..- attr(*, "class")= chr [1:2] "cartesian" "coord"
 $ facet      :List of 1
  ..- attr(*, "class")= chr [1:2] "null" "facet"
 $ plot_env   :<environment: R_GlobalEnv> 
 $ labels     :List of 6
 - attr(*, "class")= chr [1:2] "gg" "ggplot"

#=================================================================================

#-----------Fun Extras---------------------------------------------------------

#--Visualize correlation matrices using geom_raster and geom_text
d <- economics
head(d, n = 6)
        date   pce    pop psavert uempmed unemploy
1 1967-06-30 507.8 198712     9.8     4.5     2944
2 1967-07-31 510.9 198911     9.8     4.7     2945
3 1967-08-31 516.7 199113     9.0     4.6     2958
4 1967-09-30 513.3 199311     9.8     4.9     3143
5 1967-10-31 518.5 199498     9.7     4.7     3066
6 1967-11-30 526.2 199657     9.4     4.8     3018

dc <- round(as.matrix(cor(d[, 2:ncol(d)])), digits = 2) #correlations of every column after the first, rounded to 2 decimals
dc
           pce   pop psavert uempmed unemploy
pce       1.00  0.99   -0.93    0.51     0.32
pop       0.99  1.00   -0.89    0.55     0.41
psavert  -0.93 -0.89    1.00   -0.36    -0.08
uempmed   0.51  0.55   -0.36    1.00     0.78
unemploy  0.32  0.41   -0.08    0.78     1.00

dc[upper.tri(dc,diag=FALSE)]<-NA #We only want to plot 1/2 the matrix (FALSE spelled out: F is a reassignable variable)

pd<-melt(t(dc),value.name='Correlation') #convert it to a data.frame readable by ggplot; transposing with 't()' helps keep it properly oriented

#--Reverse the factor levels on the y axis so it is properly oriented
levs<-levels(pd$Var1)
pd$Var2<-as.character(pd$Var2)
pd$Var2<-factor(pd$Var2,levels=sort(levs,decreasing=TRUE))

head(pd) #this is what the data looks like before we plot it
      Var1 Var2 Correlation
1      pce  pce        1.00
2      pop  pce          NA
3  psavert  pce          NA
4  uempmed  pce          NA
5 unemploy  pce          NA
6      pce  pop        0.99

#Lets make some labels for the axes (these strings are drawn on the plots, so spelling matters)
axlabels<-c('P. Consumption','Population','P. Savings Rate','Unemp. Duration','Unemp.')

#Base plot: data and mappings only. geom_raster() is added exactly once below;
#a second identical raster layer would only redraw the same tiles.
p<-ggplot(data=pd,aes(x=Var1,y=Var2,fill=Correlation,label=Correlation))
p<-p+geom_raster()+geom_text()+theme_bw()+labs(title='The Raw Plot')
p
Warning: Removed 10 rows containing missing values (geom_text).

plot of chunk unnamed-chunk-1


p<-p+scale_fill_gradient2(name='Correlation',na.value='white') #create a diverging color gradient; NA cells are drawn white
p+labs(title='Now we have a divergent Color Scale')
Warning: Removed 10 rows containing missing values (geom_text).

plot of chunk unnamed-chunk-1


#---Final cosmetic changes: theme tweaks, then axis labels, then titles
p <- p +
  theme(
    panel.border = element_rect(colour = NULL, fill = NULL),
    panel.grid.major = element_line(colour = NULL),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(size = 10, angle = 15)
  ) +
  scale_x_discrete(expand = c(0, 0), labels = axlabels) +
  scale_y_discrete(expand = c(0, 0), labels = rev(axlabels)) +
  labs(x = '', y = '', title = 'Macroeconomic Correlations\n(1967-2007)')

p + coord_equal() #coord_equal() forces the graphic to be 'square', avoiding the 'squishy look'
Warning: Removed 10 rows containing missing values (geom_text).

plot of chunk unnamed-chunk-1



#====================================================================================================

#-----------------------Now Lets Make the Same Plot but Panel it by Year
#cut() bins the dates by year; keeping the first four characters of each bin label strips away month and day
d$year <- substr(as.character(cut(d$date, 'years')), start = 1, stop = 4)

#--Create a custom function that will prep the data
#--This function takes one data.frame as an argument and does all the transformations we performed above
#
# Args:
#   df:   data.frame containing the columns named in 'vars'.
#   vars: character vector with the names of the columns to correlate.
# Returns:
#   A long-format data.frame with columns Var1, Var2 and Correlation, one row
#   per cell of the correlation matrix. Correlations are rounded to 2 decimals
#   and the upper triangle of the matrix is set to NA before it is transposed
#   and melted. Var2 is a factor whose levels are the levels of Var1 sorted in
#   decreasing order.
fun.gprep<-function(df,vars=c('pce','pop','psavert','uempmed','unemploy')){
  dc<-cor(df[,vars,drop=FALSE]) #drop=FALSE keeps a single selected column matrix-like, which cor() requires
  dc<-round(as.matrix(dc),2)
  dc[upper.tri(dc,diag=FALSE)]<-NA #We only want to plot 1/2 the matrix
  pd<-melt(t(dc),value.name='Correlation') #transposing helps keep the plot properly oriented
  #Reverse the factor levels used on the y axis
  levs<-levels(pd$Var1)
  pd$Var2<-as.character(pd$Var2)
  pd$Var2<-factor(pd$Var2,levels=sort(levs,decreasing=TRUE))
  pd
}

#--Run the prep function on each year of the dataset with ddply() from the plyr package
pd <- ddply(.data = d, .variables = 'year', .fun = fun.gprep)

#---Show the time series example: one correlation panel per year
#geom_raster() is added exactly once; a second identical raster layer would
#only redraw the same tiles.
p<-ggplot(data=pd,aes(x=Var1,y=Var2,fill=Correlation,label=Correlation))
p<-p+geom_raster()+theme_bw()+facet_wrap(~year)
p<-p+scale_fill_gradient2(name='Correlation',na.value='white') #create a diverging color gradient
p<-p+scale_x_discrete(name="",expand=c(0,0),labels=axlabels)+scale_y_discrete(name="",expand=c(0,0),labels=rev(axlabels))
p<-p+theme(panel.border=element_rect(colour=NULL,fill=NULL),panel.grid.major=element_line(colour=NULL),axis.text.x=element_text(size=6,angle=15),legend.position='bottom')
p+coord_equal()

plot of chunk unnamed-chunk-1