frankdavenport — May 23, 2013, 9:55 AM
#---ggplot2 Tutorial
#Author: Frank Davenport
#-------Set up----------------------------
# NOTE: the original script began by wiping the workspace with rm(list=ls()).
# That silently deletes the reader's own objects without resetting loaded
# packages or options, so it has been dropped; start a fresh R session instead.
library(ggplot2) #the ggplot2 package
library(scales) #complements ggplot2 and also works with other graphics packages
#---Two packages that are great for data manipulation (also by the ggplot2 author)
library(reshape2) #melt()
library(plyr) #ddply()
#=========================================
#------Take a Sample from the Diamond Data-----
head(diamonds) #take a peek
carat cut color clarity depth table price x y z
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Fix the RNG seed so the 1000-row sample is reproducible
# (follows pg. 11 of the original ggplot2 book).
set.seed(1410)
# 'pd' = plot data: a random sample of 1000 rows of diamonds
pd <- diamonds[sample(nrow(diamonds), 1000), ]
#============================================
#----Some basic plots----------
# The base object only carries the data and the x/y mapping.
# Printing it on its own draws nothing useful: no layers/geoms yet.
p <- ggplot(pd, aes(carat, price))

# Points
p1 <- p + geom_point()
p1

# A line through the same observations
p2 <- p + geom_line()
p2

# Points plus a fitted smoother. No method is given, so geom_smooth()
# picks one itself (a gam for this sample size, per the message it prints).
p3 <- p1 + geom_smooth()
p3
geom_smooth: method="auto" and size of largest group is >=1000, so using
gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the
smoothing method.
#=============================================================
#---------Lets Specify x and y in the geom, not the call to ggplot-------
#--In the examples above the arguments for geom_() are inherited from ggplot
#---We can also specify the x,y in the geom, rather than the call to ggplot
p<-ggplot(data=pd) #no aesthetics (aes()) specified here
p0<-p+geom_point(aes(x=carat,y=price)) #instead we specify it in geom_point() but we get the same result
p0
#--This is useful, as not all geoms take the same aesthetics
#  The bare density is stored under its own name. Assigning it back to `p`
#  would make every later `p+...` plot carry this extra density layer too.
pdens<-p+geom_density(aes(x=price)) #looks weird
pdens
p1<-p+geom_density(aes(x=price),fill='red') #we can manually specify some aesthetics
p1+labs(title='Here we specify the fill color')
p2<-p+geom_density(aes(x=price,fill=cut)) #or we can fill based on an attribute; Note that here, fill is inside aes()
p2+labs(title='Here we Map the fill color \nto an attribute of the data')
#The help file for each geom specifies what aesthetics it takes
#==============================================================
#-----Lets quickly explore some common geoms
#--Start again from a bare plot of the sample, so each geom below supplies its own aes()
p<-ggplot(data=pd)
#--Histogram of price with bars filled by cut; no binwidth is given, so stat_bin reports the default it used
p+geom_histogram(aes(x=price,fill=cut)) #histogram
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
this.
# Distribution of price within each cut, as violins and then as boxes
p + geom_violin(aes(cut, price))
p + geom_boxplot(aes(cut, price))
# Text geom: each observation is drawn as its cut label
p + geom_text(aes(carat, price, label = cut))
#---The hex geom is useful for visualizing large datasets:
#   the same carat/price plot, but binned over the full diamonds data
ggplot(diamonds) + geom_hex(aes(carat, price), bins = 10)
#============================================================
#---Bar graph variations, driven by the 'position' argument
p <- ggplot(pd, aes(color))

# Plain counts per color
p + geom_bar() +
  labs(title='Standard Bar Graph')

# Bars split (stacked) by a second variable
p + geom_bar(aes(fill = cut)) +
  labs(title='Filled by Cut')

# Each bar rescaled to the same height: shares rather than counts
p + geom_bar(aes(fill = cut), position = 'fill') +
  labs(title='A stacked Ratio using "position=fill"')

# One bar per cut, placed next to each other
p + geom_bar(aes(fill = cut), position = 'dodge') +
  labs(title='Side by Side, using "position=dodge"')
#=============================================================
#----------NOW LETS LOOK AT SCALES AND FACETS----------------
#---Lets Explore a Few More Aesthetics, and Facets------------------
p<-ggplot(data=pd,aes(x=carat,y=price)) #baseline: price against carat for the sampled data
p1<-p+geom_point(aes(shape=cut))+labs(title='Different Shapes for Different Cuts')
p1
p2<-p+geom_point(aes(size=depth))+labs(title='Different Sizes for Different Depths')
p2
#alpha is set outside aes(), so it is one constant for every point rather than a mapping
p2a<-p+geom_point(alpha=0.25,aes(size=depth))+labs(title='Different Sizes with Transparency')
p2a
#facet_wrap: one panel per cut
p3<-p2+facet_wrap(~cut)+labs(title='Different Facets for Each Cut')
p3
#facet_grid: rows by cut, columns by color
p4<-p2+facet_grid(cut~color)+labs(title='A Facet Grid, based on Cut and Color')
p4
p4a<-p4+geom_smooth(method='lm',size=1)+labs(title='We added a Linear Regression Line to each Facet')
p4a
#TRUE is spelled out: T is an ordinary variable that can be reassigned
p4b<-p4a+facet_grid(cut~color,margins=TRUE)+labs(title='With margins=TRUE we can view the marginal facets\n(last column and last row)')
p4b
#======================================================================
#-----Taking Care of Details using theme() and scales----------------
#--Make a New baseline Plot
#  labs() takes name=value pairs; the title must be passed as title=, otherwise the string is never used as the plot title
p<-ggplot(data=pd,aes(x=carat,y=price))+geom_point(aes(size=depth,color=cut))+labs(title='Our Basic Plot\nWith Scales for X,Y,Color,and Size')
p
#--Format, and relabel axis
p<-p+scale_y_continuous(name='$$$',labels=dollar)+labs(title='Changed the name and formatting on the Y axis\n(using scale_y_continuous)')
p
p<-p+scale_x_continuous(name='Natural Log of Carat',trans='log')+labs(title='Applied a Natural Log Transformation \nto the X-Axis Using the "trans" Argument')
p
#---Play Around with the Color and Size Scales
p<-p+scale_color_discrete(name='Diamond\nCut',h=c(0,180))+labs(title='Changed the Name and Hues of the Color Scale')
p
p<-p+scale_size_continuous(name='Diamond Depth',breaks=c(55,60,65,69),range=c(1,4))+labs(title='Changed the Name, Break Points, and Relative Sizes on the Size Scale') #the range controls the relative size of the smallest, vs the largest sizes
p
#=============================================================
#---Make More Adjustments with Theme------
#  title= added: labs() takes name=value pairs, so the bare string never became the plot title
p<-p+theme(legend.position='bottom')+labs(title='Changed the Position of the Legend Using the "theme()" Function')
p
#  NOTE(review): theme_bw() is a complete theme (see the "complete" attribute in the str() output),
#  so it presumably resets the legend.position chosen just above -- confirm whether that is intended
p<-p+theme_bw()+labs(title='Changed the Basic Layout using a default Theme Element')
p
pbig<-p+theme(text=element_text(size=24))+labs(title='All text big')
pbig
ptbig<-p+theme(text=element_text(color='blue'),plot.title=element_text(size=24))+labs(title='Just the Title text is big\nBut all text is blue')
ptbig
#see ?theme for more options.
#--We can also use str() to explore the object and make manual changes
str(ptbig,max.level=1) #only show the top level of the plot object
List of 9
$ data :'data.frame': 1000 obs. of 10 variables:
$ layers :List of 1
$ scales :Reference class 'Scales' [package "ggplot2"] with 1 fields
..and 20 methods, of which 9 are possibly relevant
$ mapping :List of 2
$ theme :List of 38
..- attr(*, "class")= chr [1:2] "theme" "gg"
..- attr(*, "complete")= logi TRUE
$ coordinates:List of 1
..- attr(*, "class")= chr [1:2] "cartesian" "coord"
$ facet :List of 1
..- attr(*, "class")= chr [1:2] "null" "facet"
$ plot_env :<environment: R_GlobalEnv>
$ labels :List of 6
- attr(*, "class")= chr [1:2] "gg" "ggplot"
#=================================================================================
#-----------Fun Extras---------------------------------------------------------
#--Visualize correlation matrices using geom_raster and geom_text
#--Work on `d`, a copy of the economics data set; the first column is the date, the rest are numeric series
d<-economics
head(d) #take a peek at the first rows
date pce pop psavert uempmed unemploy
1 1967-06-30 507.8 198712 9.8 4.5 2944
2 1967-07-31 510.9 198911 9.8 4.7 2945
3 1967-08-31 516.7 199113 9.0 4.6 2958
4 1967-09-30 513.3 199311 9.8 4.9 3143
5 1967-10-31 518.5 199498 9.7 4.7 3066
6 1967-11-30 526.2 199657 9.4 4.8 3018
# Pairwise correlations of every column after the first one (the date),
# rounded to two decimals for display
dc <- round(as.matrix(cor(d[, seq(2, ncol(d))])), 2)
dc
pce pop psavert uempmed unemploy
pce 1.00 0.99 -0.93 0.51 0.32
pop 0.99 1.00 -0.89 0.55 0.41
psavert -0.93 -0.89 1.00 -0.36 -0.08
uempmed 0.51 0.55 -0.36 1.00 0.78
unemploy 0.32 0.41 -0.08 0.78 1.00
dc[upper.tri(dc,diag=FALSE)]<-NA #We only want to plot 1/2 the matrix
pd<-melt(t(dc),value.name='Correlation') #convert it to a data.frame readable by ggplot; transposing it with 't()' helps keep it properly oriented
#--Reverse the factor levels on the y axis so it is properly oriented
#  (TRUE/FALSE are spelled out: T and F are ordinary variables that can be reassigned)
levs<-levels(pd$Var1)
pd$Var2<-factor(as.character(pd$Var2),levels=sort(levs,decreasing=TRUE))
head(pd) #this is what the data looks like before we plot it
Var1 Var2 Correlation
1 pce pce 1.00
2 pop pce NA
3 psavert pce NA
4 uempmed pce NA
5 unemploy pce NA
6 pce pop 0.99
#Lets make some labels for the axes (same order as the variables: pce, pop, psavert, uempmed, unemploy)
axlabels<-c('p. Consumption','Population','P. Savings Rate','Unemp. Duration','Unemp.')
#The base object holds only data and mappings; the raster layer is added once, on the next line
#(previously geom_raster() was added here as well, drawing the same tiles twice)
p<-ggplot(data=pd,aes(x=Var1,y=Var2,fill=Correlation,label=Correlation))
p<-p+geom_raster()+geom_text()+theme_bw()+labs(title='The Raw Plot')
p
Warning: Removed 10 rows containing missing values (geom_text).
#A diverging color gradient for the fill; NA cells (the blanked upper triangle) are drawn white
p<-p+scale_fill_gradient2(name='Correlation',na.value='white')
p+labs(title='Now we have a divergent Color Scale')
Warning: Removed 10 rows containing missing values (geom_text).
#---Make some final cosmetic changes
#--Theme tweaks: panel border, major grid lines, and axis text size/angle
#  NOTE(review): colour=NULL / fill=NULL look like they leave the current theme_bw() values in place
#  rather than removing the border and grid; if hiding them is the intent, element_blank() is the usual tool -- confirm
p<-p+theme(panel.border=element_rect(colour=NULL,fill=NULL),panel.grid.major=element_line(colour=NULL),axis.text=element_text(size=12),axis.text.x=element_text(size=10,angle=15))
#--No padding around the tiles, and readable axis labels; the y labels are reversed because the Var2 levels were reversed
p<-p+scale_x_discrete(expand=c(0,0),labels=axlabels)+scale_y_discrete(expand=c(0,0),labels=rev(axlabels))
#--Blank axis titles and a final plot title
p<-p+labs(x='',y='',title='Macroeconomic Correlations\n(1967-2007)')
p+coord_equal() #coord_equal() will force your graphics to be 'square' avoiding the 'squishy look'
Warning: Removed 10 rows containing missing values (geom_text).
#====================================================================================================
#-----------------------Now lets make the same plot, but panel it by year
# Bin each date into its calendar year with cut(), then keep only the
# first four characters of the bin label (the year; month and day are dropped)
d$year <- substr(as.character(cut(d$date, 'years')), 1, 4)
#--Create a custom function that will prep the data
#--fun.gprep: turn one data.frame into the long-format correlation table used for plotting.
#   df   : a data.frame containing the numeric columns named in `vars`
#   vars : names of the columns to correlate
#   Returns a data.frame with columns Var1, Var2 and Correlation (rounded to 2 decimals).
#   One triangle of the matrix is set to NA so only half of it is drawn, and the levels
#   of Var2 are put in decreasing alphabetical order so the y axis is properly oriented.
fun.gprep<-function(df,vars=c('pce','pop','psavert','uempmed','unemploy')){
  #drop=FALSE keeps a data.frame even when `vars` names a single column
  dc<-round(as.matrix(cor(df[,vars,drop=FALSE])),2)
  dc[upper.tri(dc,diag=FALSE)]<-NA #We only want to plot 1/2 the matrix
  pd<-melt(t(dc),value.name='Correlation')
  levs<-levels(pd$Var1)
  pd$Var2<-factor(as.character(pd$Var2),levels=sort(levs,decreasing=TRUE))
  pd
}
#--Now we will apply that function to each year in the dataset, using ddply() from the plyr package
pd<-ddply(d,'year',fun.gprep)
#---Show the time series example, with tables
#  The base object holds only data and mappings; the raster layer is added once, on the next line
#  (previously geom_raster() was added here as well, drawing the same tiles twice)
p<-ggplot(data=pd,aes(x=Var1,y=Var2,fill=Correlation,label=Correlation))
p<-p+geom_raster()+theme_bw()+facet_wrap(~year) #one panel per year
p<-p+scale_fill_gradient2(name='Correlation',na.value='white') #create a diverging color gradient
p<-p+scale_x_discrete(name="",expand=c(0,0),labels=axlabels)+scale_y_discrete(name="",expand=c(0,0),labels=rev(axlabels))
p<-p+theme(panel.border=element_rect(colour=NULL,fill=NULL),panel.grid.major=element_line(colour=NULL),axis.text.x=element_text(size=6,angle=15),legend.position='bottom')
p+coord_equal()