Getting Started with GGPlot

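ggplot2 is based on a grammar of graphics, which means that every plot is assembled from the same independent building blocks: a data set, aesthetic mappings from variables to visual properties, one or more geometric layers (geoms), and optional statistics, scales, coordinate systems and themes, all combined with `+`.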

Resources

The documentation provides details.

This cheatsheet is a handy reference guide.

This blog post has a lot of nice examples.

# Load the survey responses. stringsAsFactors = TRUE keeps the text columns as
# factors; since R 4.0 read.csv() returns character columns by default, which
# would make the later levels() call on this data return NULL.
religion <- read.csv("religion-survey-results.csv", stringsAsFactors = TRUE)
#names(religion)
# Give the two columns used in the first plots short names (renamed by position).
names(religion)[c(1, 5)] <- c("relig", "prayMotion")

Before we can use ggplot2 to make plots, we need to load the library.

library(ggplot2)

First graphs

# Bar chart of the number of respondents per religion.
ggplot(religion, aes(x = relig)) +
  geom_bar()

# The same chart with the axes flipped (horizontal bars).
ggplot(religion, aes(x = relig)) +
  geom_bar() +
  coord_flip()

# Keep only the four groups of interest and discard the factor levels that are
# no longer used.
keptGroups <- c("Roman Catholic", "Protestant", "Atheist", "Agnostic")
religion <- droplevels(religion[religion$relig %in% keptGroups, , drop = FALSE])

# Flipped bar chart of the reduced data (drawn twice).
ggplot(religion, aes(x = relig)) +
  geom_bar() +
  coord_flip()

ggplot(religion, aes(x = relig)) +
  geom_bar() +
  coord_flip()

# Counts per prayMotion answer: a point at each count plus a very thin bar.
ggplot(religion, aes(x = prayMotion, fill = prayMotion)) +
  geom_point(stat = "count") +
  geom_bar(width = 0.02) +
  coord_flip()

# Share of prayMotion answers within each religion (bars scaled to equal length).
ggplot(religion, aes(x = relig, fill = prayMotion)) +
  geom_bar(position = "fill") +
  coord_flip()

# A single filled bar wrapped onto polar coordinates, i.e. a pie chart of religion.
ggplot(religion, aes(x = factor(1), fill = relig)) +
  geom_bar(position = "fill") +
  coord_polar(theta = "y")

# Tiny two-category example: two "C2" rows followed by four "C1" rows.
toyData <- data.frame(Cat = c("C2", "C2", "C1", "C1", "C1", "C1"))
ggplot(toyData, aes(x = Cat, fill = Cat)) +
  geom_bar(width = 1) +
  coord_polar() +
  scale_y_sqrt()

# The same construction applied to the survey data, one wedge per religion.
ggplot(religion, aes(x = relig, fill = relig)) +
  geom_bar(width = 1, stat = "count") +
  scale_y_sqrt() +
  coord_polar()

Some additional data processing and graphs not presented in class:

# Name two more survey columns (by position) for the plots that follow.
names(religion)[c(9, 19)] <- c("offerPrayer", "comfOfferPrayer")

# Base plot: counts per religion, stacked by offerPrayer, with flipped axes.
g <- ggplot(religion, aes(x = relig, fill = offerPrayer)) +
  geom_bar() +
  coord_flip()

# Draw the same plot once under each of several built-in themes.
g + theme_bw()

g + theme_light()

g + theme_dark()

g + theme_minimal()

g + theme_classic()

g + theme_linedraw()

# Share of offerPrayer answers within each religion, before any filtering.
ggplot(religion, aes(x = relig, fill = offerPrayer)) +
  geom_bar(position = "fill") +
  coord_flip()

# Remove rows whose offerPrayer answer is the empty string, then redraw (twice).
religion <- subset(religion, !(offerPrayer == ""))
ggplot(religion, aes(x = relig, fill = offerPrayer)) +
  geom_bar(position = "fill") +
  coord_flip()

ggplot(religion, aes(x = relig, fill = offerPrayer)) +
  geom_bar(position = "fill") +
  coord_flip()

# The same breakdown for comfOfferPrayer.
ggplot(religion, aes(x = relig, fill = comfOfferPrayer)) +
  geom_bar(position = "fill") +
  coord_flip()

This plot, showing the comfort level of those offering to pray with others, is not very informative because a large proportion of respondents say they don’t do this. Let’s restrict our attention to those who do.

# Keep only respondents who gave a comfort rating: drop empty answers and
# "I don't do this", then discard the factor levels that are no longer used.
religion <- droplevels(
  subset(
    religion,
    !(comfOfferPrayer == "" | comfOfferPrayer == "I don't do this")
  )
)
ggplot(religion, aes(x = relig, fill = comfOfferPrayer)) +
  geom_bar(position = "fill") +
  coord_flip()

The order of these labels doesn’t make sense. Let’s change it so that it follows a logical order (instead of the alphabetical order that R assigns to factor levels by default). Before we change the order, let’s look at the data.

# Current level order of the comfort factor.
levels(religion[["comfOfferPrayer"]])
## [1] "Extremely comfortable"  "Not at all comfortable"
## [3] "Not so comfortable"     "Somewhat comfortably"  
## [5] "Very comfortable"
# Number of respondents per level, for comparison after re-levelling.
table(religion[["comfOfferPrayer"]])
## 
##  Extremely comfortable Not at all comfortable     Not so comfortable 
##                     83                     16                     43 
##   Somewhat comfortably       Very comfortable 
##                     65                     84

Now, let’s reorder the levels so they go from the highest to the lowest level of comfort.

# Re-level the factor from most to least comfortable. The strings must match
# the existing values exactly (including "Somewhat comfortably").
comfortOrder <- c(
  "Extremely comfortable",
  "Very comfortable",
  "Somewhat comfortably",
  "Not so comfortable",
  "Not at all comfortable"
)
religion$comfOfferPrayer <- factor(religion$comfOfferPrayer, levels = comfortOrder)

After reordering the levels, let’s look at the data again to confirm that we only changed the order of the labels, without changing the underlying data.

# Counts per level after re-levelling; only the order should differ.
table(religion[["comfOfferPrayer"]])
## 
##  Extremely comfortable       Very comfortable   Somewhat comfortably 
##                     83                     84                     65 
##     Not so comfortable Not at all comfortable 
##                     43                     16

Looks good. Let’s graph.

# Share of each comfort level within each religion, using the new level order.
ggplot(religion, aes(x = relig, fill = comfOfferPrayer)) +
  geom_bar(position = "fill") +
  coord_flip()

With the reordered levels, it is much easier to see what proportion of each religious group is at least somewhat comfortable with offering a prayer.

# Comfort level on the axis, religion as fill, under four bar positions.

# Default position: stacked counts.
ggplot(religion, aes(x = comfOfferPrayer, fill = relig)) +
  geom_bar() +
  coord_flip()

# "identity": bars drawn on top of each other, made semi-transparent.
ggplot(religion, aes(x = comfOfferPrayer, fill = relig)) +
  geom_bar(position = "identity", alpha = 0.5) +
  coord_flip()

# "dodge": bars side by side.
ggplot(religion, aes(x = comfOfferPrayer, fill = relig)) +
  geom_bar(position = "dodge") +
  coord_flip()

# "fill": stacked and scaled to proportions.
ggplot(religion, aes(x = comfOfferPrayer, fill = relig)) +
  geom_bar(position = "fill") +
  coord_flip()

Capital Bikeshare data is taken from Capital Bikeshare’s system data portal.

# Read the station counts. stringsAsFactors = TRUE keeps Month, Station.Name
# and Municipality as factors; since R 4.0 read.csv() returns character
# columns by default, which would not produce the per-level counts shown in
# the summary below.
bikes <- read.csv("bikes.csv", stringsAsFactors = TRUE)
summary(bikes)
##       Year          Month                             Station.Name 
##  Min.   :2014   Apr    : 317   10th & E St NW               :  12  
##  1st Qu.:2015   Aug    : 317   10th & Florida Ave NW        :  12  
##  Median :2015   Jul    : 317   10th & Monroe St NE          :  12  
##  Mean   :2015   Jun    : 317   10th & U St NW               :  12  
##  3rd Qu.:2015   Mar    : 317   10th St & Constitution Ave NW:  12  
##  Max.   :2015   May    : 317   11th & F St NW               :  12  
##                 (Other):1894   (Other)                      :3724  
##                     Municipality    Departures        Arrivals     
##  Alexandria, VA           :  96   Min.   :   0.0   Min.   :   0.0  
##  Arlington, VA            : 867   1st Qu.: 100.0   1st Qu.:  92.0  
##  Rest of Montgomery County: 336   Median : 397.0   Median : 338.0  
##  Rockville/Shady Grove    : 240   Mean   : 765.7   Mean   : 762.9  
##  Washington, DC           :2257   3rd Qu.:1086.0   3rd Qu.:1079.2  
##                                   Max.   :9994.0   Max.   :9990.0  
## 
names(bikes)
## [1] "Year"         "Month"        "Station.Name" "Municipality"
## [5] "Departures"   "Arrivals"
# Histogram of Departures, with bars filled by Month.
ggplot(bikes, aes(x = Departures, fill = Month)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Departures (green) and Arrivals (blue) overlaid as semi-transparent histograms.
ggplot(bikes) +
  geom_histogram(aes(x = Departures), fill = "green", alpha = 0.5) +
  geom_histogram(aes(x = Arrivals), fill = "blue", alpha = 0.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# State outlines as a data frame. map_data() needs the maps package to be
# installed, and coord_map() below needs mapproj.
all_states <- map_data("state")

# Outlines for the three regions around the bikeshare system.
dcRegions <- c("maryland", "district of columbia", "virginia")
dc <- all_states[all_states$region %in% dcRegions, , drop = FALSE]

# All states, white borders on black fill, drawn with a map projection.
ggplot() +
  geom_polygon(
    data = all_states,
    aes(x = long, y = lat, group = group),
    color = "white",
    fill = "black"
  ) +
  coord_map()

# The three-region subset, zoomed to a fixed longitude/latitude window.
ggplot() +
  geom_polygon(
    data = dc,
    aes(x = long, y = lat, group = group),
    color = "white",
    fill = "black"
  ) +
  coord_map(xlim = c(-77.2, -76.9), ylim = c(38.8, 39.05))

# Convert Station.Name to a factor *before* asking for its levels. If the
# column is character (the read.csv() default since R 4.0), levels() returns
# NULL and no station names would be written out for geocoding.
bikes$Station.Name <- as.factor(bikes$Station.Name)

head(bikes$Station.Name)
## [1] Eads St & 15th St S                 18th & Eads St.                    
## [3] 20th & Crystal Dr                   15th & Crystal Dr                  
## [5] S Joyce & Army Navy Dr              Crystal City Metro / 18th & Bell St
## 319 Levels: 10th & E St NW 10th & Florida Ave NW ... Wisconsin Ave & O St NW
# One entry per distinct station, in factor-level order.
StationNames <- levels(bikes$Station.Name)

# Build one address string per station and save them for the batch geocoder.
# NOTE(review): every station gets the ", Washington, DC" suffix, although the
# Municipality column also lists Virginia and Maryland locations - confirm the
# geocoder still resolves those stations correctly.
StationAddresses <- paste0(StationNames, ", Washington, DC")
write.csv(StationAddresses, "stations.csv")

# Manual step: go to http://www.findlatitudeandlongitude.com/batch-geocode/
# to convert the addresses into coordinates, and save the results as
# stationLatLong.csv.
geocodedFile <- "stationLatLong.csv"
coordData <- read.csv(geocodedFile)

# Replace the geocoder's address column with the plain station names, and give
# the first column the same name as the station column of bikes so the two
# tables share a merge key.
# NOTE(review): assumes the geocoded rows are in the same order as StationNames
# and that original.address is the first column of the file - confirm.
coordData$original.address <- StationNames
names(coordData)[1] <- names(bikes)[3]

# Join counts to coordinates, then keep rows that have a longitude and belong
# to December 2014.
mapdata <- merge(bikes, coordData, by = "Station.Name")
mapdata <- subset(mapdata, !is.na(longitude) & Year == 2014 & Month == "Dec")

# Plot window: the bounding box of the stations, padded by 0.01 on every side.
mapPad <- .01
longLim <- c(min(mapdata$longitude) - mapPad, max(mapdata$longitude) + mapPad)
latLim <- c(min(mapdata$latitude) - mapPad, max(mapdata$latitude) + mapPad)

# Dark region outlines with one point per row of mapdata, coloured by Arrivals
# on a light-blue-to-red gradient.
p <- ggplot() +
  geom_polygon(
    data = dc,
    aes(x = long, y = lat, group = group),
    colour = "white",
    fill = "grey10"
  ) +
  geom_point(
    data = mapdata,
    aes(x = longitude, y = latitude, color = Arrivals)
  ) +
  coord_map(xlim = longLim, ylim = latLim) +
  scale_color_gradient(low = "lightblue", high = "red")
p

We could restrict our attention to DC stations.

# Rebuild the merged table, this time also requiring the Washington, DC
# municipality in addition to a longitude and December 2014.
mapdata <- merge(bikes, coordData, by = "Station.Name")
mapdata <- subset(
  mapdata,
  !is.na(longitude) &
    Year == 2014 & Month == "Dec" & Municipality == "Washington, DC"
)

# Plot window: the exact bounding box of the remaining stations (no padding).
longLim <- c(min(mapdata$longitude), max(mapdata$longitude))
latLim <- c(min(mapdata$latitude), max(mapdata$latitude))

# Lighter grey outlines, with points coloured by Arrivals.
p <- ggplot() +
  geom_polygon(
    data = dc,
    aes(x = long, y = lat, group = group),
    colour = "white",
    fill = "grey65"
  ) +
  geom_point(
    data = mapdata,
    aes(x = longitude, y = latitude, color = Arrivals)
  ) +
  coord_map(xlim = longLim, ylim = latLim) +
  scale_color_gradient(low = "lightblue", high = "red")
p