Lecture: Visualizing Data in R

Key Concepts covered in the lecture include (Same as before):
1. Learn concepts of visualizing statistics


The in-class exercise focuses on ggplot2. "GG" stands for Grammar of Graphics (by Leland Wilkinson).

Part 1: Creating scatterplots in GGPlot

library(ggplot2)
library(maptools)
## Loading required package: foreign
## Loading required package: sp
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: TRUE

# Read the 2004 county-level election shapefile and list its attribute columns.
# NOTE(review): maptools has since been retired from CRAN; porting this call to
# another reader would need a new dependency, so it is left unchanged here.
USA <- readShapePoly(
  fn = "H:/Quant/inClassExercises/InClassExerciseData/2004_Election_Counties.shp"
)
names(USA@data)
##  [1] "NAME"       "STATE_NAME" "STATE_FIPS" "CNTY_FIPS"  "FIPS"      
##  [6] "AREA"       "FIPS_num"   "Bush"       "Kerry"      "County_F"  
## [11] "Nader"      "Total"      "Bush_pct"   "Kerry_pct"  "Nader_pct" 
## [16] "MDratio"    "hosp"       "pcthisp"    "pcturban"   "urbrural"  
## [21] "pctfemhh"   "pcincome"   "pctpoor"    "pctlt9ed"   "pcthsed"   
## [26] "pctcoled"   "unemploy"   "pctwhtcl"   "homevalu"   "rent"      
## [31] "popdens"    "crowded"    "ginirev"    "SmokecurM"  "SmokevrM"  
## [36] "SmokecurF"  "SmokevrF"   "Obese"      "Noins"      "XYLENES__M"
## [41] "TOLUENE"    "TETRACHLOR" "STYRENE"    "NICKEL_COM" "METHYLENE_"
## [46] "MERCURY_CO" "LEAD_COMPO" "BENZENE__I" "ARSENIC_CO" "POP2000"   
## [51] "POP00SQMIL" "MALE2000"   "FEMALE2000" "MAL2FEM"    "UNDER18"   
## [56] "AIAN"       "ASIA"       "BLACK"      "NHPI"       "WHITE"     
## [61] "AIAN_MORE"  "ASIA_MORE"  "BLK_MORE"   "NHPI_MORE"  "WHT_MORE"  
## [66] "HISP_LAT"   "CH19902000" "MEDAGE2000" "PEROVER65"

# Build the base plot object with Obese on x and homevalu on y. Assigning it
# draws nothing yet because no geometry layer has been added.
plot1 <- ggplot(USA@data, mapping = aes(x = Obese, y = homevalu))

# Adding a point geometry and printing the result renders the scatterplot.
plot1 +
  geom_point()

plot of chunk unnamed-chunk-1


# Same scatterplot with both axes switched to a log10 scale.
plot1 +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()

plot of chunk unnamed-chunk-1


# Draw every point at 10% opacity. The alpha here is a constant (it is not
# mapped to any attribute), so darker areas simply mean more overlapping points.
plot1 +
  geom_point(alpha = 0.1) +
  scale_x_log10() +
  scale_y_log10()

plot of chunk unnamed-chunk-1


# Overlay a linear-model line, with its standard-error envelope, on the faded
# points. To inspect the fitted line itself, estimate the same model directly,
# e.g. summary(lm(homevalu ~ Obese, data = USA@data)).
plot1 +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm")

plot of chunk unnamed-chunk-1



# Additional overlays. hexbin is loaded for the hexagonal binning used below.
library(hexbin)

# Similar to a density plot: the scatterplot space is tiled with hexagons and
# each hexagon is shaded by the number of points inside it, which highlights
# where the data are concentrated (i.e. where points overplot).
plot1 +
  stat_binhex()

plot of chunk unnamed-chunk-1


# The same kind of binned plot, but with rectangular bins instead of hexagons.
plot1 +
  geom_bin2d()

plot of chunk unnamed-chunk-1


# Contour lines showing where the data are concentrated. The author notes
# that they do not see much use in this view.
plot1 +
  geom_density2d()

plot of chunk unnamed-chunk-1


# Create an arbitrary two-level factor that flags counties in four hand-picked
# states, so that data can be plotted in terms of its factor levels.
USA$good_states <- as.factor(
  ifelse(
    USA$STATE_NAME %in% c("New York", "Massachusetts", "Rhode Island", "Wyoming"),
    "its good",
    "its ok"
  )
)

# Colour each point by the factor level of its county.
plot2 <- ggplot(
  USA@data,
  aes(x = Obese, y = homevalu, colour = good_states)
)
plot2 +
  geom_point()

plot of chunk unnamed-chunk-1


# Rebuild plot2 so the factor drives both colour and shape, then add a
# smoother. No method is specified, so ggplot2 chooses one itself (see the
# message printed below). Note: this takes a few moments to draw.
plot2 <- ggplot(
  USA@data,
  aes(x = Obese, y = homevalu, colour = good_states, shape = good_states)
)
plot2 +
  stat_smooth()
## geom_smooth: method="auto" and size of largest group is >=1000, so using
## gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the
## smoothing method.

plot of chunk unnamed-chunk-1


# Overlay the points and linear trend lines. Because colour and shape are
# mapped to good_states, the data are grouped by that factor, so a separate
# line (with its standard-error band) is drawn for each factor level.
plot2 +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE, lty = 1, lwd = 0.5)

plot of chunk unnamed-chunk-1

Part 2: Modify ggplot appearance

# Scatterplot of income against college education with custom axis labels and
# a two-line title.
plot3 <- ggplot(USA@data, mapping = aes(x = pctcoled, y = pcincome))
plot3 +
  geom_point() +
  xlab("Percent College Educated") +
  ylab("Per Capita Income") +
  ggtitle("US Counties (2000)\nPercent College Educated by Per Capita Income")

plot of chunk unnamed-chunk-2


# We can get very detailed in how we modify the data to be displayed.
# Summarise unemployment once (minimum, mean, maximum). na.rm = TRUE keeps a
# single missing value from turning every break and the midpoint into NA.
unemploy_stats <- c(
  min(USA$unemploy, na.rm = TRUE),
  mean(USA$unemploy, na.rm = TRUE),
  max(USA$unemploy, na.rm = TRUE)
)

# Points are coloured on a diverging green-yellow-red scale centred on the
# mean unemployment value; the legend is labelled at the three summary values.
plot4 <- ggplot(USA@data, aes(x = pctcoled, y = pcincome, color = unemploy)) +
  geom_point() +
  ylab("Per Capita Income") +
  xlab("Percent College Educated") +
  ggtitle("US Counties (2000)\nPercent College Educated by Per Capita Income") +
  scale_color_gradient2(
    "Unemployment",
    breaks = unemploy_stats,
    labels = c("Below Average", "Average", "Above Average"),
    low = "green", mid = "yellow", high = "red",
    midpoint = unemploy_stats[2]
  )
plot4

plot of chunk unnamed-chunk-2


# Split the plot into side-by-side panels, one per level of the factor.
plot4 +
  facet_grid(. ~ good_states)

plot of chunk unnamed-chunk-2


# Apply a theme to the plot. theme_classic() gives a regular scatterplot on a
# white background; themes are useful for standardizing plot appearance.
# More on themes in the next section.
plot4 +
  theme_classic()

plot of chunk unnamed-chunk-2


Part 3: Using and Creating Themes


# ggthemes is a library full of pre-created themes. Install it only when it is
# not already available: in the recorded session the package was present, yet
# the unconditional install.packages() call failed because the library
# directory was not writable ("Error: unable to install packages").
if (!nzchar(system.file(package = "ggthemes"))) {
  install.packages("ggthemes")
}
library(ggthemes)  # gives access to the pre-built themes
## Warning: package 'ggthemes' was built under R version 2.15.3

# An example of one of the ready-made themes.
plot4 + theme_economist()

plot of chunk unnamed-chunk-3

# Another ready-made theme from ggthemes.
plot4 +
  theme_solarized()

plot of chunk unnamed-chunk-3

# And one more ready-made theme.
plot4 +
  theme_tufte()

plot of chunk unnamed-chunk-3


# Some of the default themes aren't very good, but it is possible to create
# your own theme. Here is Seth's first attempt: black backgrounds, dotted white
# major grid lines, light-grey axis text and no axis ticks.
sethTheme <- local({
  black_box <- element_rect(fill = "black")
  grey_text <- element_text(colour = "grey80")
  theme(
    panel.background = black_box,
    plot.background = black_box,
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(linetype = 3, colour = "white"),
    axis.text.x = grey_text,
    axis.text.y = grey_text,
    axis.title.x = grey_text,
    axis.title.y = grey_text,
    legend.key = black_box,
    legend.text = element_text(colour = "white"),
    legend.title = element_text(colour = "black"),
    legend.background = black_box,
    axis.ticks = element_blank()
  )
})

# Uh-oh: no titles are visible. The legend title is black on a black
# background and no colour is set for the plot title, so the text colours in
# the theme must be fixed.
plot4 + sethTheme

plot of chunk unnamed-chunk-3



# Revised version of Seth's theme: identical to the first attempt except that
# all titles are now white (the `title` entry is new and `legend.title` was
# changed from black to white).
sethTheme <- local({
  black_box <- element_rect(fill = "black")
  grey_text <- element_text(colour = "grey80")
  white_text <- element_text(colour = "white")
  theme(
    panel.background = black_box,
    plot.background = black_box,
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(linetype = 3, colour = "white"),
    axis.text.x = grey_text,
    axis.text.y = grey_text,
    axis.title.x = grey_text,
    axis.title.y = grey_text,
    title = white_text,         # this was added
    legend.key = black_box,
    legend.text = white_text,
    legend.title = white_text,  # this was edited
    legend.background = black_box,
    axis.ticks = element_blank()
  )
})

# Ah, much better!
plot4 + sethTheme

plot of chunk unnamed-chunk-3


Part 4: Making Maps with GGPlot2

# fortify() extracts the coordinates of each polygon into a data frame, with
# polygons identified by their FIPS code. This call loads the rgeos package
# (see the messages printed below) and is very slow, so be patient.
usa_geom <- fortify(model = USA, region = "FIPS")
## Loading required package: rgeos
## rgeos version: 0.2-12, (SVN revision 372) GEOS runtime version:
## 3.3.6-CAPI-1.7.6 Polygon checking: TRUE

# Create the data frame of polygons: attach the county attributes to every
# polygon vertex by matching the fortify id to the FIPS column.
usa_map_df <- merge(usa_geom, USA@data, by.x = "id", by.y = "FIPS")
# merge() does not promise to keep rows in their incoming order, while
# geom_polygon() connects vertices in row order. Restore the drawing sequence
# recorded by fortify() so polygons cannot come out scrambled.
usa_map_df <- usa_map_df[order(usa_map_df$group, usa_map_df$order), ]

# Create a map: counties are filled by Bush vote share on a yellow-to-red
# gradient and outlined with thin dotted white lines taken from usa_geom.
map1 <- ggplot(usa_map_df, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Bush_pct)) +
  coord_equal() +
  scale_fill_gradient(low = "yellow", high = "red") +
  geom_path(
    data = usa_geom,
    mapping = aes(x = long, y = lat, group = group),
    colour = "white", lty = 3, lwd = 0.1
  )
map1

plot of chunk unnamed-chunk-4


# Themes can be applied to maps in the same way as to any other plot.
map1 +
  sethTheme

plot of chunk unnamed-chunk-4

Part 5: Creating proper thematic maps

library(classInt)
## Loading required package: class
## Loading required package: e1071
# Compute five quantile classes for the Bush vote share.
classIntervals(var = USA$Bush_pct, n = 5, style = "quantile")
## style: quantile
##     [0,50.52) [50.52,58.07) [58.07,64.37) [64.37,71.31) [71.31,92.83] 
##           622           622           622           622           623
# ggplot2 is not compatible with the classIntervals output, so the class
# breaks are defined manually (approximate versions of the quantiles above).
breaks <- c(0, 50, 58, 64, 71, 93)
labels <- c("[0 - 50%]", "[50% - 58%]", "[58% - 64%]", "[64% - 71%]", "[71% - 93%]")
# include.lowest = TRUE closes the first interval on the left. Without it a
# county whose Bush_pct is exactly 0 falls outside (0, 50] and is set to NA.
usa_map_df$bushBreaks <- cut(
  usa_map_df$Bush_pct,
  breaks = breaks,
  labels = labels,
  include.lowest = TRUE
)
map2 <- ggplot(usa_map_df, aes(long, lat, group = group)) +
  geom_polygon(aes(fill = bushBreaks)) +
  coord_equal()
map2  # How are the colors being applied?

plot of chunk unnamed-chunk-5


# ggplot2 treats the class breaks as factor levels, so a proper colour ramp
# has to be specified; a ColorBrewer palette is used for that here.
library(RColorBrewer)
# NOTE(review): hjust = 2 is outside the usual 0-1 range and presumably pushes
# the title well to the right -- confirm that this offset is intentional.
map2 +
  scale_fill_brewer("Votes for Bush in 2004 (%)", palette = "YlGnBu") +
  sethTheme +
  ggtitle("Votes for Bush in 2004 (%)") +
  theme(
    plot.title = element_text(size = 24, face = "bold", colour = "white", hjust = 2)
  )

plot of chunk unnamed-chunk-5