In this R Markdown document, we will create graphs to show (1) changes in financial indices over time and (2) a comparison of the ranges of prices of Microsoft, IBM, and Google.
‘Base’ R (which you downloaded) does not include functionality to perform the visualization. We need to download additional packages by (1) installing the packages (one time) and (2) loading the packages into memory (each time R is started). There are over 10,000 R packages available. Instructions have been provided that explain how to download a package. To load it into memory, use the library command as demonstrated below.
library(ggplot2) # most popular visualization packages
library(ggthemes) # formatiing options for ggplot2
library(scales) # formatting options for ggplot2
library(quantmod) # retrieve financial data (used for function via getSymbols) (masks as.Date from base R)
library(broom) # allows manipulation of data for prep for ggplot2 (used for function tidy)
library(tidyr) # allows manipulation of data for prep for ggplot2 (used for function spread)
library(dplyr) # allows manipulation of data for prep for ggplot2 (used for function filter)
library(blscrapeR) # retrieve Bureau of Labors stats (used for function bls_api)
To create a graph we need data. Let’s start with obtaining the data. We will use the function ‘getSymbols’ made available via the quantmod package for the Dow Jones Industrial Average and the S&P 500 for dates from Fall 1980 to Summer 2018 (as set by our parameters). We will use the same function to pull the last ten years of stock data for Microsoft, IBM, and Google. This data is not quite in the form we need it in for our visualization package, so we perform some manipulation on the data to prepare it for the ggplot functions.
# Download daily price history for two market indices from Yahoo Finance:
#   ^DJI  = Dow Jones Industrial Average
#   ^GSPC = Standard and Poor's 500 Index
# getSymbols() currently defaults to auto.assign = TRUE, so the data land in
# objects named DJI and GSPC; stock_data itself only receives the symbol names.
stock_data <- getSymbols(
  c("^DJI", "^GSPC"),
  src = "yahoo",
  from = "1980-09-01",
  to = "2018-08-15"
)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
##
## WARNING: There have been significant changes to Yahoo Finance data.
## Please see the Warning section of '?getSymbols.yahoo' for details.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.yahoo.warning"=FALSE).
# Inspect what getSymbols() returned into stock_data (the printed value follows)
head(stock_data)
## [1] "DJI" "GSPC"
# Preview the first rows of the DJI object that getSymbols() created
head(DJI)
## DJI.Open DJI.High DJI.Low DJI.Close DJI.Volume DJI.Adjusted
## 1985-01-29 1277.72 1295.49 1266.89 1292.62 13560000 1292.62
## 1985-01-30 1297.37 1305.10 1278.93 1287.88 16820000 1287.88
## 1985-01-31 1283.24 1293.40 1272.64 1286.77 14070000 1286.77
## 1985-02-01 1276.94 1286.11 1269.77 1277.72 10980000 1277.72
## 1985-02-04 1272.08 1294.94 1268.99 1290.08 11630000 1290.08
## 1985-02-05 1294.06 1301.13 1278.60 1285.23 13800000 1285.23
# tidy() (broom) reshapes the date-indexed DJI object into a "long" data frame
# with one row per date/series pair (columns: index, series, value)
dow_jones2 <- DJI %>%
  tidy()
head(dow_jones2)
## # A tibble: 6 x 3
## index series value
## <date> <chr> <dbl>
## 1 1985-01-29 DJI.Open 1278.
## 2 1985-01-30 DJI.Open 1297.
## 3 1985-01-31 DJI.Open 1283.
## 4 1985-02-01 DJI.Open 1277.
## 5 1985-02-04 DJI.Open 1272.
## 6 1985-02-05 DJI.Open 1294.
# spread() (tidyr) converts long to wide: every distinct value found in the
# `series` column becomes its own column, filled from `value`
dow_jones <- dow_jones2 %>%
  spread(key = series, value = value)
head(dow_jones)
## # A tibble: 6 x 7
## index DJI.Adjusted DJI.Close DJI.High DJI.Low DJI.Open DJI.Volume
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1985-01-29 1293. 1293. 1295. 1267. 1278. 13560000
## 2 1985-01-30 1288. 1288. 1305. 1279. 1297. 16820000
## 3 1985-01-31 1287. 1287. 1293. 1273. 1283. 14070000
## 4 1985-02-01 1278. 1278. 1286. 1270. 1277. 10980000
## 5 1985-02-04 1290. 1290. 1295. 1269. 1272. 11630000
## 6 1985-02-05 1285. 1285. 1301. 1279. 1294. 13800000
# Rows and columns of the wide table: spread() turned each distinct `series`
# value into its own column, so the result has fewer rows and more columns
# than the long dow_jones2 table
dim(dow_jones)
## [1] 8456 7
# Fetch ten years of daily prices for the three technology stocks; the data are
# used below through the MSFT, IBM and GOOG objects
ms_vs_ibm_vs_goog <- getSymbols(
  c("MSFT", "IBM", "GOOG"),
  src = "yahoo",
  from = "2008-08-15",
  to = "2018-08-15"
)

# Microsoft: long format via tidy() (broom), wide format via spread() (tidyr),
# then tag every row with the company name
msft2 <- tidy(MSFT)
msft3 <- spread(msft2, key = series, value = value)
msft3$name <- "Microsoft"

# Replace the ticker-prefixed column names with generic ones so the three
# company tables share the same columns (index and name keep their names)
msft4 <- msft3 %>%
  rename(
    adjusted = MSFT.Adjusted,
    close = MSFT.Close,
    high = MSFT.High,
    low = MSFT.Low,
    open = MSFT.Open,
    volume = MSFT.Volume
  )
head(msft4)
## # A tibble: 6 x 8
## index adjusted close high low open volume name
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2008-08-15 21.6 27.8 28.2 27.6 28.0 47267400 Microsoft
## 2 2008-08-18 21.5 27.7 28.0 27.5 27.8 38078200 Microsoft
## 3 2008-08-19 21.3 27.3 27.8 27.3 27.5 40332900 Microsoft
## 4 2008-08-20 21.2 27.3 27.6 27.2 27.5 41497200 Microsoft
## 5 2008-08-21 21.2 27.2 27.2 26.9 27.1 43614200 Microsoft
## 6 2008-08-22 21.7 27.8 27.9 27.2 27.2 47930400 Microsoft
# IBM: long format via tidy() (broom), wide format via spread() (tidyr),
# then tag every row with the company name
ibm2 <- tidy(IBM)
ibm3 <- spread(ibm2, key = series, value = value)
ibm3$name <- "IBM"

# Replace the ticker-prefixed column names with generic ones so the three
# company tables share the same columns (index and name keep their names)
ibm4 <- ibm3 %>%
  rename(
    adjusted = IBM.Adjusted,
    close = IBM.Close,
    high = IBM.High,
    low = IBM.Low,
    open = IBM.Open,
    volume = IBM.Volume
  )
head(ibm4)
## # A tibble: 6 x 8
## index adjusted close high low open volume name
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2008-08-15 95.8 126. 127. 125. 127. 5566200 IBM
## 2 2008-08-18 94.4 125. 127 124. 126. 5970000 IBM
## 3 2008-08-19 92.9 123. 125. 122. 124. 9311600 IBM
## 4 2008-08-20 92.8 123. 124. 122. 124. 6871600 IBM
## 5 2008-08-21 93.2 123. 123. 122. 122. 6315000 IBM
## 6 2008-08-22 94.7 125. 125. 123. 123. 5989200 IBM
# Google: long format via tidy() (broom), wide format via spread() (tidyr),
# then tag every row with the company name
google2 <- tidy(GOOG)
google3 <- spread(google2, key = series, value = value)
google3$name <- "Google"

# Replace the ticker-prefixed column names with generic ones so the three
# company tables share the same columns (index and name keep their names)
google4 <- google3 %>%
  rename(
    adjusted = GOOG.Adjusted,
    close = GOOG.Close,
    high = GOOG.High,
    low = GOOG.Low,
    open = GOOG.Open,
    volume = GOOG.Volume
  )
head(google4)
## # A tibble: 6 x 8
## index adjusted close high low open volume name
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2008-08-15 253. 253. 254. 251. 252. 7137300 Google
## 2 2008-08-18 248. 248. 253. 246. 253. 6711100 Google
## 3 2008-08-19 244. 244. 248. 242. 244. 6132500 Google
## 4 2008-08-20 241. 241. 247. 240. 246. 8015800 Google
## 5 2008-08-21 242. 242. 243. 238. 240. 7073900 Google
## 6 2008-08-22 244. 244. 246. 243. 244. 4624200 Google
# Stack the three per-company tables (identical columns) into a single data
# frame; the `name` column records which company each row belongs to
ms_vs_ibm_vs_goog_df <- rbind(
  msft4,
  ibm4,
  google4
)
head(ms_vs_ibm_vs_goog_df)
## # A tibble: 6 x 8
## index adjusted close high low open volume name
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2008-08-15 21.6 27.8 28.2 27.6 28.0 47267400 Microsoft
## 2 2008-08-18 21.5 27.7 28.0 27.5 27.8 38078200 Microsoft
## 3 2008-08-19 21.3 27.3 27.8 27.3 27.5 40332900 Microsoft
## 4 2008-08-20 21.2 27.3 27.6 27.2 27.5 41497200 Microsoft
## 5 2008-08-21 21.2 27.2 27.2 26.9 27.1 43614200 Microsoft
## 6 2008-08-22 21.7 27.8 27.9 27.2 27.2 47930400 Microsoft
Simple plot: Line graph of opening DJIA value by day across entire time period. The ggplot function (available as we loaded the ggplot2 package two code chunks above) builds graphs in layers which each can accept parameters defining (1) the data used and (2) the mapping of data variables to graph characteristics. The commands below produce three identical graphs using a different approach to setting the parameters and objects.
# Approach 1: name the data set and the aesthetic mapping inside ggplot().
# A line graph needs an x and a y value.
ggplot(dow_jones, aes(x = index, y = DJI.Open)) +
  geom_line() # the geom is the layer; here it tells ggplot2 to draw a line graph

# Approach 2: declare the data set and the mapping at the layer level instead
ggplot() +
  geom_line(aes(x = index, y = DJI.Open), data = dow_jones)

# Approach 3: almost everything you use in R is an object. Store the base plot
# in p, then add a layer when printing; p itself is not changed by this.
p <- ggplot(dow_jones, aes(x = index, y = DJI.Open))
p + geom_line()
Graph characteristics can also be set via other formatting options that are independent of the data values.
# Visual properties that do not depend on the data (here the line colour) are
# passed to the geom directly, outside of aes()
ggplot(dow_jones, aes(x = index, y = DJI.Open)) +
  geom_line(colour = "firebrick")
When declaring the data to be used with the data parameter, a subset of the data can be declared for graphing
# Plot only a subset of the original data set: keep the rows dated after
# 2017-09-12 with filter() (dplyr) and hand the result to ggplot()
dow_jones %>%
  filter(index > as.Date("2017-09-12")) %>%
  ggplot(aes(x = index, y = DJI.Open)) +
  geom_line(color = "firebrick")
In simple terms, ggplot builds graphs through geoms. Each geom results in different graph properties (and thus often graph types). Let’s change from geom_line(), used in the code chunks above, to geom_point(). Both geoms are appropriate choices for data with dates and continuous values.
# Same subset as before, drawn with a different layer type: points
dow_jones %>%
  filter(index > as.Date("2017-09-12")) %>%
  ggplot(aes(x = index, y = DJI.Open)) +
  geom_point(color = "coral1")
A single graph can accept multiple geoms, helpful when displaying different attributes of the data or when working with multiple data sets.
# Layers stack: draw a line and the individual points on the same plot
dow_jones %>%
  filter(index > as.Date("2017-09-12")) %>%
  ggplot(aes(x = index, y = DJI.Open)) +
  geom_line(color = "cyan1") +
  geom_point(color = "coral1")

# Statistical geoms can be layered too: geom_smooth() with method = "lm" adds
# a linear model (i.e., OLS) fit
ggplot(dow_jones, aes(x = index, y = DJI.Open)) +
  geom_line() +
  geom_smooth(
    method = "lm",
    color = "cyan",
    lwd = .5,
    linetype = "dotdash"
  )

# Two statistical geoms: the lm fit plus a glm (generalized linear model) with
# a log link, since growth in stock prices is often treated as exponential
ggplot(dow_jones, aes(x = index, y = DJI.Open)) +
  geom_line() +
  geom_smooth(
    method = "lm",
    color = "cyan",
    lwd = .5,
    linetype = "dotdash"
  ) +
  geom_smooth(
    method = "glm",
    method.args = list(family = gaussian(link = "log")),
    color = "coral1"
  )
ggplot2 allows for the manipulation of almost every visible glyph and component.
# Add labels: axis titles come from the scale_*() layers, the plot title from
# ggtitle()
dow_jones %>%
  filter(index > as.Date("2017-09-12")) %>%
  ggplot(aes(x = index, y = DJI.Open)) +
  geom_line(color = "cyan1") +
  geom_point(color = "coral1") +
  scale_x_date(name = "Dates") +
  scale_y_continuous(name = "Dow Jones Open") +
  ggtitle("Dow Jones Opening Prices")

# The scale_x_*() and scale_y_*() layers accept further parameters such as
# breaks, axis limits and label formatting (e.g., numbers with commas)
ggplot(dow_jones, aes(x = index, y = DJI.Open)) +
  geom_line() +
  scale_x_date(
    name = "Dates",
    date_minor_breaks = "5 year",
    limits = as.Date(c("1985-01-01", "2020-01-01"))
  ) +
  scale_y_continuous(name = "Dow Jones Open", labels = comma) +
  ggtitle("Dow Jones Opening Prices")
Exponential growth is often represented using a log scale: the resulting graph can be interpreted as a percentage change, so a straight line roughly represents a consistent percentage change. Other reasons to use log scales include bringing outliers into the visible range of the other data points on the graph. Stock data plotted over time are often plotted on a log scale to present the relationship in a linear form. For example, if the exponent rose from 3.3 (about 2,000) to almost 4.4 (about 25,000) over one time period, we can anticipate (should the growth remain exponential) an exponent of about 5.5 over the next time period of the same length.
# Option 1: transform the data directly by wrapping the y variable in log10()
ggplot(dow_jones, aes(x = index, y = log10(DJI.Open))) +
  geom_line() +
  geom_smooth(method = "lm", color = "coral1") +
  scale_x_date(name = "Dates") +
  scale_y_continuous(
    name = "Dow Jones Open",
    limits = c(0, 5) # the y axis now shows exponent values
  )

# Same plot, with the y axis labels rewritten as 10^exponent via `labels`
ggplot(dow_jones, aes(x = index, y = log10(DJI.Open))) +
  geom_line() +
  geom_smooth(method = "lm", color = "coral1") +
  scale_x_date(name = "Dates") +
  scale_y_continuous(
    name = "Dow Jones Open",
    limits = c(0, 5),
    labels = math_format(10^.x)
  )
# Option 2: leave the data untransformed and let the y scale apply log10.
# NOTE: values outside `limits` are treated as missing by the scale, so
# openings above the 25,000 upper limit are dropped; that presumably accounts
# for the "Removed ... rows" warnings this chunk prints.
ggplot(dow_jones, aes(x = index, y = DJI.Open)) +
  geom_line() +
  geom_smooth(method = "lm", color = "coral1") +
  scale_x_date(name = "Dates") +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    limits = c(1, 25000)
  )
## Warning: Removed 63 rows containing non-finite values (stat_smooth).
## Warning: Removed 22 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_smooth).
# Relabel the log-scaled y axis as powers of ten through the scale's `labels`
ggplot(dow_jones, aes(x = index, y = DJI.Open)) +
  geom_line() +
  geom_smooth(method = "lm", color = "coral1") +
  scale_x_date(name = "Dates") +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    limits = c(1, 25000), # the lower limit must not be 0 on a log scale
    labels = trans_format("log10", math_format(10^.x))
  )
## Warning: Removed 63 rows containing non-finite values (stat_smooth).
## Warning: Removed 22 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_smooth).
We can project prices into the future simply by setting the axis limits into the future and setting the statistical layer to use the full axis values.
# Project into the future: extend the x and y ranges past the data and let
# geom_smooth() draw its fit across the whole axis with fullrange = TRUE
ggplot(dow_jones, aes(x = index, y = DJI.Open)) +
  geom_line() +
  geom_smooth(method = "lm", color = "coral1", fullrange = TRUE) +
  scale_x_date(
    name = "Dates",
    limits = as.Date(c("1990-01-01", "2050-01-01"))
  ) +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    # labels = function(x) format(x, scientific = FALSE), # alternative labeller
    labels = comma, # without a labeller the axis defaults to scientific notation
    limits = c(1, 180000) # the lower limit must not be 0 on a log scale
  )
## Warning: Removed 1244 rows containing non-finite values (stat_smooth).
## Warning: Removed 1244 rows containing missing values (geom_path).
# Back to the untransformed y axis: extend both the linear (lm) and the
# log-link (glm) fits out to 2050
ggplot(dow_jones, aes(x = index, y = DJI.Open)) +
  geom_line() +
  geom_smooth(
    method = "lm",
    color = "cyan",
    lwd = .5,
    linetype = "dotdash",
    fullrange = TRUE
  ) +
  geom_smooth(
    method = "glm",
    method.args = list(family = gaussian(link = "log")),
    color = "coral1",
    fullrange = TRUE
  ) +
  scale_x_date(
    name = "Dates",
    limits = as.Date(c("1985-01-01", "2050-01-01"))
  ) +
  scale_y_continuous(
    name = "Dow Jones Open",
    labels = comma,
    limits = c(0, 180000)
  )
## Warning: Removed 1 rows containing missing values (geom_smooth).
Let’s compare stock prices and unemployment rates using a dual axis graph.
We retrieve unemployment data from the government’s Bureau of Labor Statistics using the following series codes. LNS14000000: U3 unemployment = those who actively looked for work within the past four weeks. LNS13327708: U5 unemployment = U3 plus those who want jobs but are discouraged and not looking. LNS13327709: U6 unemployment = U5 plus part-time workers. (Search for additional codes at https://beta.bls.gov/dataQuery/search)
First, we obtain the economic data.
# Request the three unemployment series (U3, U5 and U6) from the BLS API by
# their series IDs; bls_api() comes from the blscrapeR package
tbl <- bls_api(c("LNS14000000", "LNS13327708", "LNS13327709"))
## REQUEST_SUCCEEDED
# Preview the first rows of the downloaded table
head(tbl)
## # A tibble: 6 x 7
## year period periodName latest value footnotes seriesID
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 2018 M08 August true 3.9 "" LNS14000000
## 2 2018 M07 July <NA> 3.9 "" LNS14000000
## 3 2018 M06 June <NA> 4 "" LNS14000000
## 4 2018 M05 May <NA> 3.8 "" LNS14000000
## 5 2018 M04 April <NA> 3.9 "" LNS14000000
## 6 2018 M03 March <NA> 4.1 "" LNS14000000
# Long to wide with spread() (tidyr): each seriesID becomes its own column
# holding that series' values (see the tidyr help on gather vs. spread)
tbl2 <- tbl %>%
  spread(key = seriesID, value = value)
head(tbl2)
## # A tibble: 6 x 8
## year period periodName latest footnotes LNS13327708 LNS13327709
## <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 2016 M01 January <NA> "" 6.2 9.9
## 2 2016 M02 February <NA> "" 6 9.7
## 3 2016 M03 March <NA> "" 6.1 9.8
## 4 2016 M04 April <NA> "" 6 9.8
## 5 2016 M05 May <NA> "" 5.7 9.8
## 6 2016 M06 June <NA> "" 6 9.5
## # ... with 1 more variable: LNS14000000 <dbl>
# dateCast() (blscrapeR) adds a `date` column; it needs the year and period
# columns to be present
tbl3 <- tbl2 %>%
  dateCast()
head(tbl3)
## # A tibble: 6 x 9
## year period periodName latest footnotes LNS13327708 LNS13327709
## <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 2016 M01 January <NA> "" 6.2 9.9
## 2 2016 M02 February <NA> "" 6 9.7
## 3 2016 M03 March <NA> "" 6.1 9.8
## 4 2016 M04 April <NA> "" 6 9.8
## 5 2016 M05 May <NA> "" 5.7 9.8
## 6 2016 M06 June <NA> "" 6 9.5
## # ... with 2 more variables: LNS14000000 <dbl>, date <date>
# Give the series columns readable names in place of the BLS series IDs
tbl3 <- tbl3 %>%
  rename(
    u3_unemployment = LNS14000000,
    u5_unemployment = LNS13327708,
    u6_unemployment = LNS13327709
  )
head(tbl3)
## # A tibble: 6 x 9
## year period periodName latest footnotes u5_unemployment u6_unemployment
## <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 2016 M01 January <NA> "" 6.2 9.9
## 2 2016 M02 February <NA> "" 6 9.7
## 3 2016 M03 March <NA> "" 6.1 9.8
## 4 2016 M04 April <NA> "" 6 9.8
## 5 2016 M05 May <NA> "" 5.7 9.8
## 6 2016 M06 June <NA> "" 6 9.5
## # ... with 2 more variables: u3_unemployment <dbl>, date <date>
# The unemployment values are already expressed in percent (e.g., 3.9), while
# the `percent` labeller expects decimal form (e.g., .039). So instead of
# labels = percent we keep the raw value and append a "%" sign ourselves.
ggplot(tbl3, aes(x = date)) +
  geom_line(aes(y = u3_unemployment), color = "coral1") +
  geom_line(aes(y = u5_unemployment), color = "goldenrod") +
  geom_line(aes(y = u6_unemployment), color = "black") +
  scale_x_date(name = "Dates", labels = date_format("%B-%y")) +
  scale_y_continuous(
    name = "Rate",
    labels = function(x) {
      paste0(x, "%")
    }
  )
Graph stock prices (left y axis) and unemployment (right y axis); note the x axis values are the same for both sets of data.
# Dual y axis through the sec.axis argument of scale_y_continuous().
# Unemployment (in percent) is multiplied by 2000 so it can share the DJIA
# axis; the secondary axis divides the primary y values by 200000
# (= 2000 * 100) to get back to decimal form, which `percent` then formats.
# See ggplot2.tidyverse.org/reference/sec_axis.html
ggplot(tbl3, aes(x = date)) +
  geom_line(
    data = filter(dow_jones, index > as.Date("2015-12-31")),
    # a constant inside aes() forces ggplot2 to create a legend entry
    aes(x = index, y = DJI.Open, color = "DJIA")
  ) +
  geom_line(aes(y = u3_unemployment * 2000, color = "Unemployment")) +
  scale_y_continuous(
    name = "DJIA",
    labels = comma,
    sec.axis = sec_axis(
      trans = ~ . / 200000,
      name = "Unemployment",
      labels = percent
    )
  )
Rather than using data directly from the data set, we can calculate new values and plot the resulting values.
# Keep only Microsoft and IBM and pre-compute the price changes separately for
# each stock. Calling lag() inside aes() would run over the stacked rows of
# both companies, so the first IBM row would be compared with the last
# Microsoft row; grouping by name keeps every lag within one stock.
# dplyr::lag is spelled out because other attached packages also define lag().
msft_ibm_changes <- ms_vs_ibm_vs_goog_df %>%
  filter(name %in% c("Microsoft", "IBM")) %>% # either of the two names
  group_by(name) %>%
  mutate(
    open_change_1d = open - dplyr::lag(open),
    open_change_5d = open - dplyr::lag(open, n = 5),
    # percentage change is measured relative to the previous day's price
    open_pct_change_1d = (open - dplyr::lag(open)) / dplyr::lag(open)
  ) %>%
  ungroup()

# The following two commands create a ggplot object and render the plot
p <- ggplot(data = msft_ibm_changes, aes(x = index, color = name))
p + geom_point(aes(y = open))

# Calculated values
p + geom_point(aes(y = high - low)) # daily trading range, computed on the fly
p + geom_point(aes(y = open_change_1d))
# ggplot2 warns that 2 rows were removed: day 1 of each stock has no lag
p + geom_point(aes(y = open_change_5d))
# ggplot2 warns that 10 rows were removed: the first 5 days of each stock
p + geom_point(aes(y = open_pct_change_1d)) # changes in % rather than absolute
# ggplot2 warns that 2 rows were removed: day 1 of each stock has no lag
Layers can be used against different data sets. Say we think IBM changes predict Microsoft movement. We can map one layer for IBM and one layer for Microsoft, applying the lag function to the IBM layer so that each date shows the IBM price from five trading days earlier next to the Microsoft price for that date.
# One layer per data set: IBM opening prices shifted by five rows (trading
# days) with lag(), and Microsoft opening prices as they are
ggplot() +
  geom_point(
    aes(x = index, y = lag(open, n = 5), color = name),
    data = filter(
      ms_vs_ibm_vs_goog_df,
      name == "IBM",
      index > as.Date("2018-01-01")
    )
  ) +
  geom_point(
    aes(x = index, y = open, color = name),
    data = filter(
      ms_vs_ibm_vs_goog_df,
      name == "Microsoft",
      index > as.Date("2018-01-01")
    )
  )
## Warning: Removed 5 rows containing missing values (geom_point).
We further explore the options ggplot presents to allow custom formatting.
# We first leverage a built-in theme function: theme_dark()
ggplot(data = ms_vs_ibm_vs_goog_df, aes(x = index, y = open, color = name)) +
  geom_point() +
  theme_dark() # built-in theme

# Another theme available is theme_economist(): enter theme_economist() to see
# the actual parameter settings. You can create your own themes that serve as
# 'global' formatting variables.
# The 'last' theme's formatting takes precedence in case of conflict, as shown
# below. Alternatively, narrowly defined themes (e.g., one for axes, one for
# legends, etc.) can be used in combination.
# The stock is mapped to `color` (not `fill`): the default point shape has no
# fill, so a fill mapping would leave all three stocks looking the same.
ggplot(data = ms_vs_ibm_vs_goog_df, aes(x = index, y = open, color = name)) +
  geom_point() +
  theme_dark() +
  theme_bw() # added last, so theme_bw() is the one that shows
# Besides the axis name, the scale_x_*() and scale_y_*() layers accept several
# other parameters, such as breaks and limits
p <- ggplot(ms_vs_ibm_vs_goog_df, aes(x = index, color = name))

# Note: setting limits on the x scale drops the thousands of data points that
# fall outside them (see the warning printed for this plot)
p +
  geom_point(aes(y = open)) +
  ggtitle("Technology Stocks") +
  scale_x_date(
    name = "Date",
    limits = as.Date(c("2010-01-01", "2014-12-31")),
    breaks = as.Date(c("2010-01-01", "2012-01-01", "2014-01-01"))
  ) +
  scale_y_continuous(name = "Opening Value", labels = dollar)
## Warning: Removed 3777 rows containing missing values (geom_point).
# Rather than setting a range through the scale's limits, coord_cartesian()
# 'zooms' into the referenced part of the plot. The difference can matter:
# limits drop data points while zooming does not, which can change the result
# of stat calculations such as the smoother below.
p +
  geom_point(aes(y = open)) +
  geom_smooth(aes(y = open), method = lm) +
  ggtitle("Technology Stocks") +
  scale_x_date(name = "Date") +
  scale_y_continuous(name = "Open", labels = dollar) +
  coord_cartesian(
    xlim = as.Date(c("2010-01-01", "2014-12-31")) # zoom instead of truncating
  )
# We next rename the legend to "Stock" using a scale_color_*() layer (the
# legend describes the color aesthetic, so the color scale owns its title).
# The x axis holds dates and the y axis holds opening prices in dollars, so
# the axes are titled and formatted to match the other stock plots.
p + geom_point(aes(y = open)) + ggtitle("Technology Stocks") +
  scale_x_date(name = "Date") +
  scale_y_continuous(name = "Open", labels = dollar) +
  scale_color_discrete(name = "Stock") # rename legend label
# Had the legend been built from a size aesthetic instead of color, use
# scale_size("name") to rename it or scale_size(range = c(10, 100)) to adjust
# the glyph sizes.
# A manual color scale overrides the default colors (and the theme) for the
# color aesthetic and can name the legend at the same time
p +
  geom_point(aes(y = open)) +
  scale_y_continuous(name = "Open", labels = dollar) +
  scale_color_manual(
    name = "Stock",
    values = c("darkgreen", "goldenrod1", "firebrick")
  )
We can declare color palettes to be used repeatedly.
# Boxplot of opening prices per company, filled with the default colors
p <- ggplot(ms_vs_ibm_vs_goog_df, aes(x = name, y = open, fill = name)) +
  geom_boxplot()
p

# Alternative fill scales applied to the same base plot
p + scale_fill_hue(c = 45, l = 80)
p + scale_fill_brewer(palette = "Spectral")
p + scale_fill_manual(
  values = c("#CC6666", "#9999CC", "#66CC99", "#66CC99", "#66CC99")
)

# A palette stored in a variable can be reused across plots
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)
p + scale_fill_manual(values = cbPalette)
# For line and point colors, add scale_colour_manual(values = cbPalette) instead
ggplot allows intricate annotation options beyond the simple placement of a text field. In the example below, we highlight and label the ‘sweet spot’ of the plot, that is, the subjectively selected desirable x and y coordinates that we want to draw the viewer’s attention to.
# Base plot: opening price over time, one color per company
p <- ggplot(ms_vs_ibm_vs_goog_df, aes(x = index, y = open, color = name))

# annotate() adds elements positioned by explicit coordinates rather than by
# data: an italic text label and a semi-transparent highlighted rectangle
p +
  geom_point() +
  annotate(
    "text",
    label = "Big Announcement",
    x = as.Date("2014/02/01"),
    y = 750,
    size = 3,
    fontface = "italic"
  ) +
  annotate(
    "rect",
    xmin = as.Date("2013/12/01"),
    xmax = as.Date("2015/01/31"),
    ymin = 550,
    ymax = 650,
    alpha = .2,
    color = "red",
    fill = "purple"
  )