Created on Aug 15 2013
Revised on Thu Aug 15 13:14:06 2013
original post is here and here
Example
# A negative `warn` value makes R ignore all warnings from this point on;
# nothing visible in this script restores the previous setting afterwards.
options(warn = -1)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.0.1
# First six rows of the built-in mtcars data.
head(mtcars, n = 6)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Factor view of carb: six levels (1, 2, 3, 4, 6, 8).
as.factor(mtcars[["carb"]])
## [1] 4 4 1 1 2 1 4 2 2 4 4 3 3 3 4 4 4 1 2 1 1 2 2 4 2 1 2 2 4 6 8 2
## Levels: 1 2 3 4 6 8
# carb mapped to y as-is ...
ggplot(mtcars, aes(x = mpg, y = carb, colour = hp)) +
  geom_point()
# ... and carb converted to a factor inside the aesthetic mapping.
ggplot(mtcars, aes(x = mpg, y = as.factor(carb), colour = hp)) +
  geom_point()
# Rearrange_Guy: But I want 2 to come first and 8 last.
# Helpful_Gal: OK, use rev with levels.
# Store carb as a new factor column whose levels run in reverse of the
# default order, then plot with that column on the y axis.
mtcars$carb2 <- with(mtcars, factor(carb, levels = rev(levels(factor(carb)))))
ggplot(mtcars, aes(x = mpg, y = carb2, colour = hp)) +
  geom_point()
# Rearrange_Guy: Well, I just want to specify the order.
# Helpful_Gal: OK, type it in by hand then.
# Overwrite carb2 with an explicitly spelled-out level order (4 goes last).
mtcars$carb2 <- with(
  mtcars,
  factor(carb, levels = c("1", "2", "3", "6", "8", "4"))
)
ggplot(mtcars, aes(x = mpg, y = carb2, colour = hp)) +
  geom_point()
# Rearrange_Guy: What about faceting? I bet it doesn't work for that.
# Helpful_Gal: Um yes it does.
# Same plot as before, split into one facet row per cyl value.
ggplot(mtcars, aes(x = mpg, y = carb2, colour = hp)) +
  geom_point() +
  facet_grid(cyl ~ .)
# Rearrange_Guy: OK Helpful_Gal I want it to go 6, 4, and then 8
# Helpful_Gal: OK
# cyl2 is cyl as a factor with levels in the requested order 6, 4, 8;
# faceting on it arranges the panels in that order.
mtcars$cyl2 <- with(mtcars, factor(cyl, levels = c("6", "4", "8")))
ggplot(mtcars, aes(x = mpg, y = carb2, colour = hp)) +
  geom_point() +
  facet_grid(cyl2 ~ .)
# Rearrange_Guy: Why do you keep making new variables?
# Helpful_Gal: It's probably not the best idea to overwrite variables just
# for the sake of plotting.
# Rearrange_Guy: Thank you for showing me the way of re-ordering and
# re-arranging.
# Helpful_Gal: You're welcome.
# SECTION 1: REORDERING BY BAR/POINT SIZE
# Create a data set we can alter
# mtcars as it stands at this point in the script.
head(mtcars, n = 6)
## mpg cyl disp hp drat wt qsec vs am gear carb carb2
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 1
## cyl2
## Mazda RX4 6
## Mazda RX4 Wag 6
## Datsun 710 4
## Hornet 4 Drive 6
## Hornet Sportabout 8
## Valiant 6
# Working copy of mtcars: the row names become an ordinary `car` column
# (row names themselves are reset) and cyl becomes a factor. mtcars3 starts
# out as an identical copy of mtcars2.
mtcars2 <- data.frame(car = rownames(mtcars), mtcars, row.names = NULL)
mtcars2$cyl <- as.factor(mtcars2$cyl)
mtcars3 <- mtcars2
head(mtcars2, n = 6)
## car mpg cyl disp hp drat wt qsec vs am gear carb
## 1 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
## carb2 cyl2
## 1 4 6
## 2 4 6
## 3 1 4
## 4 1 6
## 5 2 8
## 6 1 6
## An Example of Unordered Bars/Points
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.0.1
## Loading required package: grid
# Dot plot and bar chart of mpg per car, drawn side by side. Nothing has
# re-levelled `car` yet at this point.
x <- ggplot(mtcars2, aes(x = mpg, y = car)) +
  geom_point(stat = "identity")
y <- ggplot(mtcars2, aes(x = car, y = mpg)) +
  geom_bar(stat = "identity") +
  coord_flip()  # swap the axes so the bars run horizontally
grid.arrange(x, y, ncol = 2)
## An Example of Ordered Bars/Points
## Re-level the cars by mpg
# Re-level `car` so that its levels follow increasing mpg. The sort key is
# read from mtcars2 -- the same data frame the level labels are taken from --
# instead of the separate mtcars object, so labels and ordering cannot get
# out of step if the two data frames ever differ in row order.
mtcars3$car <- factor(mtcars2$car, levels = mtcars2[order(mtcars2$mpg), "car"])
# Same two plots as the unordered example, now built from the re-levelled
# mtcars3.
x <- ggplot(mtcars3, aes(x = mpg, y = car)) +
  geom_point(stat = "identity")
y <- ggplot(mtcars3, aes(x = car, y = mpg)) +
  geom_bar(stat = "identity") +
  coord_flip()
grid.arrange(x, y, ncol = 2)
## An Example of Ordered and Faceted Bars/Points
## Re-level the carb by average mpg
# Mean mpg for each carb value; carb is then stored as a factor whose levels
# are sorted by that mean, from lowest to highest.
ag_mtcars <- aggregate(mpg ~ carb, data = mtcars3, FUN = mean)
mtcars3$carb <- factor(
  mtcars2$carb,
  levels = ag_mtcars$carb[order(ag_mtcars$mpg)]
)
ggplot(mtcars3, aes(x = mpg, y = carb)) +
  geom_point(aes(color = carb), stat = "identity", size = 2)
## An Example of Ordered and Faceted Bars/Points
# mpg per car, one facet row per cyl level; free scales and free panel
# heights are requested for the facets.
ggplot(mtcars3, aes(x = mpg, y = car)) +
  geom_point(stat = "identity") +
  facet_grid(cyl ~ ., scales = "free", space = "free")
# SECTION 2: SPEEDING UP THE WORKFLOW WITH THE PLOTFLOW PACKAGE
# Getting the 'plotflow' package
# browseURL('https://github.com/trinker/plotflow')
# install.packages('devtools')
library(devtools)
## Warning: package 'devtools' was built under R version 3.0.1
# Install plotflow from GitHub. The repository is given in "user/repo" form:
# current devtools no longer takes the user name as a separate second
# argument (that position is now the git ref), so the two-argument call fails.
install_github("trinker/plotflow")
## Installing github repo(s) plotflow/master from trinker
## Downloading plotflow.zip from
## https://github.com/trinker/plotflow/archive/master.zip
## Installing package from
## C:\Users\alice\AppData\Local\Temp\RtmpaW8UiU/plotflow.zip
## Installing plotflow
## "C:/PROGRA~1/R/R-30~1.0/bin/x64/R" --vanilla CMD INSTALL \
## "C:\Users\alice\AppData\Local\Temp\RtmpaW8UiU\plotflow-master" \
## --library="C:/Users/alice/Documents/R/win-library/3.0" \
## --with-keep.source --install-tests
##
## What Does `order_by` do?
library(plotflow)
# Per-carb means of mpg, hp and disp, with carb stored as a factor.
dat <- aggregate(cbind(mpg, hp, disp) ~ carb, data = mtcars, FUN = mean)
dat[["carb"]] <- factor(dat[["carb"]])
## compare levels (data set looks the same though)
dat[["carb"]]
## [1] 1 2 3 4 6 8
## Levels: 1 2 3 4 6 8
# order_by() leaves the values in place and only changes the level order.
order_by(carb, ~ -hp + -mpg, data = dat)$carb
## [1] 1 2 3 4 6 8
## Levels: 8 4 3 6 2 1
## Return just the vector with new levels
order_by(carb, ~ -hp + -mpg, dat, df = FALSE)
## [1] 1 2 3 4 6 8
## Levels: 8 4 3 6 2 1
## Use `order_by` to Order Bars
library(ggplot2)
## Reset the data from Section 1
# Fresh copy of mtcars with the row names as a `car` column; the bars are
# ordered by passing the data through order_by() on the way into ggplot().
dat2 <- data.frame(car = rownames(mtcars), mtcars, row.names = NULL)
ggplot(order_by(car, ~mpg, dat2), aes(y = mpg, x = car)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Order Pretty Easy")
## Aggregated by Summary Stat
## Carb Ordered By Summary (Mean) of mpg
## Ordered points with the order_by function
# carb ordered by the mean of mpg (mean is passed to order_by as the
# summary function).
a <- ggplot(order_by(carb, ~mpg, dat2, mean), aes(x = carb, y = mpg)) +
  geom_point(aes(colour = carb), stat = "identity") +
  coord_flip() +
  ggtitle("Ordered Dot Plots Made Easy")
## Reverse the ordered points
# Negating mpg in the formula flips the level order.
b <- ggplot(order_by(carb, ~-mpg, dat2, mean), aes(x = carb, y = mpg)) +
  geom_point(aes(colour = carb), stat = "identity") +
  coord_flip() +
  ggtitle("Reverse Order Too!")
# Stack the two plots in a single column.
grid.arrange(a, b, ncol = 1)
## Nested Usage (order_by on an order by dataframe)
# carb against mpg, coloured by cyl, one facet row per gear with gear
# ordered by mean mpg.
# NOTE(review): only a single order_by() call is applied here (on gear);
# confirm whether a second, nested call on carb was intended.
ggplot(order_by(gear, ~mpg, dat2, mean), aes(mpg, carb)) +
  geom_point(aes(color = factor(cyl))) +
  facet_grid(gear ~ ., scales = "free") +
  ggtitle("I'm Nested (Yay for me!)")
# SECTION 3: USING ORDER_BY ON REAL DATA #
library(RCurl)
## Loading required package: bitops
library(XML)
## Warning: package 'XML' was built under R version 3.0.1
library(rjson)
library(ggplot2)
library(qdap)
## Warning: package 'qdap' was built under R version 3.0.1
## Loading required package: gdata
## Warning: package 'gdata' was built under R version 3.0.1
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.0.1
library(gridExtra)
## Grab the data
# Download the survey page and pull the data out of one of its inline
# JavaScript blocks.
# NOTE(review): the script index [[19]] and the captured-output lines 5:27
# are positional and tied to the page layout when this was written --
# confirm they still select the data before relying on the result.
URL <- "http://www.payscale.com/top-tech-employers-compared-2012/job-satisfaction-survey-data"
doc <- htmlTreeParse(URL, useInternalNodes = TRUE)
nodes <- getNodeSet(doc, "//script[@type='text/javascript']")[[19]][[1]]
# Turn each "];" into "]" before joining the lines and parsing them as JSON.
dat <- gsub("];", "]", capture.output(nodes)[5:27])
# Bind the parsed records row-wise into a data frame and drop the second
# column.
ndat <- data.frame(do.call(rbind, fromJSON(paste(dat, collapse = ""))))[, -2]
# unlist() every column. `ndat[]` targets all columns whatever their number,
# rather than assuming there are exactly five.
ndat[] <- lapply(ndat, unlist)
# For the "International Business Machines" rows, replace the name with what
# bracketXtract() returns for it.
IBM <- grepl("International Business Machines", ndat[, 1])
ndat[IBM, 1] <- bracketXtract(ndat[IBM, 1])
# Keep only the first token of each name (split on whitespace or comma).
# vapply() guarantees a character vector with one element per row.
ndat[, 1] <- vapply(strsplit(ndat[, 1], "\\s|,"), "[", character(1), 1)
## Re-level with order_by
# Replace the employer column with the re-levelled vector that order_by()
# returns when df = FALSE (levels ordered by Job.Satisfaction), then give the
# first column the shorter name "Employer".
ndat$Employer.Name <- order_by(
  Employer.Name, ~Job.Satisfaction, ndat,
  df = FALSE
)
names(ndat)[1] <- "Employer"
## Melt the data to long format
# Long format: one row per employer / measure pair.
mdat <- melt(ndat)
## Using Employer as id variables
# Swap the dots in the measure names for spaces; the factor levels follow
# the column order of ndat (minus the first column).
mdat[[2]] <- factor(
  gsub("\\.", " ", mdat[[2]]),
  levels = gsub("\\.", " ", names(ndat)[-1])
)
head(mdat, n = 6)
## Employer variable value
## 1 Adobe Job Satisfaction 0.6875
## 2 Amazon.com Job Satisfaction 0.7723
## 3 AOL Job Satisfaction 0.7714
## 4 Apple Job Satisfaction 0.7800
## 5 Dell Job Satisfaction 0.6890
## 6 eBay Job Satisfaction 0.7097
# Horizontal bars per employer, one facet per measure (two facet columns),
# value axis limited to 0-1 and the legend hidden.
ggplot(mdat, aes(x = Employer, y = value, fill = factor(Employer))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ylim(c(0, 1)) +
  facet_wrap(~variable, ncol = 2) +
  theme(legend.position = "none") +
  ggtitle("Plot 3: Employee Job Satisfaction at Top Tech Companies") +
  ylab("Job Satisfaction")
## A regression model
# Linear model of Job.Satisfaction on the other three measures.
mod <- lm(
  Job.Satisfaction ~ Work.Stress + Job.Meaning + Job.Flexibility,
  data = ndat
)
# Fitted coefficients.
mod
##
## Call:
## lm(formula = Job.Satisfaction ~ Work.Stress + Job.Meaning + Job.Flexibility,
## data = ndat)
##
## Coefficients:
## (Intercept) Work.Stress Job.Meaning Job.Flexibility
## 0.3101 0.1062 0.5241 0.0733
# Analysis-of-variance table for the model terms.
anova(mod)
## Analysis of Variance Table
##
## Response: Job.Satisfaction
## Df Sum Sq Mean Sq F value Pr(>F)
## Work.Stress 1 0.0069 0.0069 1.45 0.2452
## Job.Meaning 1 0.0816 0.0816 17.04 0.0007 ***
## Job.Flexibility 1 0.0006 0.0006 0.13 0.7260
## Residuals 17 0.0814 0.0048
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Full summary: residuals, coefficient tests and overall fit.
summary(mod)
##
## Call:
## lm(formula = Job.Satisfaction ~ Work.Stress + Job.Meaning + Job.Flexibility,
## data = ndat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12043 -0.03002 -0.00263 0.03268 0.11915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3101 0.2413 1.29 0.2160
## Work.Stress 0.1062 0.2147 0.49 0.6273
## Job.Meaning 0.5241 0.1288 4.07 0.0008 ***
## Job.Flexibility 0.0733 0.2058 0.36 0.7260
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0692 on 17 degrees of freedom
## Multiple R-squared: 0.523, Adjusted R-squared: 0.438
## F-statistic: 6.21 on 3 and 17 DF, p-value: 0.00483
# Job.Satisfaction against Job.Meaning. Layers, in drawing order: a linear
# fit (blue band), the default smoother (red line, pink band), points sized
# by Job.Flexibility and coloured by Work.Stress, and employer name labels.
theplot <- ggplot(data = ndat, aes(x = Job.Meaning, y = Job.Satisfaction)) +
  geom_smooth(method = "lm", fill = "blue", alpha = 0.1, size = 1) +
  geom_smooth(color = "red", fill = "pink", alpha = 0.3, size = 1) +
  xlim(c(0.4, 0.9)) +
  geom_point(aes(size = Job.Flexibility, colour = Work.Stress)) +
  geom_text(aes(label = Employer), size = 3, hjust = -0.1, vjust = -0.1) +
  scale_colour_gradient(low = "gold", high = "red")
theplot
## geom_smooth: method="auto" and size of largest group is <1000, so using
## loess. Use 'method = x' to change the smoothing method.
# Draw a circle over the given x/y window of the plot. circleGrob() and
# unit() belong to the grid package; they are namespace-qualified so the call
# works whether or not grid happens to be attached by another package.
theplot + annotation_custom(
  grob = grid::circleGrob(r = grid::unit(0.4, "npc")),
  xmin = 0.47, xmax = 0.57, ymin = 0.72, ymax = 0.82
)
## geom_smooth: method="auto" and size of largest group is <1000, so using
## loess. Use 'method = x' to change the smoothing method.
# Flag column: 0 for the four listed employers, 1 for everyone else.
ndat$outs <- ifelse(
  ndat$Employer %in% qcv(AOL, Amazon.com, Nvidia, Sony),
  0,
  1
)
# Same plot as before, except that the linear fit is computed separately for
# each value of `outs`.
ggplot(ndat, aes(x = Job.Meaning, y = Job.Satisfaction)) +
  geom_smooth(
    aes(group = outs),
    method = "lm", fill = "blue", alpha = 0.1, size = 1
  ) +
  geom_smooth(color = "red", fill = "pink", alpha = 0.3, size = 1) +
  xlim(c(0.4, 0.9)) +
  geom_point(aes(size = Job.Flexibility, colour = Work.Stress)) +
  geom_text(aes(label = Employer), size = 3, hjust = -0.1, vjust = -0.1) +
  scale_colour_gradient(low = "gold", high = "red")
## geom_smooth: method="auto" and size of largest group is <1000, so using
## loess. Use 'method = x' to change the smoothing method.