Bubble Charts

Flowing Data tutorial link: http://flowingdata.com/2010/11/23/how-to-make-bubble-charts/

This is not a very intuitive visualization of the data provided, however, it is an interesting chart type.

Hans Rosling TED Talk: https://www.ted.com/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen?language=en

getwd()
## [1] "/Users/hollyjones/Desktop/FlowingData"
setwd("/Users/hollyjones/Desktop/FlowingData/bubbles")

#data on crime rates by state in 2008
#one caveat - crime rates are for 2005 while population numbers are for 2008
crime = read.csv("crimeRatesByState2008.csv", header=TRUE, sep="\t")
head(crime)
##         state murder Forcible_rate Robbery aggravated_assult burglary
## 1    Alabama     8.2          34.3   141.4             247.8    953.8
## 2     Alaska     4.8          81.1    80.9             465.1    622.5
## 3    Arizona     7.5          33.8   144.4             327.4    948.4
## 4    Arkansas    6.7          42.9    91.1             386.8   1084.6
## 5 California     6.9          26.0   176.1             317.3    693.3
## 6   Colorado     3.7          43.4    84.6             264.7    744.8
##   larceny_theft motor_vehicle_theft population
## 1        2650.0               288.3    4627851
## 2        2599.1               391.0     686293
## 3        2965.2               924.4    6500180
## 4        2711.2               262.1    2855390
## 5        1916.5               712.8   36756666
## 6        2735.2               559.5    4861515
symbols(crime$murder, crime$burglary, circles = crime$population, main= "Default Chart")

#the default chart features bubbles that are disproportionately large
#this is because circles are sized by RADIUS instead of AREA
#TO SIZE THE RADII CORRECTLY: use the equation for the area of a circle = pie*r^2
#Thus, RADIUS = sqrt(area/pie)
radius = sqrt(crime$population/pi)


#GRAPHIC PARAMETERS for "symbols" - by deafult, "symbols" scales the largest bubble to 1 inch
#all remaining bubbles are scaled relative to the largest 1 inch bubble. To change this use the "inches" arg
symbols(crime$murder, crime$burglary, circles = radius, inches = 0.35, bg = "red", xlab = "Murder Rate", ylab = "Burglary Rate", main = "Burglary vs. Murder per 100,000 People")

#Annotate the chart
text(crime$murder, crime$burglary, crime$state, cex = 0.85)

Need to edit the above plot in an open-source version of Adobe Illustrator in order to clean up the text annotations and add additional details, such as a legend. (Export the image as a PDF and proceed to edit).

Heatmaps

Flowing Data tutorial link: http://flowingdata.com/2012/10/04/more-on-making-heat-maps-in-r/

getwd()
## [1] "/Users/hollyjones/Desktop/FlowingData"
setwd("/Users/hollyjones/Desktop/FlowingData/heatmaps")

dir()
## [1] "2008-09nba.trunc.csv" "heatmaps-custom.R"    "ray.png"
#NBA play-by-play data (DRB, 3pa, STL, AST, MIN, etc.) for leading scorers 
#with X and Y coordinates for the 2008-2009 season - very interesting!!!
nba = read.csv("2008-09nba.trunc.csv", header=TRUE, sep = ",")
head(nba)
##   team      etype        player points     type  x  y
## 1  OFF  jump ball                   NA          NA NA
## 2  BOS       foul   Rajon Rondo     NA shooting NA NA
## 3  CLE free throw  Delonte West     NA          NA NA
## 4  CLE free throw  Delonte West     NA          NA NA
## 5  BOS       shot Kevin Garnett     NA     jump  9  7
## 6  CLE    rebound  LeBron James     NA      def NA NA
tail(nba)
##        team   etype       player points type  x  y
## 510989  WAS    shot   Juan Dixon     NA  3pt 17 30
## 510990  BOS rebound   Glen Davis     NA  def NA NA
## 510991  BOS    shot   Glen Davis      3  3pt  2  6
## 510992  WAS    shot Caron Butler      3  3pt 41 26
## 510993  BOS    shot   Glen Davis     NA  3pt 25 77
## 510994  BOS rebound                  NA      NA NA
shots = nba[nba$etype == "shot", ]
head(shots)
##    team etype             player points          type  x  y
## 5   BOS  shot      Kevin Garnett     NA          jump  9  7
## 7   CLE  shot       LeBron James      2   pullup jump 39 20
## 8   BOS  shot        Paul Pierce      2     slam dunk 26  6
## 9   CLE  shot       LeBron James     NA           3pt 32 30
## 11  BOS  shot      Kevin Garnett      2     slam dunk 25  6
## 12  CLE  shot Zydrunas Ilgauskas      2 driving layup 25  8
#some shots are NA - get only known shots
shots = shots[!is.na(shots$x) & !is.na(shots$y), ]


#subset to just GSW
gsw = subset(shots, team == "GSW")

#PROBLEM = when there is more than one shot from a particular spot on the floor, dots overlap each other
#circles need to be sized by the number of shots as opposed to making them all the same size
symbols(gsw$x, gsw$y, circles = rep(1, length(gsw$x)), inches = FALSE, asp = 1, main = "Default Chart")

######################################################################################################
#                               CODE TO DRAW BASKETBALL COURT FROM SCRATCH                          #
######################################################################################################

library(plotrix)
#Adding an NBA court outline for kicks and giggles.
draw.arc(25, 5.25, 9/12, angle1=0, angle2=2*pi, col="orange", lwd=2) # Hoop
lines(c(22,28), c(4,4), col="orange", lwd=2)   # Backboard
lines(c(2.5, 2.5), c(0, 13.5), col="orange", lwd=2) # Side 3-pt
lines(c(47.5, 47.5), c(0, 13.5), col="orange", lwd=2)
lines(c(19,19), c(0,19), col="orange", lwd=2)  # Inside lane
lines(c(31,31), c(0,19), col="orange", lwd=2)
lines(c(19,31), c(19,19), col="orange", lwd=2) # Free throw
lines(c(0,50),c(0,0), col="orange", lwd=2) # Baseline
draw.arc(25, 5.25, 23.75, angle1=pi/9.8, angle2=pi/1.113, col="orange", lwd=2) # 3-pt arc

######################################################################################################
#                                           END CODE                                                 #
######################################################################################################

#subset to look at ray allen's scoring only
ray = subset(shots, player == "Ray Allen")
head(ray)
##     team etype    player points  type  x  y
## 19   BOS  shot Ray Allen     NA layup 25  7
## 28   BOS  shot Ray Allen     NA   3pt  2 13
## 31   BOS  shot Ray Allen     NA   3pt 42 24
## 115  BOS  shot Ray Allen     NA layup 25  6
## 154  BOS  shot Ray Allen     NA  jump 44 18
## 208  BOS  shot Ray Allen      2  jump 16 24
#The count() function from the plyr package makes it straightforward to aggregate by two columns.
library(plyr)

scoring.agg = count(ray, "type")
scoring.agg = scoring.agg[order(scoring.agg$freq, decreasing = FALSE), ]
head(scoring.agg)
##               type freq
## 4     driving jump    1
## 7             dunk    1
## 15   putback layup    1
## 20  step back jump    1
## 22 turnaround jump    1
## 2     driving dunk    2
par(mar=c(5, 9, 2, 2))
barplot(scoring.agg$freq, names.arg = scoring.agg$type, horiz = TRUE, las = 2, cex.names = 0.85, xlim = c(0, 500), col = "dark green", main = "Ray Allen Scoring by Shot Type")

##################################################################################################
par(mar = c(2, 2, 2, 2))

# Aggregate by shot frequency
library(plyr)
gsw.agg <- count(gsw, c("x","y"))

# Size symbols by number of shots.
symbols(gsw.agg$x, gsw.agg$y, circles=sqrt(gsw.agg$freq)/8, asp=1, inches=FALSE, ylim=c(0,40))

symbols(gsw.agg$x, gsw.agg$y, squares=sqrt(gsw.agg$freq)/8, asp=1, inches=FALSE, ylim=c(0,40))

# Hexbins
library(hexbin)

h <- hexbin(gsw$x, gsw$y, xbins=20, shape=80/50)
plot(h)

# Helper function to get color by shot frequency
getColor <- function(val) {
    
    minVal <- log(1)        # Using logarithmic scale
    maxVal <- log(1065) 
    
    numCols <- 20
    pal <- colorRampPalette(c("forestgreen", "red"))
    cols <- pal(numCols)
    
    # Get index to pick color.
    colIndex <- round(numCols * (log(val) - minVal) / (maxVal - minVal))
    colIndex <- max(1, colIndex)
    
    return(cols[colIndex])
}

# Get colors and draw heat map of shot frequency.
gridColors <- sapply(gsw.agg$freq, getColor)
symbols(gsw.agg$x, gsw.agg$y, squares=rep(1,length(gsw.agg$x)), asp=1, inches=FALSE, ylim=c(0,40), bg=gridColors, fg=NA)

# Find points per shot for each spot on court.
ptsper <- c()
for (i in 1:length(gsw.agg[,1])) {
    
    xcoord <- gsw.agg$x[i]
    ycoord <- gsw.agg$y[i]
    spot <- subset(gsw, x == xcoord & y == ycoord)
    totalShots <- length(spot[,1])
    totalPts <- sum(spot$points[!is.na(spot$points)])
    
    ptsper <- c(ptsper, totalPts/totalShots)
}



# Helper function to get color by points per shot.
getColorByPoints <- function(val) {
    
    minVal <- 0
    maxVal <- 3
    
    numCols <- 30
    
    pal <- colorRampPalette(c("forestgreen", "red"))
    cols <- pal(numCols)
    
    # Get index to pick color.
    colIndex <- round(numCols * (val - minVal) / (maxVal - minVal))
    colIndex <- max(1, colIndex)
    
    return(cols[colIndex])
}

# Get colors and draw heat map of points per shot.
gridColors <- sapply(ptsper, getColorByPoints)
plot(0, 0, xlim=c(0,50), ylim=c(0,40), type="n", xaxt="n", yaxt="n", xlab="", ylab="", bty="n", main = "Points Per Area on the Floor - GSW")
symbols(gsw.agg$x, gsw.agg$y, squares=sqrt(gsw.agg$freq)/3, asp=1, inches=FALSE, add=TRUE, bg=gridColors, fg=NA)



# Adding an NBA court outline for kicks and giggles.
draw.arc(25, 5.25, 9/12, angle1=0, angle2=2*pi, col="#cccccc", lwd=2) # Hoop
lines(c(22,28), c(4,4), col="#cccccc", lwd=2)   # Backboard
lines(c(2.5, 2.5), c(0, 13.5), col="#cccccc", lwd=2) # Side 3-pt
lines(c(47.5, 47.5), c(0, 13.5), col="#cccccc", lwd=2)
lines(c(19,19), c(0,19), col="#cccccc", lwd=2)  # Inside lane
lines(c(31,31), c(0,19), col="#cccccc", lwd=2)
lines(c(19,31), c(19,19), col="#cccccc", lwd=2) # Free throw
lines(c(0,50),c(0,0), col="#cccccc", lwd=2) # Baseline
draw.arc(25, 5.25, 23.75, angle1=pi/9.8, angle2=pi/1.113, col="#cccccc", lwd=2) # 3-pt arc