library (readr)
library(RCurl)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
## Reason: image not found
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
library(ggplot2)
In this section I bring in a dataset from FiveThirtyEight (538) that contains information on MLB foul balls. In this analysis I would like to look at the number of foul balls, and their average exit velocity, that reach the stands in each seating zone. I compute these summary statistics after removing records for balls that would not have reached the stands and, for the average, records that do not have an exit velocity.
# Download the foul-ball records and parse the CSV text into a data frame.
x <- getURL("https://raw.githubusercontent.com/agersowitz/Data-607-Datasets/master/foul-balls%5B1%5D.csv")
foul <- read.csv(text = x)

# Drop the two hit types excluded from the analysis in a single pass.
# Rows with a missing type_of_hit are still dropped, because subset()
# treats an NA condition as FALSE.
foul <- subset(
  foul,
  type_of_hit != "Ground" & type_of_hit != "Batter hits self"
)

# Keep only rows with a recorded exit velocity. is.na() is the correct test
# for missing values; comparing against the string "NA" only worked as a
# side effect of how subset() handles NA conditions.
foulev <- subset(foul, !is.na(exit_velocity))

# Per-zone summaries. ORDER BY makes the row order explicit, because SQL
# does not guarantee any particular order for GROUP BY results.
avg_exit <- sqldf(
  "select avg(exit_velocity) aev, used_zone
     from foulev
    group by used_zone
    order by used_zone"
)
# count(exit_velocity) ignores NULLs, so this counts only the balls in
# `foul` that have an exit velocity.
count_hit <- sqldf(
  "select count(exit_velocity) cev, used_zone
     from foul
    group by used_zone
    order by used_zone"
)
In this final section I use various ggplot functions to create a baseball diamond based on an x,y plot. I roughly based the sections off of the graphic found at https://fivethirtyeight.com/features/we-watched-906-foul-balls-to-find-out-where-the-most-dangerous-ones-land/
# Identifiers of the seven seating zones drawn on the diagram.
ids <- factor(c("1", "2", "3", "4", "5", "6", "7"))

# Look each zone's statistic up by its used_zone value rather than relying on
# the row order of the summary tables. A zone that is absent from a summary
# gets NA instead of silently shifting every later value onto the wrong zone.
zone_keys <- as.character(ids)

# Average exit velocity per zone.
ev <- data.frame(
  id = ids,
  value = avg_exit$aev[match(zone_keys, as.character(avg_exit$used_zone))]
)

# Number of foul balls (with a recorded exit velocity) per zone.
ch <- data.frame(
  id = ids,
  value = count_hit$cev[match(zone_keys, as.character(count_hit$used_zone))]
)
# Polygon outline of every seating zone: five (x, y) vertices per zone,
# listed in the same order as `ids`. Zones 2/3, 4/5 and 6/7 share their
# y coordinates and have mirrored x coordinates.
zone_outline <- list(
  list(x = c(-0.25, -0.5, 0.5, 0.25, 0),
       y = c(-0.75, -1, -1, -0.75, -1)),
  list(x = c(-0.25, -0.5, -0.75, -0.5, -0.5),
       y = c(-0.75, -0.5, -0.75, -1, -1)),
  list(x = c(0.25, 0.5, 0.75, 0.5, 0.5),
       y = c(-0.75, -0.5, -0.75, -1, -1)),
  list(x = c(-0.5, -0.75, -1, -0.75, -0.75),
       y = c(-0.5, -0.75, -0.25, -0.25, -0.25)),
  list(x = c(0.5, 0.75, 1, 0.75, 0.75),
       y = c(-0.5, -0.75, -0.25, -0.25, -0.25)),
  list(x = c(-0.75, -1, -1, -1, -1),
       y = c(-0.25, -0.25, 0, 0, 0)),
  list(x = c(0.75, 1, 1, 1, 1),
       y = c(-0.25, -0.25, 0, 0, 0))
)

# Flatten the per-zone outlines into one long table (one row per vertex).
positions <- data.frame(
  id = rep(ids, each = 5),
  x = unlist(lapply(zone_outline, `[[`, "x")),
  y = unlist(lapply(zone_outline, `[[`, "y"))
)
# Attach each zone's statistic to the vertices of its polygon.
datapoly <- merge(x = ev, y = positions, by = "id")
datapolych <- merge(x = ch, y = positions, by = "id")

# Base plots: one filled polygon per zone, coloured by the zone's value.
# p  -> average exit velocity, pc -> number of foul balls.
p <- ggplot(data = datapoly, mapping = aes(x, y)) +
  geom_polygon(mapping = aes(group = id, fill = value))
pc <- ggplot(data = datapolych, mapping = aes(x, y)) +
  geom_polygon(mapping = aes(group = id, fill = value))
# Four corner points of a diamond centred on the origin.
field <- data.frame(
  x = c(1, 0, -1, 0),
  y = c(0, 1, 0, -1)
)

# Three points forming a "V" from (-1, 0) through (0, -1) to (1, 0); drawn as
# a line on top of the zone polygons.
baseline <- data.frame(x = c(-1, 0, 1), y = c(0, -1, 0))
# Finish the exit-velocity plot: draw the baseline on top of the zones,
# then add the title and a pink-to-red fill gradient.
pev <- p +
  geom_line(data = baseline, colour = "grey30", size = 2) +
  ggtitle("Average Exit Velocity of Foul Balls by Seating Zone") +
  scale_fill_gradient(low = "pink", high = "red")

# Finish the count plot the same way, with a cyan-to-blue fill gradient.
pch <- pc +
  geom_line(data = baseline, colour = "grey30", size = 2) +
  ggtitle("Number of Foul Balls by Seating Zone") +
  scale_fill_gradient(low = "cyan", high = "blue")

# Render both figures.
pev
pch
By the end of this analysis you can visually see statistics on foul balls in MLB. This kind of plot can be very useful when trying to convey information that has a physical layout to people who aren’t data scientists. For example, it could show population data on a map broken down by county, or a blueprint of a building that shows where certain problems are occurring with certain tenants.
This extension covers using plotly with ggplot2 to create geom plots. The extend assignment shows how to use the two packages together to create differently styled geom plots. I’ll create two graphs: one drawing ellipses on a scatter plot, and one plotting a map.
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Simulate two noisy linear point clouds, "A" and "B", with n points each.
# The seed is fixed so the figure is reproducible; the random draws are made
# in the order x1, noise for y1, x2, noise for y2.
set.seed(20130226)
n <- 200

# Class A: x centred on 2, y rising with x.
x1 <- rnorm(n, mean = 2)
noise_a <- rnorm(n)
y1 <- 1.5 + 0.4 * x1 + noise_a

# Class B: x centred on -1, y falling with x.
x2 <- rnorm(n, mean = -1)
noise_b <- rnorm(n)
y2 <- 3.5 - 1.2 * x2 + noise_b

# Class label for each point, then everything stacked into one data frame.
class <- rep(c("A", "B"), each = n)
df <- data.frame(
  x = c(x1, x2),
  y = c(y1, y2),
  colour = class
)
# get code for "stat_ellipse"
library(devtools)
## Loading required package: usethis
library(ggplot2)
library(proto) #source_url("https://raw.github.com/JoFrhwld/FAAV/master/r/stat-ellipse.R")
# Scatter plot of the simulated points with a filled ellipse per class.
# ggplot() + geom_point() replaces the deprecated qplot() shortcut and builds
# the same plot. `class` is not a column of `df`, so ggplot2 resolves it from
# the calling environment.
p <- ggplot(df, aes(x = x, y = y, colour = class)) +
  geom_point() +
  stat_ellipse(geom = "polygon", alpha = 1/2, aes(fill = class))

# Convert the static ggplot into an interactive plotly figure and display it.
fig <- ggplotly(p)
fig
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(maps)
# Map outlines: state borders and county polygons.
state_df <- map_data("state")
county_df <- map_data("county")

# Strip spaces from the county names so they can be matched against the
# cleaned-up names in the election data.
county_df[["subregion"]] <- gsub(" ", "", county_df[["subregion"]])
# Election data: keep the two vote-share columns and the area name.
df <- read.csv("https://raw.githubusercontent.com/bcdunbar/datasets/master/votes.csv")
df <- subset(df, select = c(Obama, Romney, area_name))

# Normalise area names to the same form as the map's county names:
# lower case, no " county" suffix, no spaces, no periods.
df$area_name <- tolower(df$area_name)
df$area_name <- gsub(" county", "", df$area_name)
df$area_name <- gsub(" ", "", df$area_name)
df$area_name <- gsub("[.]", "", df$area_name)

# Rescale the vote shares by a factor of 100.
df$Obama <- df$Obama * 100
df$Romney <- df$Romney * 100

# Signed value used for the fill: Obama's figure where it is the larger one,
# otherwise Romney's figure negated. This is vectorised, so it also works on
# an empty table and yields NA (instead of an error) where a value is missing.
df$Percent <- ifelse(df$Obama > df$Romney, df$Obama, -df$Romney)

# Rename the join key by name rather than by position.
names(df)[names(df) == "area_name"] <- "subregion"
# Join the county polygons to the election figures on the cleaned county name.
# NOTE(review): the key is the county name only, so same-named counties appear
# to be matched to each other's rows; the de-duplication below then keeps the
# first match per polygon vertex. Confirm whether a state key should be added.
US <- inner_join(county_df, df, by = "subregion")

# Keep the first row for every polygon vertex (identified by `order`).
first_per_vertex <- !duplicated(US$order)
US <- US[first_per_vertex, ]
# Colour ramps: 200 shades each for the blue and red halves of the scale.
blue <- colorRampPalette(c("navy", "royalblue", "lightskyblue"))(200)
red <- colorRampPalette(c("mistyrose", "red2", "darkred"))(200)

# Full fill palette, running blue -> white -> red from the lowest to the
# highest Percent value.
# NOTE(review): the most negative values (Romney-leading rows) therefore get
# the blue end and the most positive (Obama-leading rows) the red end; confirm
# this direction is intended.
fill_palette <- c(blue, "white", red)

# Layers: county polygons filled by Percent with thin translucent white
# borders, then the state outlines drawn on top without fill.
county_layer <- geom_polygon(
  aes(fill = Percent),
  colour = alpha("white", 1/2),
  size = 0.05
)
state_layer <- geom_polygon(data = state_df, colour = "white", fill = NA)

# Assemble the plot.
p <- ggplot(US, aes(long, lat, group = group)) +
  county_layer +
  state_layer +
  ggtitle("2012 US Election") +
  scale_fill_gradientn(colours = fill_palette) +
  theme_void()

# Convert to an interactive plotly figure and display it.
fig <- ggplotly(p)
fig
There are many packages and tools for creating plots in R, and ggplot2 integrates with just about all of them. The level of customization is nearly endless, so you can design plots for almost anything. The nice addition that plotly brings to your graph is the ability to make the plot interactive. Try clicking on the legend entries on the right to see how the plot changes, or hover your mouse over the map to see additional information on each county.