NBA Data: sqldf & ggplot2

Excellent ggplot2 cheat sheet

# Show the current working directory; the CSV read below uses a path relative to it
getwd()
## [1] "/Users/HollyJones/Desktop/play"
# List the files in the working directory
dir()
## [1] "figure"        "NBA_test.csv"  "NBA_train.csv" "NBA.html"     
## [5] "NBA.md"        "NBA.R"         "NBA.Rmd"       "overlap.png"  
## [9] "play.Rproj"
# Load the training data, then look at its first rows
train <- read.csv(file = "NBA_train.csv", header = TRUE, sep = ",")
head(train)
##   SeasonEnd                Team Playoffs  W  PTS oppPTS   FG  FGA  X2P
## 1      1980       Atlanta Hawks        1 50 8573   8334 3261 7027 3248
## 2      1980      Boston Celtics        1 61 9303   8664 3617 7387 3455
## 3      1980       Chicago Bulls        0 30 8813   9035 3362 6943 3292
## 4      1980 Cleveland Cavaliers        0 37 9360   9332 3811 8041 3775
## 5      1980      Denver Nuggets        0 30 8878   9240 3462 7470 3379
## 6      1980     Detroit Pistons        0 16 8933   9609 3643 7596 3586
##   X2PA X3P X3PA   FT  FTA  ORB  DRB  AST STL BLK  TOV
## 1 6952  13   75 2038 2645 1369 2406 1913 782 539 1495
## 2 6965 162  422 1907 2449 1227 2457 2198 809 308 1539
## 3 6668  70  275 2019 2592 1115 2465 2152 704 392 1684
## 4 7854  36  187 1702 2205 1307 2381 2108 764 342 1370
## 5 7215  83  255 1871 2539 1311 2524 2079 746 404 1533
## 6 7377  57  219 1590 2149 1226 2415 1950 783 562 1742
# Show the last rows of the training data
tail(train)
##     SeasonEnd                   Team Playoffs  W  PTS oppPTS   FG  FGA
## 830      2011 Portland Trail Blazers        1 48 7896   7771 2951 6599
## 831      2011       Sacramento Kings        0 24 8151   8589 3134 6979
## 832      2011      San Antonio Spurs        1 61 8502   8034 3148 6628
## 833      2011        Toronto Raptors        0 22 8124   8639 3144 6755
## 834      2011              Utah Jazz        0 39 8153   8303 3064 6590
## 835      2011     Washington Wizards        0 23 7977   8584 3048 6888
##      X2P X2PA X3P X3PA   FT  FTA  ORB  DRB  AST STL BLK  TOV
## 830 2433 5096 518 1503 1476 1835  996 2230 1736 660 358 1070
## 831 2706 5702 428 1277 1455 1981 1071 2526 1675 608 391 1324
## 832 2463 4901 685 1727 1521 1984  829 2603 1836 602 372 1101
## 833 2799 5664 345 1091 1491 1976  963 2343 1795 581 350 1206
## 834 2629 5334 435 1256 1590 2061  898 2338 1921 629 484 1175
## 835 2656 5706 392 1182 1489 1999 1013 2374 1592 665 502 1258
#SQL QUERIES AND VISUALIZATIONS
library(sqldf)
library(ggplot2)

# 2011 team rankings: wins per team for the season ending in 2011,
# ordered from fewest wins to most
season_2011 <- sqldf(
  "select Team, W from train where SeasonEnd == 2011 order by W"
)
season_2011
##                      Team  W
## 1  Minnesota Timberwolves 17
## 2     Cleveland Cavaliers 19
## 3         Toronto Raptors 22
## 4      Washington Wizards 23
## 5         New Jersey Nets 24
## 6        Sacramento Kings 24
## 7         Detroit Pistons 30
## 8    Los Angeles Clippers 32
## 9       Charlotte Bobcats 34
## 10        Milwaukee Bucks 35
## 11  Golden State Warriors 36
## 12         Indiana Pacers 37
## 13              Utah Jazz 39
## 14           Phoenix Suns 40
## 15     Philadelphia 76ers 41
## 16        New York Knicks 42
## 17        Houston Rockets 43
## 18          Atlanta Hawks 44
## 19      Memphis Grizzlies 46
## 20    New Orleans Hornets 46
## 21 Portland Trail Blazers 48
## 22         Denver Nuggets 50
## 23          Orlando Magic 52
## 24  Oklahoma City Thunder 55
## 25         Boston Celtics 56
## 26       Dallas Mavericks 57
## 27     Los Angeles Lakers 57
## 28             Miami Heat 58
## 29      San Antonio Spurs 61
## 30          Chicago Bulls 62
#Visualize 2011 Team Rankings

# Set the default plot theme: grey theme with an 18pt base font
theme_set(theme_grey(base_size = 18))


# Bar chart of 2011 wins per team. reorder() sorts the teams by W and
# coord_flip() turns the bars horizontal, so teams are listed by win count.
g <- ggplot(season_2011, aes(x = reorder(Team, W), y = W, fill = W))
g +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "NBA Team Wins in 2011", x = "Teams", y = "Wins") +
  theme(
    plot.title = element_text(size = 22, face = "bold", family = "Helvetica"),
    axis.title = element_text(face = "bold", size = 18, color = "black")
  )

plot of chunk unnamed-chunk-1

Correlation Plots

Defensive rebounds are the stat most highly correlated with wins.

#Create a correlation plot to identify relationships with wins column
library(corrplot)

#corrplot only works with numeric variables
#create new df with only numeric variables
library(dplyr)

# Keep only the numeric columns of `train`.
# vapply() is used instead of sapply() because it always returns a logical
# vector, whereas sapply()'s return type depends on its input.
is_numeric_col <- vapply(train, is.numeric, logical(1))
train_numeric <- train[, is_numeric_col, drop = FALSE]

# Drop SeasonEnd and Playoffs from the correlation input
train_numeric <- select(train_numeric, -c(SeasonEnd, Playoffs))
head(train_numeric)
##    W  PTS oppPTS   FG  FGA  X2P X2PA X3P X3PA   FT  FTA  ORB  DRB  AST STL
## 1 50 8573   8334 3261 7027 3248 6952  13   75 2038 2645 1369 2406 1913 782
## 2 61 9303   8664 3617 7387 3455 6965 162  422 1907 2449 1227 2457 2198 809
## 3 30 8813   9035 3362 6943 3292 6668  70  275 2019 2592 1115 2465 2152 704
## 4 37 9360   9332 3811 8041 3775 7854  36  187 1702 2205 1307 2381 2108 764
## 5 30 8878   9240 3462 7470 3379 7215  83  255 1871 2539 1311 2524 2079 746
## 6 16 8933   9609 3643 7596 3586 7377  57  219 1590 2149 1226 2415 1950 783
##   BLK  TOV
## 1 539 1495
## 2 308 1539
## 3 392 1684
## 4 342 1370
## 5 404 1533
## 6 562 1742
library(reshape2)

# Pairwise Pearson correlations between the numeric columns, reshaped by
# melt() into long format (columns Var1, Var2, value) for ggplot.
# Stored as `cor_long` so the data frame does not mask stats::cor().
cor_long <- melt(cor(train_numeric))

theme_set(theme_grey(base_size = 20))

# Heatmap of the correlation matrix: one tile per pair of stats, coloured
# from blue (-1) through white (0) to red (+1).
# geom_tile() has no `label` aesthetic, so no label is mapped here; add a
# geom_text() layer if the rounded values should be printed on the tiles.
ggplot(cor_long, aes(x = Var2, y = Var1)) +
  geom_tile(aes(fill = value), color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), name = "Correlation\n(Pearson)"
  ) +
  coord_equal() +
  ggtitle("Which NBA Stats Are \n Highly Correlated with Wins?") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 1, size = 16, hjust = 1),
    plot.title = element_text(size = 22, face = "bold", family = "Helvetica"),
    axis.title = element_blank()
  )

plot of chunk unnamed-chunk-2

Team Wins Over Time

#visualizes wins over time 
# Inspect the first few values of the Team column
head(train$Team)
## [1] Atlanta Hawks       Boston Celtics      Chicago Bulls      
## [4] Cleveland Cavaliers Denver Nuggets      Detroit Pistons    
## 37 Levels: Atlanta Hawks Boston Celtics ... Washington Wizards
# Teams with at least 56 wins in the season ending in 2011
w_teams_2011 <- sqldf(
  "select Team, W, SeasonEnd from train where SeasonEnd == 2011 AND W >= 56"
)
w_teams_2011
##                 Team  W SeasonEnd
## 1     Boston Celtics 56      2011
## 2      Chicago Bulls 62      2011
## 3   Dallas Mavericks 57      2011
## 4 Los Angeles Lakers 57      2011
## 5         Miami Heat 58      2011
## 6  San Antonio Spurs 61      2011
# Wins per season for three franchises: Boston Celtics, Chicago Bulls and
# Los Angeles Lakers (the query selects these three teams only)
over_time <- sqldf("select Team, W, SeasonEnd from train where Team == 'Boston Celtics' OR Team == 'Chicago Bulls' OR Team == 'Los Angeles Lakers'")

theme_set(theme_grey(base_size = 22))

# Scatter plot of wins by season with a smoothed trend line per team.
# Built with ggplot() rather than the deprecated qplot(), mapping columns of
# `over_time` instead of passing over_time$... vectors. Points are drawn once,
# on top of the smoother.
p <- ggplot(over_time, aes(x = SeasonEnd, y = W, colour = Team)) +
  geom_smooth() +
  geom_point(size = 3) +
  scale_colour_manual(values = c("dark green", "red", "purple")) +
  labs(x = "Season", y = "Wins", title = "Franchise Wins 1980-2011")

# Bold axis titles and plot title; legend on top with no legend title
p + theme(
  axis.title = element_text(face = "bold", size = 18, color = "black"),
  legend.position = "top",
  legend.title = element_blank(),
  plot.title = element_text(size = 22, face = "bold", family = "Helvetica")
)

plot of chunk unnamed-chunk-3