# Show the working directory and the files it contains.
getwd()
## [1] "/Users/HollyJones/Desktop/play"
list.files()
## [1] "figure" "NBA_test.csv" "NBA_train.csv" "NBA.html"
## [5] "NBA.md" "NBA.R" "NBA.Rmd" "overlap.png"
## [9] "play.Rproj"
# Read the training data (comma-separated, first row holds the column names)
# and preview its first six rows.
train <- read.csv(file = "NBA_train.csv", sep = ",", header = TRUE)
head(train, n = 6L)
## SeasonEnd Team Playoffs W PTS oppPTS FG FGA X2P
## 1 1980 Atlanta Hawks 1 50 8573 8334 3261 7027 3248
## 2 1980 Boston Celtics 1 61 9303 8664 3617 7387 3455
## 3 1980 Chicago Bulls 0 30 8813 9035 3362 6943 3292
## 4 1980 Cleveland Cavaliers 0 37 9360 9332 3811 8041 3775
## 5 1980 Denver Nuggets 0 30 8878 9240 3462 7470 3379
## 6 1980 Detroit Pistons 0 16 8933 9609 3643 7596 3586
## X2PA X3P X3PA FT FTA ORB DRB AST STL BLK TOV
## 1 6952 13 75 2038 2645 1369 2406 1913 782 539 1495
## 2 6965 162 422 1907 2449 1227 2457 2198 809 308 1539
## 3 6668 70 275 2019 2592 1115 2465 2152 704 392 1684
## 4 7854 36 187 1702 2205 1307 2381 2108 764 342 1370
## 5 7215 83 255 1871 2539 1311 2524 2079 746 404 1533
## 6 7377 57 219 1590 2149 1226 2415 1950 783 562 1742
# Show the last six rows of the training data.
tail(train, n = 6L)
## SeasonEnd Team Playoffs W PTS oppPTS FG FGA
## 830 2011 Portland Trail Blazers 1 48 7896 7771 2951 6599
## 831 2011 Sacramento Kings 0 24 8151 8589 3134 6979
## 832 2011 San Antonio Spurs 1 61 8502 8034 3148 6628
## 833 2011 Toronto Raptors 0 22 8124 8639 3144 6755
## 834 2011 Utah Jazz 0 39 8153 8303 3064 6590
## 835 2011 Washington Wizards 0 23 7977 8584 3048 6888
## X2P X2PA X3P X3PA FT FTA ORB DRB AST STL BLK TOV
## 830 2433 5096 518 1503 1476 1835 996 2230 1736 660 358 1070
## 831 2706 5702 428 1277 1455 1981 1071 2526 1675 608 391 1324
## 832 2463 4901 685 1727 1521 1984 829 2603 1836 602 372 1101
## 833 2799 5664 345 1091 1491 1976 963 2343 1795 581 350 1206
## 834 2629 5334 435 1256 1590 2061 898 2338 1921 629 484 1175
## 835 2656 5706 392 1182 1489 1999 1013 2374 1592 665 502 1258
#SQL QUERIES AND VISUALIZATIONS
library(sqldf)
library(ggplot2)
# 2011 team rankings: wins per team for the season ending in 2011,
# ordered from fewest to most wins.
wins_2011_query <- "select Team, W from train where SeasonEnd == 2011 order by W"
season_2011 <- sqldf(wins_2011_query)
print(season_2011)
## Team W
## 1 Minnesota Timberwolves 17
## 2 Cleveland Cavaliers 19
## 3 Toronto Raptors 22
## 4 Washington Wizards 23
## 5 New Jersey Nets 24
## 6 Sacramento Kings 24
## 7 Detroit Pistons 30
## 8 Los Angeles Clippers 32
## 9 Charlotte Bobcats 34
## 10 Milwaukee Bucks 35
## 11 Golden State Warriors 36
## 12 Indiana Pacers 37
## 13 Utah Jazz 39
## 14 Phoenix Suns 40
## 15 Philadelphia 76ers 41
## 16 New York Knicks 42
## 17 Houston Rockets 43
## 18 Atlanta Hawks 44
## 19 Memphis Grizzlies 46
## 20 New Orleans Hornets 46
## 21 Portland Trail Blazers 48
## 22 Denver Nuggets 50
## 23 Orlando Magic 52
## 24 Oklahoma City Thunder 55
## 25 Boston Celtics 56
## 26 Dallas Mavericks 57
## 27 Los Angeles Lakers 57
## 28 Miami Heat 58
## 29 San Antonio Spurs 61
## 30 Chicago Bulls 62
# Bar chart of the 2011 team rankings.
# Set the default theme and base font size for the plots that follow.
theme_set(theme_grey(base_size = 18))
# Teams are ordered by win count on the x axis; coord_flip() then turns the
# bars horizontal so the team names are readable.
g <- ggplot(season_2011, aes(x = reorder(Team, W), y = W, fill = W))
g +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "NBA Team Wins in 2011", x = "Teams", y = "Wins") +
  theme(
    plot.title = element_text(size = 22, face = "bold", family = "Helvetica"),
    axis.title = element_text(face = "bold", size = 18, color = "black")
  )
# Defensive rebounds are the stat most highly correlated with wins.
#Create a correlation plot to identify relationships with wins column
library(corrplot)
#corrplot only works with numeric variables
#create new df with only numeric variables
library(dplyr)
# Keep only the numeric columns of train, then drop the SeasonEnd and
# Playoffs columns so that W and the per-team statistics remain.
is_num_col <- vapply(train, is.numeric, logical(1))
train_numeric <- train[, is_num_col, drop = FALSE]
train_numeric <- train_numeric %>% select(-c(SeasonEnd, Playoffs))
head(train_numeric)
## W PTS oppPTS FG FGA X2P X2PA X3P X3PA FT FTA ORB DRB AST STL
## 1 50 8573 8334 3261 7027 3248 6952 13 75 2038 2645 1369 2406 1913 782
## 2 61 9303 8664 3617 7387 3455 6965 162 422 1907 2449 1227 2457 2198 809
## 3 30 8813 9035 3362 6943 3292 6668 70 275 2019 2592 1115 2465 2152 704
## 4 37 9360 9332 3811 8041 3775 7854 36 187 1702 2205 1307 2381 2108 764
## 5 30 8878 9240 3462 7470 3379 7215 83 255 1871 2539 1311 2524 2079 746
## 6 16 8933 9609 3643 7596 3586 7377 57 219 1590 2149 1226 2415 1950 783
## BLK TOV
## 1 539 1495
## 2 308 1539
## 3 392 1684
## 4 342 1370
## 5 404 1533
## 6 562 1742
library(reshape2)
# Pairwise Pearson correlations between all columns of train_numeric, reshaped
# by melt() into long format (Var1, Var2, value) so ggplot2 can draw one tile
# per pair. stats::cor is called explicitly because the result is stored under
# the name `cor`, which shadows the function's name in the workspace.
cor <- melt(stats::cor(train_numeric))
# Correlations rounded to three decimals; not used by the plot below.
l <- round(cor$value, 3)
theme_set(theme_grey(base_size = 20))
# Heat map of the correlation matrix. geom_tile() has no `label` aesthetic
# (ggplot2 ignores it with a warning), so only `fill` is mapped here; the layer
# inherits its data from ggplot().
ggplot(cor, aes(Var2, Var1)) +
  geom_tile(aes(fill = value), color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), name = "Correlation\n(Pearson)"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 16, hjust = 1)) +
  coord_equal() +
  ggtitle("Which NBA Stats Are \n Highly Correlated with Wins?") +
  theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica")) +
  theme(axis.title = element_blank())
# Wins over time
# Peek at the first entries of the Team column.
head(train[["Team"]])
## [1] Atlanta Hawks Boston Celtics Chicago Bulls
## [4] Cleveland Cavaliers Denver Nuggets Detroit Pistons
## 37 Levels: Atlanta Hawks Boston Celtics ... Washington Wizards
# Teams with at least 56 wins in the season ending in 2011
# (six teams, per the output below).
top_2011_query <- "select Team, W, SeasonEnd from train where SeasonEnd == 2011 AND W >= 56"
w_teams_2011 <- sqldf(top_2011_query)
print(w_teams_2011)
## Team W SeasonEnd
## 1 Boston Celtics 56 2011
## 2 Chicago Bulls 62 2011
## 3 Dallas Mavericks 57 2011
## 4 Los Angeles Lakers 57 2011
## 5 Miami Heat 58 2011
## 6 San Antonio Spurs 61 2011
# Wins per season for three of 2011's top teams (Boston Celtics, Chicago
# Bulls, Los Angeles Lakers). Only these three are selected by the query,
# not all six teams listed in w_teams_2011.
over_time <- sqldf(
  "select Team, W, SeasonEnd from train
   where Team in ('Boston Celtics', 'Chicago Bulls', 'Los Angeles Lakers')"
)
theme_set(theme_grey(base_size = 22))
# Name each colour by team so the mapping does not depend on level order.
franchise_colours <- c(
  "Boston Celtics" = "dark green",
  "Chicago Bulls" = "red",
  "Los Angeles Lakers" = "purple"
)
# Built with ggplot() instead of the deprecated qplot(). The smoother is drawn
# first and a single layer of size-3 points on top; the original drew the
# points twice (once via qplot's geom and once via geom_point).
p <- ggplot(over_time, aes(x = SeasonEnd, y = W, colour = Team)) +
  geom_smooth() +
  geom_point(size = 3) +
  labs(x = "Season", y = "Wins", title = "Franchise Wins 1980-2011") +
  scale_colour_manual(values = franchise_colours)
p +
  theme(
    axis.title = element_text(face = "bold", size = 18, color = "black"),
    legend.position = "top",
    legend.title = element_blank(),
    plot.title = element_text(size = 22, face = "bold", family = "Helvetica")
  )