数据导入以及绘图准备

library(readxl)
nbadata=read_excel('F:\\intership\\NBA\\seasonstat.xlsx')
data18=read_excel('F:\\intership\\NBA\\season2018.xlsx')
head(nbadata)
## # A tibble: 6 x 26
##    Year    Rk Team       G    MP    FG   FGA   FGP    P3   P3A   P3P    P2
##   <dbl> <dbl> <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2009.    1. Phoen~   82.  241.  41.1  81.6 0.504  6.70  17.6 0.383  34.4
## 2 2009.    2. Golde~   82.  242.  39.4  86.0 0.458  6.70  18.0 0.373  32.7
## 3 2009.    3. Los A~   82.  241.  40.3  85.1 0.474  6.70  18.5 0.361  33.7
## 4 2009.    4. New Y~   82.  242.  38.5  86.5 0.445 10.0   27.9 0.360  28.5
## 5 2009.    5. India~   82.  242.  39.3  86.3 0.455  8.00  21.0 0.378  31.3
## 6 2009.    6. Denve~   82.  241.  37.3  79.4 0.470  6.70  18.0 0.371  30.6
## # ... with 14 more variables: P2A <dbl>, P2P <dbl>, FT <dbl>, FTA <dbl>,
## #   FTP <dbl>, ORB <dbl>, DRB <dbl>, TRB <dbl>, AST <dbl>, STL <dbl>,
## #   BLK <dbl>, TOV <dbl>, PF <dbl>, PTS <dbl>
avgdata=subset(nbadata,nbadata$Rk==0)
library(ggplot2)
library(gridExtra)
library(dplyr)
library(plotly)
mytheme1=theme(plot.title = element_text(face = 'bold.italic',size = 12,
                                         color='brown',hjust = 0.5),
               axis.title = element_text(face = 'bold',size=10,color = 'brown'),
               axis.text = element_text(face = 'bold',size=9,color='black'),
               panel.background=element_rect(color = 'black'),
               panel.grid.major.y=element_line(color = 'white',linetype = 1),
               panel.grid.minor.y=element_line(color = 'white'),
               legend.position='none',
               legend.title=element_text(size = 10,color='brown',face = 'bold'))

此数据集是NBA从2008-2009赛季到2017-2018赛季(截止到北京时间3.28)的各个球队赛季场均技术数据统计,一共有22项技术指标。

NBA投篮命中率

goalsdata=data.frame(Year=rep(c(2009:2018),3),percentage=c(avgdata$FGP,avgdata$P3P,avgdata$P2P),label=factor(c(rep('Field goal',10),rep('3-Point',10),rep('2-Point',10))))
ggplot(goalsdata,aes(x=Year,y=percentage,color=label))+
  geom_line()+
  geom_point()+
  ggtitle('League Average Goal Percentage')+
  scale_fill_brewer(palette = "Set1")+
  directlabels::geom_dl(aes(label = label), method = "smart.grid")+#添加标签
  mytheme1

我们这里调用了directlabels包的geom_dl()函数,directlabels包能够自动在ggplot2图中添加数据标签,并能自动匹配颜色。从图中可以看出,2008-2009赛季到2013-2014赛季平均投篮命中率处于波动状态,2015年以来,NBA平均投篮命中率处于上升状态。其中‘Field Goal’包括两分投篮与三分投篮。

teamdata=subset(nbadata,nbadata$Rk!=0)
teamdata$Year=factor(teamdata$Year)
teamdata$Rk=factor(teamdata$Rk)
mycol=rainbow(10)#产生离散渐变色
ggplot(teamdata,aes(Year,P3,fill=Year,color=Year))+
  geom_violin(alpha=.2)+
  scale_fill_manual(values = mycol)+
  ggtitle('Violin Plot of League 3-Point Field Goals')+
  mytheme1

ggplot(teamdata,aes(Year,P3P,fill=Year,color=Year))+
  geom_violin(alpha=.3)+
  ggtitle('Violin Plot of League 3-Point Field Goal Percentage')+
  scale_fill_manual(values = mycol)+
  mytheme1

这两幅图是用ggplot2包画的10年来NBA各个球队三分球进球数以及三分球命中率的小提琴图,小提琴的“胖瘦”代表处于这一区间的球队数目,某个区间段越胖说明处于这一区间的球队越多。可以看出2017-2018赛季大部分球队的三分球命中率集中在0.345-0.36之间,而且处于这一区间的球队比以往都多,说明整个NBA的三分水平有了很大程度的提高。

library("yarrr")
mycol=rainbow(10)
opar=par(no.readonly=T)
par(mar=c(3,4,4,2)+0.1,col.main='brown',col.lab='brown')
pirateplot(formula = P3 ~ Year,
           data = teamdata,
           point.o = .4,
           point.col = "gray",
           point.bg = mycol,
           point.pch = 21,
           bar.f.col = mycol,
           bar.b.col = mycol,
           inf.b.col = mycol,
           inf.f.col = mycol,
           avg.line.col = mycol,
           avg.line.o = 0.8,
           bean.b.col = mycol,
           bean.b.o = 0.5,
           bean.f.col = mycol,
           bean.f.o = .2, # Turn down bean filling
           inf.f.o = .5,# Turn up inf filling
           bty = 'n',yaxt = 'n',xlab = '',
           ylab='Goals',xaxt = 'n') 
Year=factor(c(2009:2018))
axis(2,seq(4,16,by=2),seq(4,16,by=2),col='brown',col.axis='brown')
title(main='League 3-Point Goals by Year',col='brown')
text(c(1:10),rep(3,10),2009:2018,col = 'brown')

pirateplot(formula = P3P ~ Year,
           data = teamdata,
           point.o = .4, 
           point.col = "gray",
           point.bg = mycol,
           point.pch = 21,
           bar.f.col = mycol,
           bar.b.col = mycol,
           inf.b.col = mycol,
           inf.f.col = mycol,
           avg.line.col = mycol,
           avg.line.o = 0.8,
           bean.b.col = mycol,
           bean.b.o = 0.5,
           bean.f.col = mycol,
           bean.f.o = .2, # Turn down bean filling
           inf.f.o = .5,# Turn up inf filling
           bty = 'n',yaxt = 'n',xlab = '',
           ylab='Goal  Percentage',xaxt = 'n') 

axis(2,seq(0.30,0.42,by=0.03),seq(0.30,0.42,by=0.03),col='brown',col.axis='brown')
title(main='League 3-Point Goal Percentage by Year',col='brown')
text(c(1:10),rep(0.29,10),2009:2018,col = 'brown')

par(opar)

可以看出,相比上面ggplot2画出的小提琴图,yarrr包绘制的这两张图添加了每年内各个球队的三分球数据点以及每年的boxplot。对于颜色较多的图,适当地设定透明度alpha值可以使图更加“晶莹剔透”,一般来说线条的alpha值要高于填充颜色的alpha值,可以使图更有层次感。

交互式散点图

下面我们研究一下变量间的关系,这里我们画了场均总篮板数vs场均总投球数的散点图以及场均三分球进球数vs场均两分球进球数的散点图,并且添加了回归线以及置信区间。利用plotly包可以实现交互式绘图功能,将鼠标移至点的位置就会自动出现该点的横纵坐标以及所属的group。

(ggplot(teamdata,aes(x=TRB,y=FGA))+
  geom_point(aes(color=Year),alpha=0.8)+
  scale_color_hue( h= c(80, 300))+
  geom_smooth(method = 'lm')+
  xlab('Total Rebounds')+
  ylab('Field Goal Attempts')+
  ggtitle('TRB vs FGA')+
  theme_light()+
  theme(plot.title =element_text(hjust = 0.5,face ='bold.italic',size = 12,
                                 color='black'),
        axis.title = element_text(face = 'bold',size=10,color = 'black'),
        axis.text = element_text(face = 'bold',size=9,color='black'),
        legend.title=element_text(size = 10,color='black',face = 'bold',
                                  hjust = 0.5)))%>%
  ggplotly()
(ggplot(teamdata,aes(x=P2,y=P3))+
  geom_point(aes(color=Year),alpha=0.8)+
  scale_color_hue( h= c(80, 300))+
  geom_smooth(method = 'lm')+
  xlab('2-Point Goals')+
  ylab('3-Point Goals')+
  ggtitle('Goals Per Game')+
  theme_light()+
  theme(plot.title =element_text(hjust = 0.5,face ='bold.italic',size = 12,
                                 color='black'),
        axis.title = element_text(face = 'bold',size=10,color = 'black'),
        axis.text = element_text(face = 'bold',size=9,color='black'),
        legend.title=element_text(size = 10,color='black',face = 'bold',
                                  hjust = 0.5)))%>%
  ggplotly()

环状条形图

下面两张图是关于2017-2018赛季NBA各个球队的场均三分球进球数以及场均失误数的环状条形图。从图中可以看出,今年的NBA最强球队火箭场均三分球数远高于联盟平均水平,并居于联盟第一的位置,而且其场均失误数也是低于联盟平均水平的。

library(tidyverse)
circuplot=function(data){
  data$id=seq(1, nrow(data))
  label_data=data
  number_of_bar=nrow(label_data)
  angle= 90 - 360 * (label_data$id-0.5) /number_of_bar    
  label_data$hjust<-ifelse( angle < -90, 1, 0)
  label_data$angle<-ifelse(angle < -90, angle+180, angle)
  ggplot(data, aes(x=as.factor(id), y=value, fill=group)) +   
    geom_bar(stat="identity", alpha=1) +
    ylim(-10,25) +
    theme_minimal() +
    theme(
      legend.position = "none",
      axis.text = element_blank(),
      axis.title = element_blank(),
      panel.grid = element_blank(),
      plot.margin = unit(rep(-1,4), "cm") 
    ) +
    coord_polar() + 
    geom_text(data=label_data, aes(x=id, y=value+1, label=individual, hjust=hjust), 
              color="black", fontface="bold",alpha=0.8, 
              size=2.5, angle= label_data$angle, inherit.aes = FALSE )+
    scale_fill_brewer(palette = "Set1")
}
#3-point goals
p3data=data.frame(
  individual=data18$Team,
  value=data18$P3,
  group=data18$Diviation
)
circuplot(p3data)

#turnover
tovdata=data.frame(
  individual=data18$Team,
  value=data18$TOV,
  group=data18$Diviation
)
#data = data %>% arrange(group, value)
circuplot(tovdata)

雷达图

这两张图是2017-2018赛季NBA东西部前五的球队场均3分球命中率(P3P),2分球命中率(P2P),个人犯规次数(PF),失误数(TOV),封盖数(BLK),抢断数(STL),助攻数(AST),总篮板数(TRB),罚篮命中率(FTP)9个变量的雷达图。

#radar charts
library(fmsb)
radata=data18[c(1,3,4,6,11),c(11,14,17,20:25)]
rownames(radata)=data18$Team[c(1,3,4,6,11)]
radata=rbind( apply(radata,2, max),apply(radata,2,min),radata)
colors_border=c(rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9),rgb(0.7,0.5,0.1,0.9),
                rgb(0.5,0.3,0.6,0.9),rgb(1,0.4,0.6,0.9))
colors_in=c(rgb(0.2,0.5,0.5,0.4), rgb(0.8,0.2,0.5,0.4),rgb(0.7,0.5,0.1,0.4),
             rgb(0.5,0.3,0.6,0.4),rgb(1,0.4,0.6,0.4) )
radarchart(radata,axistype=1 ,            
            pcol=colors_border , pfcol=colors_in , plwd=2, plty=1,
            cglcol="grey", cglty=1, axislabcol="grey",  
            cglwd=0.8,
            vlcex=0.8
)
legend(x=1.2, y=1, legend = rownames(radata)[3:7], bty = "n", pch=20 ,
       col=colors_in , text.col = "grey", cex=0.8, pt.cex = 1.5)

从东部赛区的雷达图来看,目前居于东部第一的猛龙队在罚篮命中率以及封盖数数上都居于第一,而且其失误数也非常低;凯尔特人队的三分球命中率最高,但是其两分球命中率却是最低的;骑士队的两分球命中率最高,但是其场均篮板数最低;暂居东部第四的76人队在助攻数和篮板数均遥遥领先其他四支球队,但其失误数和个人犯规数上也位居第一。

radata2=data18[c(17,18,20,25,27),c(11,14,17,20:25)]
rownames(radata2)=data18$Team[c(17,18,20,25,27)]
radata2=rbind(apply(radata2,2, max),apply(radata2,2,min),radata2)
colors_border=c(rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9) , rgb(0.7,0.5,0.1,0.9),
                rgb(0.5,0.3,0.6,0.9),rgb(1,0.4,0.6,0.9))
colors_in=c( rgb(0.2,0.5,0.5,0.4), rgb(0.8,0.2,0.5,0.4) , rgb(0.7,0.5,0.1,0.4),
             rgb(0.5,0.3,0.6,0.4),rgb(1,0.4,0.6,0.4) )

radarchart( radata2  , axistype=1 ,            
            pcol=colors_border , pfcol=colors_in , plwd=2, plty=1,
            cglcol="grey", cglty=1, axislabcol="grey",  
            cglwd=0.8,
            vlcex=0.8
)
legend(x=1, y=1, legend = rownames(radata2)[3:7], bty = "n", pch=20 ,
       col=colors_in , text.col = "grey", cex=0.8, pt.cex = 1.5)

从西部赛区的雷达图来看,西部第二的勇士队在三分球命中率、两分球命中率、罚球命中率、助攻数、封盖数上都位居第一,但是其失误数却远高于其他四支球队。西部第一火箭队,在两分球命中率上与勇士仅有非常微小的差距,在上面的环状条形图上也可以看出火箭队的场均三分球数是高于勇士的,这也是火箭现在稳居西部第一的原因之一。

关于数据可视化,笔者认为,清晰的逻辑以及多维度的表达是至关重要的,合理地运用色彩、点的形状、气泡图来可以在二维平面上表达更高维的变量信息。当然图的美观也是很重要的,没有人会喜欢看一张“老气横秋”的图,对于R user而言,The R Graph Gallery 是一个很好的平台,上面有各种类型的数据可视化作品可用来借鉴学习。