基于R的奥林匹克运动员数据可视化

数据网址: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results

## 加载库
library(readr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(treemap)
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(tidyr)
library(circlize)
## ========================================
## circlize version 0.4.4
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization 
##   in R. Bioinformatics 2014.
## ========================================
library(sp)
library(rworldmap)
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
library(RColorBrewer)
library(circlize)
library(treemap)
library(d3heatmap)
library(gganimate)
library(ggalluvial)
library(ggridges)
## 
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
## 
##     scale_discrete_manual
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(ggChernoff)

读取数据

## 读取数据,数据融合
athlete_events <- read_csv("athlete_events.csv")
## Parsed with column specification:
## cols(
##   ID = col_integer(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_integer(),
##   Height = col_integer(),
##   Weight = col_double(),
##   Team = col_character(),
##   NOC = col_character(),
##   Games = col_character(),
##   Year = col_integer(),
##   Season = col_character(),
##   City = col_character(),
##   Sport = col_character(),
##   Event = col_character(),
##   Medal = col_character()
## )
noc_regions <- read_csv("noc_regions.csv")
## Parsed with column specification:
## cols(
##   NOC = col_character(),
##   region = col_character(),
##   notes = col_character()
## )
## 数据连接
athletedata <- inner_join(athlete_events,noc_regions[,1:2],by=c("NOC"="NOC"))

## 查看数据
summary(athletedata)
##        ID             Name               Sex                 Age       
##  Min.   :     1   Length:270767      Length:270767      Min.   :10.00  
##  1st Qu.: 34630   Class :character   Class :character   1st Qu.:21.00  
##  Median : 68187   Mode  :character   Mode  :character   Median :24.00  
##  Mean   : 68229                                         Mean   :25.56  
##  3rd Qu.:102066                                         3rd Qu.:28.00  
##  Max.   :135571                                         Max.   :97.00  
##                                                         NA's   :9462   
##      Height          Weight           Team               NOC           
##  Min.   :127.0   Min.   : 25.00   Length:270767      Length:270767     
##  1st Qu.:168.0   1st Qu.: 60.00   Class :character   Class :character  
##  Median :175.0   Median : 70.00   Mode  :character   Mode  :character  
##  Mean   :175.3   Mean   : 70.71                                        
##  3rd Qu.:183.0   3rd Qu.: 79.00                                        
##  Max.   :226.0   Max.   :214.00                                        
##  NA's   :60083   NA's   :62785                                         
##     Games                Year         Season              City          
##  Length:270767      Min.   :1896   Length:270767      Length:270767     
##  Class :character   1st Qu.:1960   Class :character   Class :character  
##  Mode  :character   Median :1988   Mode  :character   Mode  :character  
##                     Mean   :1978                                        
##                     3rd Qu.:2002                                        
##                     Max.   :2016                                        
##                                                                         
##     Sport              Event              Medal          
##  Length:270767      Length:270767      Length:270767     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     region         
##  Length:270767     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
head(athletedata)
## # A tibble: 6 x 16
##      ID Name  Sex     Age Height Weight Team  NOC   Games  Year Season
##   <int> <chr> <chr> <int>  <int>  <dbl> <chr> <chr> <chr> <int> <chr> 
## 1     1 A Di… M        24    180     80 China CHN   1992…  1992 Summer
## 2     2 A La… M        23    170     60 China CHN   2012…  2012 Summer
## 3     3 Gunn… M        24     NA     NA Denm… DEN   1920…  1920 Summer
## 4     4 Edga… M        34     NA     NA Denm… DEN   1900…  1900 Summer
## 5     5 Chri… F        21    185     82 Neth… NED   1988…  1988 Winter
## 6     5 Chri… F        21    185     82 Neth… NED   1988…  1988 Winter
## # ... with 5 more variables: City <chr>, Sport <chr>, Event <chr>,
## #   Medal <chr>, region <chr>
colnames(athletedata)
##  [1] "ID"     "Name"   "Sex"    "Age"    "Height" "Weight" "Team"  
##  [8] "NOC"    "Games"  "Year"   "Season" "City"   "Sport"  "Event" 
## [15] "Medal"  "region"

数据可视化

1.可视化随时间变化参加奥运会的运动员数

## 查看不同季节举办的的奥运会运动员人数变化
 plotdata <- athletedata%>%group_by(Year,Season)%>%
  summarise(number=n())

ggplot(plotdata,aes(x=Year,y=number))+
  theme_bw(base_family = "STKaiti")+
  geom_line(aes(colour=Season))+
  theme(legend.position = "top")+
  labs(x="举办时间",y="运动员人数",title="奥运会运动员人数变化")+
  scale_x_continuous(breaks=unique( plotdata$Year)) +
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5))

2.查看不同时间举办的的奥运会运动员人数性别情况

## 查看不同时间举办的的奥运会运动员人数性别情况
 plotdata <- athletedata%>%group_by(Year,Sex)%>%
  summarise(number=n())

ggplot(plotdata,aes(x=Year,y=number))+
  theme_bw(base_family = "STKaiti")+
  geom_bar(aes(fill=Sex),stat = "identity",position = "fill")+
  theme(legend.position = "top")+
  labs(x="举办时间",y="运动员人数百分比",title="奥运会运动员人数变化")+
  scale_x_continuous(breaks=unique( plotdata$Year)) +
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5),
        plot.title = element_text(hjust = 0.5))

### 3.查看国家参与奥运会运动员人数

## 查看国家参与奥运会运动员人数
plotdata <- athletedata%>%group_by(region)%>%
  summarise(number=n())%>%
   arrange(desc(number))

 ## 可视化前40个人数多的国家的参与人数
 
ggplot(plotdata[1:30,],aes(x=reorder(region,number),y=number))+
  theme_bw(base_family = "STKaiti")+
  geom_bar(aes(fill=number),stat = "identity")+
  coord_flip()+
  scale_fill_gradient(low = "#56B1F7", high = "#132B43")+
  labs(x="地区",y="运动员人数",title="不同地区奥运会运动员人数")+
  theme(axis.text.x = element_text(vjust = 0.5),
        plot.title = element_text(hjust = 0.5))

### 4.可视化人数最多的30个地区,不同年份运动员人数变化

## 可视化人数最多的30个地区,不同年份运动员人数变化
region30 <- athletedata%>%group_by(region)%>%
  summarise(number=n())%>%
   arrange(desc(number))
region30 <- region30$region[1:30]

plotdata <- athletedata[athletedata$region %in%region30,]%>%
  group_by(region,Year)%>%
  summarise(number=n())

ggplot(data=plotdata, aes(x=Year,y=region)) + 
  theme_bw(base_family = "STKaiti") +
  geom_tile(aes(fill = number),colour = "white")+
  scale_fill_gradientn(colours=rev(brewer.pal(10,"RdYlGn")))+
  scale_x_continuous(breaks=unique( plotdata$Year)) +
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5),
        legend.position = "top")

5.不同性别下的,可视化人数最多的150个地区,不同年份运动员人数变化

## 不同性别下的,可视化人数最多的150个地区,不同年份运动员人数变化
plotdata <- athletedata[athletedata$region %in%region30[1:15],]%>%
  group_by(region,Year,Sex)%>%
  summarise(number=n())

ggplot(data=plotdata, aes(x=Year,y=region)) + 
  theme_bw(base_family = "STKaiti") +
  geom_tile(aes(fill = number),colour = "white")+
  scale_fill_gradientn(colours=rev(brewer.pal(10,"RdYlGn")))+
  scale_x_continuous(breaks=unique( plotdata$Year)) +
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
  facet_wrap(~Sex,nrow = 2)

6.使用treemap 可视化数据

## 使用treemap 可视化数据
plotdata <- athletedata%>%
  group_by(region,Sex)%>%
  summarise(number=n())


treemap(plotdata,index = c("Sex","region"),vSize = "number",
        title = "不同性别下每个国家的运动员人数",fontfamily.title = "STKaiti")

7.使用treemap可视化更多变量的数据

## 使用treemap 可视化数据
plotdata <- athletedata%>%
  group_by(region,Sex)%>%
  summarise(number=n())
##  计算奖牌数量
plotdata2 <- athletedata[!is.na(athletedata$Medal),]%>%
  group_by(region,Sex)%>%
  summarise(Medalnum=n())
## 合并数据
plotdata3 <- inner_join(plotdata2,plotdata,by=c("region", "Sex"))

treemap(plotdata3,index = c("Sex","region"),vSize = "number",
        vColor = "Medalnum",type="value",palette="RdYlGn",
        title = "不同性别下每个国家的运动员人数",fontfamily.title = "STKaiti",
        title.legend = "奖牌数量",fontfamily.legend="STKaiti")

### 8.关于不同时间不同国家国家参赛人数的热力图

## 热力图
region30 <- athletedata%>%group_by(region)%>%
  summarise(number=n())%>%
   arrange(desc(number))
region30 <- region30$region[1:30]

plotdata <- athletedata[athletedata$region %in%region30,]%>%
  group_by(region,Year)%>%
  summarise(number=n())

## 长数据转换为宽数据
plotdata2 <- tidyr::spread(plotdata,key="Year",value="number")
plotdata2[is.na(plotdata2)] <- 0
rownames(plotdata2) <- plotdata2$region
## Warning: Setting row names on a tibble is deprecated.
plotdata2$region <- NULL
plotdata2 <- as.matrix(plotdata2)

d3heatmap(plotdata2,scale = "column", dendrogram = "none",
          colors = "Blues")

9.数据动态可视化

plotdata <- athletedata%>%
  group_by(Year,region,Sex,Season)%>%
  summarise(athletenum=n())
##  计算奖牌数量
plotdata2 <- athletedata[!is.na(athletedata$Medal),]%>%
  group_by(Year,region,Sex,Season)%>%
  summarise(Medalnum=n())
## 合并数据
plotdata3 <- inner_join(plotdata2,plotdata,by=c("Year","region","Sex","Season"))


ggplot(plotdata3,aes(x=athletenum,y=Medalnum,colour=Sex))+
  theme_bw(base_family = "STKaiti")+
  geom_point(alpha = 0.7)+
  facet_wrap(~Season)+
  transition_time(Year) +
  labs(title = 'Year: {frame_time}', x = "athlete number", y = "Medal number")

10.数据动态可视化每年的运动员年龄分布

## 去除缺失值
plotdata <- athletedata[!is.na(athletedata$Age),]

ggplot(plotdata,aes(fill=Sex))+
  theme_bw(base_family = "STKaiti")+
  geom_density(aes(Age),alpha = 0.5,n=50,size=0.2)+
  scale_fill_brewer(palette="RdYlGn")+
  transition_time(Year) +
  labs(title = 'Year: {frame_time}', x = "Age", y = "Freq")

11.地图可视化奖牌的获取情况

## 11.地图可视化奖牌的获取情况
##  计算奖牌数量
plotdata <- athletedata[!is.na(athletedata$Medal),]%>%
  group_by(Year,region,Sex,Season,Sport)%>%
  summarise(Medalnum=n())


plotdata2 <- plotdata%>%
  group_by(region)%>%
  summarise(Medalnum=sum(Medalnum))

## plot map
malMap <- joinCountryData2Map(plotdata2, joinCode = "NAME",
                              nameJoinColumn = "region")
## 133 codes from your data successfully matched countries in the map
## 3 codes from your data failed to match with a country code in the map
## 110 codes from the map weren't represented in your data
mapCountryData(mapToPlot = malMap,nameColumnToPlot = "Medalnum",
               catMethod = "fixedWidth",numCats = 20,
               oceanCol = "steelblue1",missingCountryCol = "white",
               mapTitle = "Medal number by regin",
               aspect = "variable") 
## Warning in plot.window(xlim = xlim, ylim = ylim, asp = aspect): 强制改变过
## 程中产生了NA

12.可视化运动的类型

##  计算奖牌数量
plotdata <- athletedata%>%
  group_by(Sex,Sport)%>%
  summarise(athletenum=n())%>%
  arrange(desc(athletenum))

Sport <- plotdata%>%group_by(Sport)%>%
  summarise(athletenum=sum(athletenum))%>%
  arrange(desc(athletenum))
Sport20 <- Sport$Sport[1:20]
## 每种运动中运动员的数量
ggplot(plotdata,aes(x=reorder(Sport,athletenum),y=athletenum,fill=Sex))+
  theme_bw(base_family = "STKaiti")+
  geom_bar(stat = "identity",position = "stack")+
  coord_flip()+
  labs(x="Sport",y="Athlete number")

### 13.动态可视化运动的类型的人数

##  计算奖牌数量
plotdata <- athletedata%>%
  group_by(Year,Sex,Sport)%>%
  summarise(athletenum=n())



## 每种运动中运动员的数量
ggplot(plotdata,aes(x=reorder(Sport,athletenum),y=athletenum,fill=Sex))+
  theme_bw(base_family = "STKaiti")+
  geom_bar(stat = "identity",position = "stack")+
  coord_flip()+
  transition_time(Year) +
  labs(title = 'Year: {frame_time}', x="Sport",y="Athlete number")

14.可视化冲击图

##  整理数据
index <- (athletedata$Sport %in% Sport20[1:5]) &(athletedata$region %in% region30[1:5])
plotdata <- athletedata[index,]
plotdata$Age <- cut_number(plotdata$Age,4)
plotdata2 <- plotdata%>%
  group_by(Year,Age,Sex,Sport,region,Season)%>%
  summarise(athletenum=n())

ggplot(plotdata2,
       aes(axis1 = Season, axis2 = region, axis3 =Sport,axis4=Age,
           y = athletenum)) +
  theme_bw()+
  geom_alluvium(aes(fill = Sex)) +
  scale_x_discrete(limits = c("Season","Sport", "region","Age"))+
  geom_stratum() + geom_text(stat = "stratum", label.strata = TRUE)

15.可视化每个国家热门运动的情况

##  整理数据

index <- (athletedata$Sport %in% Sport20[1:20]) &(athletedata$region %in% region30[1:30])
plotdata <- athletedata[index,]
plotdata2 <- plotdata%>%
  group_by(Year,Sport,region,Sex)%>%
  summarise(athletenum=n())

ggplot(data=plotdata2, aes(x=region,y=Sport)) + 
  theme_bw(base_family = "STKaiti") +
  geom_tile(aes(fill = athletenum),colour = "white")+
  scale_fill_gradientn(colours=rev(brewer.pal(10,"RdYlGn")))+
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
  facet_wrap(~Sex,nrow = 2)+
  transition_time(Year) +
  labs(title = 'Year: {frame_time}')

16.可视化每种热门运动的运动员年龄情况

## 16.可视化每种热门运动的运动员年龄情况
index <- ((athletedata$Sport %in% Sport20[1:20])&(!is.na(athletedata$Age)))
plotdata <- athletedata[index,]

ggplot(plotdata,aes(x=Age,y=Sport,fill=..x..))+
  geom_density_ridges_gradient()+
  facet_wrap(~Sex)+
  labs(title="Age in sport")+
  theme(plot.title = element_text(hjust = 0.5))
## Picking joint bandwidth of 0.986
## Picking joint bandwidth of 0.73

17.可视化每年每种热门运动的运动员年龄情况

## 17.可视化每种热门运动的运动员年龄情况

index <- (athletedata$Sport %in% Sport20[1:10]&(!is.na(athletedata$Age)))
plotdata <- athletedata[index,]

ggplot(plotdata,aes(x=Age,y=Sport,fill=..x..))+
  theme_bw()+
  geom_density_ridges_gradient()+
  facet_wrap(~Sex)+
  labs(title="Age in sport")+
  theme(plot.title = element_text(hjust = 0.5))+
  transition_time(Year) +
  labs(title = 'Year: {frame_time}')
## Picking joint bandwidth of 1.38
## Picking joint bandwidth of 1.26

18.可视化每个地区每年奖牌的获取情况

## 18.可视化每个地区每年奖牌的获取情况

index <- (athletedata$region %in% region30[1:20]&(!is.na(athletedata$Medal)))
plotdata <- athletedata[index,]

plotdata2 <- plotdata%>%group_by(Year,region,Medal)%>%
  summarise(Medalnum = n())


ggplot(plotdata2,aes(x=region,y=Medalnum,fill=Medal))+
  theme_bw()+
  geom_bar(stat = "identity",position = "stack")+
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
  scale_fill_brewer(palette="RdYlGn")+
  transition_time(Year) +
  labs(title = 'Year: {frame_time}')

19.可视化运动员的身高体重数据

index <- ((athletedata$region%in%region30[1:15])&(!is.na(athletedata$Height))&(!is.na(athletedata$Weight)))
plotdata <- athletedata[index,]

ggplot(plotdata,aes(x=Weight,y=Height,colour=Sex))+
  theme_bw()+
  geom_point(alpha=0.5)+
  geom_smooth(method="lm")+
  theme(plot.title = element_text(hjust = 0.5))+
  facet_wrap(~region,scales = "fixed")+
  transition_time(Year) +
  labs(title = 'Year: {frame_time}')
## Warning in qt((1 - level)/2, df): 产生了NaNs

## Warning in qt((1 - level)/2, df): 产生了NaNs

## Warning in qt((1 - level)/2, df): 产生了NaNs

## Warning in qt((1 - level)/2, df): 产生了NaNs

## Warning in qt((1 - level)/2, df): 产生了NaNs

20.可视化使用表情包

## 查看不同季节举办的的奥运会运动员人数变化

region6 <- c("USA","Germany","France" ,"UK","Russia","China")
index <- ((athletedata$region %in% region6)&(!is.na(athletedata$Medal))&(athletedata$Season=="Summer"))
plotdata <- athletedata[index,]
plotdata2 <- plotdata%>%group_by(Year,region)%>%
  summarise(Medalnum=n())

ggplot(plotdata2,aes(x=Year,y=Medalnum))+
  theme_bw(base_family = "STKaiti")+
  geom_line()+
  geom_chernoff(fill = 'goldenrod1')+
  facet_wrap(~region,ncol = 2)+
  labs(x="举办时间",y="奖牌数")