数据网址: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results
## 加载库
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(treemap)
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(tidyr)
library(circlize)
## ========================================
## circlize version 0.4.4
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
## ========================================
library(sp)
library(rworldmap)
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
library(RColorBrewer)
library(circlize)
library(treemap)
library(d3heatmap)
library(gganimate)
library(ggalluvial)
library(ggridges)
##
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
##
## scale_discrete_manual
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ggChernoff)
## 读取数据,数据融合
athlete_events <- read_csv("athlete_events.csv")
## Parsed with column specification:
## cols(
## ID = col_integer(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_integer(),
## Height = col_integer(),
## Weight = col_double(),
## Team = col_character(),
## NOC = col_character(),
## Games = col_character(),
## Year = col_integer(),
## Season = col_character(),
## City = col_character(),
## Sport = col_character(),
## Event = col_character(),
## Medal = col_character()
## )
noc_regions <- read_csv("noc_regions.csv")
## Parsed with column specification:
## cols(
## NOC = col_character(),
## region = col_character(),
## notes = col_character()
## )
## 数据连接
athletedata <- inner_join(athlete_events,noc_regions[,1:2],by=c("NOC"="NOC"))
## 查看数据
summary(athletedata)
## ID Name Sex Age
## Min. : 1 Length:270767 Length:270767 Min. :10.00
## 1st Qu.: 34630 Class :character Class :character 1st Qu.:21.00
## Median : 68187 Mode :character Mode :character Median :24.00
## Mean : 68229 Mean :25.56
## 3rd Qu.:102066 3rd Qu.:28.00
## Max. :135571 Max. :97.00
## NA's :9462
## Height Weight Team NOC
## Min. :127.0 Min. : 25.00 Length:270767 Length:270767
## 1st Qu.:168.0 1st Qu.: 60.00 Class :character Class :character
## Median :175.0 Median : 70.00 Mode :character Mode :character
## Mean :175.3 Mean : 70.71
## 3rd Qu.:183.0 3rd Qu.: 79.00
## Max. :226.0 Max. :214.00
## NA's :60083 NA's :62785
## Games Year Season City
## Length:270767 Min. :1896 Length:270767 Length:270767
## Class :character 1st Qu.:1960 Class :character Class :character
## Mode :character Median :1988 Mode :character Mode :character
## Mean :1978
## 3rd Qu.:2002
## Max. :2016
##
## Sport Event Medal
## Length:270767 Length:270767 Length:270767
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## region
## Length:270767
## Class :character
## Mode :character
##
##
##
##
head(athletedata)
## # A tibble: 6 x 16
## ID Name Sex Age Height Weight Team NOC Games Year Season
## <int> <chr> <chr> <int> <int> <dbl> <chr> <chr> <chr> <int> <chr>
## 1 1 A Di… M 24 180 80 China CHN 1992… 1992 Summer
## 2 2 A La… M 23 170 60 China CHN 2012… 2012 Summer
## 3 3 Gunn… M 24 NA NA Denm… DEN 1920… 1920 Summer
## 4 4 Edga… M 34 NA NA Denm… DEN 1900… 1900 Summer
## 5 5 Chri… F 21 185 82 Neth… NED 1988… 1988 Winter
## 6 5 Chri… F 21 185 82 Neth… NED 1988… 1988 Winter
## # ... with 5 more variables: City <chr>, Sport <chr>, Event <chr>,
## # Medal <chr>, region <chr>
colnames(athletedata)
## [1] "ID" "Name" "Sex" "Age" "Height" "Weight" "Team"
## [8] "NOC" "Games" "Year" "Season" "City" "Sport" "Event"
## [15] "Medal" "region"
## 查看不同季节举办的的奥运会运动员人数变化
plotdata <- athletedata%>%group_by(Year,Season)%>%
summarise(number=n())
ggplot(plotdata,aes(x=Year,y=number))+
theme_bw(base_family = "STKaiti")+
geom_line(aes(colour=Season))+
theme(legend.position = "top")+
labs(x="举办时间",y="运动员人数",title="奥运会运动员人数变化")+
scale_x_continuous(breaks=unique( plotdata$Year)) +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5))
## 查看不同时间举办的的奥运会运动员人数性别情况
plotdata <- athletedata%>%group_by(Year,Sex)%>%
summarise(number=n())
ggplot(plotdata,aes(x=Year,y=number))+
theme_bw(base_family = "STKaiti")+
geom_bar(aes(fill=Sex),stat = "identity",position = "fill")+
theme(legend.position = "top")+
labs(x="举办时间",y="运动员人数百分比",title="奥运会运动员人数变化")+
scale_x_continuous(breaks=unique( plotdata$Year)) +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5),
plot.title = element_text(hjust = 0.5))
### 3.查看国家参与奥运会运动员人数
## 查看国家参与奥运会运动员人数
plotdata <- athletedata%>%group_by(region)%>%
summarise(number=n())%>%
arrange(desc(number))
## 可视化前40个人数多的国家的参与人数
ggplot(plotdata[1:30,],aes(x=reorder(region,number),y=number))+
theme_bw(base_family = "STKaiti")+
geom_bar(aes(fill=number),stat = "identity")+
coord_flip()+
scale_fill_gradient(low = "#56B1F7", high = "#132B43")+
labs(x="地区",y="运动员人数",title="不同地区奥运会运动员人数")+
theme(axis.text.x = element_text(vjust = 0.5),
plot.title = element_text(hjust = 0.5))
### 4.可视化人数最多的30个地区,不同年份运动员人数变化
## 可视化人数最多的30个地区,不同年份运动员人数变化
region30 <- athletedata%>%group_by(region)%>%
summarise(number=n())%>%
arrange(desc(number))
region30 <- region30$region[1:30]
plotdata <- athletedata[athletedata$region %in%region30,]%>%
group_by(region,Year)%>%
summarise(number=n())
ggplot(data=plotdata, aes(x=Year,y=region)) +
theme_bw(base_family = "STKaiti") +
geom_tile(aes(fill = number),colour = "white")+
scale_fill_gradientn(colours=rev(brewer.pal(10,"RdYlGn")))+
scale_x_continuous(breaks=unique( plotdata$Year)) +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5),
legend.position = "top")
## 不同性别下的,可视化人数最多的150个地区,不同年份运动员人数变化
plotdata <- athletedata[athletedata$region %in%region30[1:15],]%>%
group_by(region,Year,Sex)%>%
summarise(number=n())
ggplot(data=plotdata, aes(x=Year,y=region)) +
theme_bw(base_family = "STKaiti") +
geom_tile(aes(fill = number),colour = "white")+
scale_fill_gradientn(colours=rev(brewer.pal(10,"RdYlGn")))+
scale_x_continuous(breaks=unique( plotdata$Year)) +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
facet_wrap(~Sex,nrow = 2)
## 使用treemap 可视化数据
plotdata <- athletedata%>%
group_by(region,Sex)%>%
summarise(number=n())
treemap(plotdata,index = c("Sex","region"),vSize = "number",
title = "不同性别下每个国家的运动员人数",fontfamily.title = "STKaiti")
## 使用treemap 可视化数据
plotdata <- athletedata%>%
group_by(region,Sex)%>%
summarise(number=n())
## 计算奖牌数量
plotdata2 <- athletedata[!is.na(athletedata$Medal),]%>%
group_by(region,Sex)%>%
summarise(Medalnum=n())
## 合并数据
plotdata3 <- inner_join(plotdata2,plotdata,by=c("region", "Sex"))
treemap(plotdata3,index = c("Sex","region"),vSize = "number",
vColor = "Medalnum",type="value",palette="RdYlGn",
title = "不同性别下每个国家的运动员人数",fontfamily.title = "STKaiti",
title.legend = "奖牌数量",fontfamily.legend="STKaiti")
### 8.关于不同时间不同国家国家参赛人数的热力图
## 热力图
region30 <- athletedata%>%group_by(region)%>%
summarise(number=n())%>%
arrange(desc(number))
region30 <- region30$region[1:30]
plotdata <- athletedata[athletedata$region %in%region30,]%>%
group_by(region,Year)%>%
summarise(number=n())
## 长数据转换为宽数据
plotdata2 <- tidyr::spread(plotdata,key="Year",value="number")
plotdata2[is.na(plotdata2)] <- 0
rownames(plotdata2) <- plotdata2$region
## Warning: Setting row names on a tibble is deprecated.
plotdata2$region <- NULL
plotdata2 <- as.matrix(plotdata2)
d3heatmap(plotdata2,scale = "column", dendrogram = "none",
colors = "Blues")
plotdata <- athletedata%>%
group_by(Year,region,Sex,Season)%>%
summarise(athletenum=n())
## 计算奖牌数量
plotdata2 <- athletedata[!is.na(athletedata$Medal),]%>%
group_by(Year,region,Sex,Season)%>%
summarise(Medalnum=n())
## 合并数据
plotdata3 <- inner_join(plotdata2,plotdata,by=c("Year","region","Sex","Season"))
ggplot(plotdata3,aes(x=athletenum,y=Medalnum,colour=Sex))+
theme_bw(base_family = "STKaiti")+
geom_point(alpha = 0.7)+
facet_wrap(~Season)+
transition_time(Year) +
labs(title = 'Year: {frame_time}', x = "athlete number", y = "Medal number")
## 去除缺失值
plotdata <- athletedata[!is.na(athletedata$Age),]
ggplot(plotdata,aes(fill=Sex))+
theme_bw(base_family = "STKaiti")+
geom_density(aes(Age),alpha = 0.5,n=50,size=0.2)+
scale_fill_brewer(palette="RdYlGn")+
transition_time(Year) +
labs(title = 'Year: {frame_time}', x = "Age", y = "Freq")
## 11.地图可视化奖牌的获取情况
## 计算奖牌数量
plotdata <- athletedata[!is.na(athletedata$Medal),]%>%
group_by(Year,region,Sex,Season,Sport)%>%
summarise(Medalnum=n())
plotdata2 <- plotdata%>%
group_by(region)%>%
summarise(Medalnum=sum(Medalnum))
## plot map
malMap <- joinCountryData2Map(plotdata2, joinCode = "NAME",
nameJoinColumn = "region")
## 133 codes from your data successfully matched countries in the map
## 3 codes from your data failed to match with a country code in the map
## 110 codes from the map weren't represented in your data
mapCountryData(mapToPlot = malMap,nameColumnToPlot = "Medalnum",
catMethod = "fixedWidth",numCats = 20,
oceanCol = "steelblue1",missingCountryCol = "white",
mapTitle = "Medal number by regin",
aspect = "variable")
## Warning in plot.window(xlim = xlim, ylim = ylim, asp = aspect): 强制改变过
## 程中产生了NA
## 计算奖牌数量
plotdata <- athletedata%>%
group_by(Sex,Sport)%>%
summarise(athletenum=n())%>%
arrange(desc(athletenum))
Sport <- plotdata%>%group_by(Sport)%>%
summarise(athletenum=sum(athletenum))%>%
arrange(desc(athletenum))
Sport20 <- Sport$Sport[1:20]
## 每种运动中运动员的数量
ggplot(plotdata,aes(x=reorder(Sport,athletenum),y=athletenum,fill=Sex))+
theme_bw(base_family = "STKaiti")+
geom_bar(stat = "identity",position = "stack")+
coord_flip()+
labs(x="Sport",y="Athlete number")
### 13.动态可视化运动的类型的人数
## 计算奖牌数量
plotdata <- athletedata%>%
group_by(Year,Sex,Sport)%>%
summarise(athletenum=n())
## 每种运动中运动员的数量
ggplot(plotdata,aes(x=reorder(Sport,athletenum),y=athletenum,fill=Sex))+
theme_bw(base_family = "STKaiti")+
geom_bar(stat = "identity",position = "stack")+
coord_flip()+
transition_time(Year) +
labs(title = 'Year: {frame_time}', x="Sport",y="Athlete number")
## 整理数据
index <- (athletedata$Sport %in% Sport20[1:5]) &(athletedata$region %in% region30[1:5])
plotdata <- athletedata[index,]
plotdata$Age <- cut_number(plotdata$Age,4)
plotdata2 <- plotdata%>%
group_by(Year,Age,Sex,Sport,region,Season)%>%
summarise(athletenum=n())
ggplot(plotdata2,
aes(axis1 = Season, axis2 = region, axis3 =Sport,axis4=Age,
y = athletenum)) +
theme_bw()+
geom_alluvium(aes(fill = Sex)) +
scale_x_discrete(limits = c("Season","Sport", "region","Age"))+
geom_stratum() + geom_text(stat = "stratum", label.strata = TRUE)
## 整理数据
index <- (athletedata$Sport %in% Sport20[1:20]) &(athletedata$region %in% region30[1:30])
plotdata <- athletedata[index,]
plotdata2 <- plotdata%>%
group_by(Year,Sport,region,Sex)%>%
summarise(athletenum=n())
ggplot(data=plotdata2, aes(x=region,y=Sport)) +
theme_bw(base_family = "STKaiti") +
geom_tile(aes(fill = athletenum),colour = "white")+
scale_fill_gradientn(colours=rev(brewer.pal(10,"RdYlGn")))+
theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
facet_wrap(~Sex,nrow = 2)+
transition_time(Year) +
labs(title = 'Year: {frame_time}')
## 16.可视化每种热门运动的运动员年龄情况
index <- ((athletedata$Sport %in% Sport20[1:20])&(!is.na(athletedata$Age)))
plotdata <- athletedata[index,]
ggplot(plotdata,aes(x=Age,y=Sport,fill=..x..))+
geom_density_ridges_gradient()+
facet_wrap(~Sex)+
labs(title="Age in sport")+
theme(plot.title = element_text(hjust = 0.5))
## Picking joint bandwidth of 0.986
## Picking joint bandwidth of 0.73
## 17.可视化每种热门运动的运动员年龄情况
index <- (athletedata$Sport %in% Sport20[1:10]&(!is.na(athletedata$Age)))
plotdata <- athletedata[index,]
ggplot(plotdata,aes(x=Age,y=Sport,fill=..x..))+
theme_bw()+
geom_density_ridges_gradient()+
facet_wrap(~Sex)+
labs(title="Age in sport")+
theme(plot.title = element_text(hjust = 0.5))+
transition_time(Year) +
labs(title = 'Year: {frame_time}')
## Picking joint bandwidth of 1.38
## Picking joint bandwidth of 1.26
## 18.可视化每个地区每年奖牌的获取情况
index <- (athletedata$region %in% region30[1:20]&(!is.na(athletedata$Medal)))
plotdata <- athletedata[index,]
plotdata2 <- plotdata%>%group_by(Year,region,Medal)%>%
summarise(Medalnum = n())
ggplot(plotdata2,aes(x=region,y=Medalnum,fill=Medal))+
theme_bw()+
geom_bar(stat = "identity",position = "stack")+
theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
scale_fill_brewer(palette="RdYlGn")+
transition_time(Year) +
labs(title = 'Year: {frame_time}')
index <- ((athletedata$region%in%region30[1:15])&(!is.na(athletedata$Height))&(!is.na(athletedata$Weight)))
plotdata <- athletedata[index,]
ggplot(plotdata,aes(x=Weight,y=Height,colour=Sex))+
theme_bw()+
geom_point(alpha=0.5)+
geom_smooth(method="lm")+
theme(plot.title = element_text(hjust = 0.5))+
facet_wrap(~region,scales = "fixed")+
transition_time(Year) +
labs(title = 'Year: {frame_time}')
## Warning in qt((1 - level)/2, df): 产生了NaNs
## Warning in qt((1 - level)/2, df): 产生了NaNs
## Warning in qt((1 - level)/2, df): 产生了NaNs
## Warning in qt((1 - level)/2, df): 产生了NaNs
## Warning in qt((1 - level)/2, df): 产生了NaNs
## 查看不同季节举办的的奥运会运动员人数变化
region6 <- c("USA","Germany","France" ,"UK","Russia","China")
index <- ((athletedata$region %in% region6)&(!is.na(athletedata$Medal))&(athletedata$Season=="Summer"))
plotdata <- athletedata[index,]
plotdata2 <- plotdata%>%group_by(Year,region)%>%
summarise(Medalnum=n())
ggplot(plotdata2,aes(x=Year,y=Medalnum))+
theme_bw(base_family = "STKaiti")+
geom_line()+
geom_chernoff(fill = 'goldenrod1')+
facet_wrap(~region,ncol = 2)+
labs(x="举办时间",y="奖牌数")