athletes_and_events.csv, noc_regions.csv, and
gdp_pop.csv. Note, that the noc_regions.csv is
the set all NOC regions, while gdp_pop.csv only contains a
snapshot of the current set of countries.library(readr)
ath <- read_csv("data/athletes_and_events.csv")
## Rows: 271116 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl (5): ID, Age, Height, Weight, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gdp <- read_csv("data/gdp_pop.csv")
## Rows: 201 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Code
## dbl (2): Population, GDP per Capita
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
noc <- read_csv("data/noc_regions.csv")
## Rows: 230 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): NOC, region, notes
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#combine all German designations under German and all Russian designation under Russia.
ath$NOC[ath$NOC %in% c("SAA","EUA","GDR","FRG")] <- "GER"
ath$NOC[ath$NOC %in% c("RUS","RU1","URS","EUN",'ROC')] <- "RUS"
temp_df <- merge(ath, noc, by.x = "NOC",
by.y = "NOC", all.x = TRUE, all.y = FALSE)
total_df <- merge(temp_df, noc, by.x = "NOC",
by.y = "NOC", all.x = TRUE, all.y = FALSE)
Please provide (i) one visualization showing an over time comparison and (ii) one visualization in which a total medal count (across all Winter Olympics) is used. Briefly discuss which visualization you recommend to your editor and why.
b- i)How many winter games did each country compete in? The number of winter games each country competed in ranges from 1 to 22, as illustrated in the below data table.
total_df %>%
filter(Season == 'Winter') %>%
group_by(NOC, Year) %>%
summarize(Medal_Count=n()) %>%
distinct()
## `summarise()` has grouped output by 'NOC'. You can override using the `.groups`
## argument.
## # A tibble: 1,021 × 3
## # Groups: NOC [115]
## NOC Year Medal_Count
## <chr> <dbl> <int>
## 1 AHO 1988 3
## 2 AHO 1992 2
## 3 ALB 2006 3
## 4 ALB 2010 2
## 5 ALB 2014 2
## 6 ALG 1992 8
## 7 ALG 2006 3
## 8 ALG 2010 1
## 9 AND 1976 11
## 10 AND 1980 9
## # … with 1,011 more rows
top10 <- total_df %>%
filter(Season == "Winter") %>%
group_by(NOC) %>%
summarise(count = n_distinct(Sport,Event,Sex,Medal)) %>%
arrange(desc(count))%>%
select(NOC) %>%
head(10)
top10_byyear<- total_df %>%
filter(Season == "Winter", Medal %in% c("Bronze","Silver","Gold")
, NOC %in% top10$NOC)%>%
group_by(NOC,Year) %>%
summarise(Medal_Count = n_distinct(Games, Year, Season, Medal, Event, NOC)) %>%
arrange(desc(Medal_Count))
## `summarise()` has grouped output by 'NOC'. You can override using the `.groups`
## argument.
top10 <- total_df %>%
filter(Season == "Winter", Medal %in% c("Bronze","Silver","Gold")) %>%
group_by(NOC) %>%
summarise(count = n_distinct(Sport,Event,Sex,Medal,Year)) %>%
arrange(desc(count))%>%
select(NOC) %>%
head(10)
totals_all_time<- total_df%>%
filter(Season == "Winter", Medal %in% c("Bronze","Silver","Gold")
, NOC %in% top10$NOC)%>%
group_by(NOC,Medal) %>%
summarise(Medal_Count = n_distinct(Games, Year, Season, Medal, Event, NOC)) %>%
arrange(desc(Medal_Count))
## `summarise()` has grouped output by 'NOC'. You can override using the `.groups`
## argument.
plot0 <- ggplot(totals_all_time
, aes(x=reorder(NOC, desc(Medal_Count), sum), y=Medal_Count, fill=factor(Medal, levels=c("Gold", "Silver", "Bronze")))) +
theme_tufte(base_size = 10) +
labs(fill = "Medal") +
labs(x="Country", y="Number of Medals", title="All Time Medals per Country") +
geom_bar(stat="identity") +
scale_fill_manual(values=c('#D6AF36','#A7A7AD','#A77044'))+
theme(plot.title=element_text(hjust = 0.5))
plot0
temp<- total_df %>%
filter(Season == 'Winter') %>%
group_by(NOC, Year) %>%
summarize(n=n()) %>%
distinct()
## `summarise()` has grouped output by 'NOC'. You can override using the `.groups`
## argument.
temp1 <- temp[order(-temp$n),] %>%
group_by(NOC) %>%
summarize(n = sum(n))
temp1 <- temp1[order(-temp1$n),]
temp2 <- temp1[1:10,1]
top10_countries <- temp2[['NOC']]
plot2 <- subset(temp, NOC %in% top10_countries) %>%
ggplot( aes( x= Year, y= n, group = NOC, color = NOC)) +
geom_line() +
labs(y="Number of Medals", x="Year") +
ggtitle("Number of Medals per Year") +
theme_bw()
# medals_count_year_top10 <- temp %>% filter(NOC %in% head(temp, 10)$NOC)
# plot <- ggplot(medals_count_year_top10, aes(x = Year, y = n, group = NOC, color = NOC)) +
# geom_line() +
# labs(y="Number of Medals", x="Year") +
# ggtitle("Number of Medals per Year") +
# theme_bw()
#
plot2
plot1<-ggplot(top10_byyear,aes(Year,Medal_Count))+
#theme_tufte(base_size = 10)+
geom_point(aes(color=NOC))+
geom_line(aes(color=NOC))+
scale_color_brewer(palette="Spectral")+
#facet_grid(rows = vars(Country))+
#facet_wrap(~Country, ncol=3)+
labs(x="Year", y="Number of Medals", title="Number of Medals per Year")+
theme(legend.position="bottom")+
theme(plot.title=element_text(hjust = 0.5))
plot1
#count_function:lets you quickly count the unique values of one or more variables: df %>% count(a, b) is roughly equivalent to df %>% group_by(a, b) %>% summarise(n = n()). count() is paired with tally(), a lower-level helper that is equivalent to df %>% summarise(n = n()).
is.winter <- total_df
topcountry.game<-is.winter %>%
group_by(NOC) %>%
select(Year) %>%
unique() %>%
summarize(count=n()) %>%
arrange(desc(count),NOC) %>%
mutate(rank =row_number())
## Adding missing grouping variables: `NOC`
How many medals of each type did each country win?
#how many medals of each type the country won
has.medal <- is.winter %>% drop_na(Medal)
#gold
topcountry.gold <- has.medal%>%
filter(Medal == "Gold") %>%
group_by(NOC) %>%
summarize(count=n()) %>%
arrange(desc(count)) %>%
mutate(rank = row_number())
plot_gold<- head(topcountry.gold,25) %>% top_n(10) %>%
ggplot(aes(x=reorder(NOC, -count), y=count, fill=NOC)) +
geom_bar(stat="identity", fill='#d4af37')+
theme_bw()+
theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
labs(x="Name Of Countries", y="Medals Count", title="Number Of Gold Medals Won")+
theme(plot.title=element_text(hjust=0.5, size=8))+
theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
guides(fill="none")
## Selecting by rank
#silver
topcountry.silver <- has.medal%>%
filter(Medal == "Silver") %>%
group_by(NOC) %>%
summarize(count=n()) %>%
arrange(desc(count)) %>%
mutate(rank = row_number())
plot_silver<-head(topcountry.silver,25) %>% top_n(10) %>%
ggplot(aes(x=reorder(NOC, -count), y=count,fill=NOC),fill=NOC) +
geom_bar(stat="identity", fill='#C0C0C0')+
theme_bw()+
theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
labs(x="Name Of Countries", y="Medals Count", title="Number Of Silver Medals Won")+
theme(plot.title=element_text(hjust=0.5, size=8))+
theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
guides(fill="none")
## Selecting by rank
#bronze
topcountry.bronze <- has.medal%>%
filter(Medal == "Bronze") %>%
group_by(NOC) %>%
summarize(count=n()) %>%
arrange(desc(count)) %>%
mutate(rank = row_number())
plot_bronze<-head(topcountry.bronze,25) %>% top_n(10) %>%
ggplot(aes(x=reorder(NOC, -count), y=count, fill=NOC))+
geom_bar(stat="identity", fill='#CD7F32')+
theme_bw()+
theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
labs(x="Name Of Countries", y="Medals Count", title="Number Of Bronze Medals Won")+
theme(plot.title=element_text(hjust=0.5, size=8))+
theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
guides(fill="none")
## Selecting by rank
print(plot_gold)
print(plot_silver)
print(plot_bronze)
There are different ways to calculate “success”. Consider the
following variants and choose one (and make sure your choice is clear in
the visualization):
- Just consider gold medals.
- Simply add up the number of medals of different types.
- Create an index in which medals are valued differently. (gold=3,
silver=2, bronze=1).
- A reasonable other way that you prefer.
Now, adjust the ranking of medal success by (a) GDP per capita and (b) population. You have now three rankings: unadjusted ranking, adjusted by GDP per capita, and adjusted by population.
Visualize how these rankings differ. Try to highlight a specific pattern (e.g. “South Korea – specialization reaps benefits” or “The superpowers losing their grip”).
ath = merge(ath, noc, by.x="NOC", by.y="NOC")
ath = merge(ath, gdp, by.x="NOC", by.y="Code")
medals_count = ath[complete.cases(ath[ , 15]),] %>% filter(Season == 'Winter')%>% count(NOC, sort = TRUE)
medals_count <- medals_count[order(medals_count$n, decreasing=TRUE),]
medal_gdp_df = merge(medals_count, gdp, by.x="NOC", by.y="Code")
medal_gdp_df <- merge(medals_count, gdp, by.x = "NOC",
by.y = "Code", all.x = TRUE, all.y = FALSE)
medal_gdp_df <- medal_gdp_df[order(medal_gdp_df$n, decreasing=TRUE),]
pop_new <- medal_gdp_df$n/(medal_gdp_df$Population)
medal_gdp_df_2 <- cbind(medal_gdp_df,pop_new)
medal_gdp_plot <-medal_gdp_df_2 %>% top_n(10) %>%
ggplot(aes(x=reorder(NOC, -pop_new) , y=pop_new , fill=Country))+
geom_bar(stat="identity",)+ scale_fill_hue(l=40)+ labs(x="Name Of Countries", y="Medals Count", title="Number Of Medals Won Adjusted per Population")+
theme_bw()+
theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
theme(plot.title=element_text(hjust=0.5, size=15))+
theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
guides(fill="none")
## Selecting by pop_new
medal_gdp_plot
medal_gdp_df <- na.omit(medal_gdp_df)
gdp_new <- medal_gdp_df$n/(medal_gdp_df$'GDP per Capita')
#adjusted by GDP
medal_gdp_df_2 <- cbind(medal_gdp_df,gdp_new)
medal_gdp_plot <-medal_gdp_df_2 %>% top_n(10) %>%
ggplot(aes(x=reorder(NOC, -gdp_new) , y=gdp_new , fill=Country))+
geom_bar(stat="identity",)+ scale_fill_hue(l=40)+ labs(x="Name Of Countries", y="Medals Count", title="Number Of Medals Won Adjusted per GDP per Capita")+
theme_bw()+
theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
theme(plot.title=element_text(hjust=0.5, size=15))+
theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
guides(fill="none")
## Selecting by gdp_new
medal_gdp_plot
Until the 2014 Sochi Winter Olympics (our data for Winter Olympics end here), there were 19 host cities. Calculate whether the host nation had an advantage. That is calculate whether the host country did win more medals when the Winter Olympics was in their country compared to other times.
Note, that the 19 host cities are noted in the data but not the countries they are located in. This happens commonly and often Wikipedia has the kind of additional data you want for the task. To save you some time, here is a quick way to get this kind of table from Wikipedia into R.
Provide a visualization of the host country advantage (or absence thereof).
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
winter_hosts<-hosts %>% filter(Winter != "", Year<='2016') %>%
rename(Host_City = City) %>%
rename(Host_Country = Country)%>%
select(Host_City, Host_Country, Year)
athletes_events <- read.csv("./data/athletes_and_events.csv")
gdp_pop <- read.csv("./data/gdp_pop.csv")
noc_regions <- read.csv("./data/noc_regions.csv")
#recoding: all russian teams = Russia, all german teams = Germany
athletes_events$NOC[athletes_events$NOC == "URS"] <- "RUS"
athletes_events$NOC[athletes_events$NOC == "FRG"] <- "GER"
athletes_events$NOC[athletes_events$NOC == "GDR"] <- "GER"
athletes_events$NOC[athletes_events$NOC == "EUA"] <- "GER"
#removing duplicate medals for team-events
athletes <- distinct(athletes_events, Sex, Team, NOC, Games, Year, Season, City, Sport, Event, Medal)
merge1 <- left_join(athletes, noc_regions)
## Joining, by = "NOC"
dataset <- left_join(merge1, gdp_pop, by=c("NOC" = "Code"))
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
hosts <- hosts %>% filter(Winter != "") %>%
select(City, Country, Year)
hosts <- select(hosts, Year, Country, City)
hosts <- rename(hosts, HostCountry = Country)
hosts <- rename(hosts, HostCity = City)
hosts$HostCountry[hosts$HostCountry == "Russia[h]"] <- "Russia"
hosts$HostCity[hosts$HostCity == "Innsbruck[g]"] <- "Innsbruck"
data_prep <- filter(dataset, Season == "Winter")
host_dataset <- left_join(data_prep, hosts, by=c("Year" = "Year"))
avg_medals <- host_dataset %>%
filter(Country %in% hosts$HostCountry, Medal =="Gold", Country != "China") %>%
group_by(Country) %>%
summarise(avgtotal = sum(Medal=="Gold")/length(unique(Year)))
avg_medals_hosting <- host_dataset %>%
filter(Country == HostCountry, Medal=="Gold") %>%
group_by(Country) %>%
summarise(avgtotal = sum(Medal=="Gold")/length(unique(Year)))
#ggplot(host_diff, aes(x=reorder(country, diff), y=diff, fill = country)) +
a <- ggplot(avg_medals_hosting, aes(x=Country, y=avgtotal)) +
geom_point(data=avg_medals_hosting, size=3, color="violetred4") +
geom_point(data=avg_medals, aes(x=Country, y=avgtotal), size=3, color="grey" ) +
scale_colour_manual(values=c('Average Gold Medals Earned (Total)'='blue', 'Average Gold Medals Earned (Hosting)'='red')) +
labs(x = "Country",
y = "Average Gold Medals",
title = "Winter Olympics Host Country Advantage (Gold Medals)",caption="Sources: International Olympic Committee, Wikipedia") +
theme_classic()
a
athletes<-read.csv("data/athletes_and_events.csv")
gdp_pop<-read.csv("data/gdp_pop.csv")
noc_regions<-read.csv("data/noc_regions.csv")
athletes$NOC[athletes$NOC %in% c("SAA","EUA","GDR","FRG")] <- "GER"
athletes$NOC[athletes$NOC %in% c("RUS","RU1","URS","EUN",'ROC')] <- "RUS"
colnames(gdp_pop)[colnames(gdp_pop) == 'Code'] <- 'NOC'
all_data <- athletes %>%
left_join(noc_regions, by = 'NOC') %>%
left_join(gdp_pop,by='NOC')%>%
filter(Season=="Winter")
top_athletes <-all_data %>%
filter(Medal %in% c("Gold",'Silver','Bronze'))%>%
group_by(ID, Name, Sex) %>%
summarise(Medal_Count = n_distinct(Sport,Event,Sex,Medal,Year)) %>%
arrange(desc(Medal_Count))%>%
head(10)
## `summarise()` has grouped output by 'ID', 'Name'. You can override using the
## `.groups` argument.
ggplot(data=top_athletes
, aes(x=Medal_Count, y=reorder(reorder(Name, Sex), Medal_Count, sum), fill=factor(Sex))) +
theme_minimal()+
labs(fill = "Sex") +
labs(x="Medal Count", y="Athlete", title="Top 10 Athletes with the Most Medal Count") +
geom_bar(stat="identity") +
scale_fill_manual(values=c('#FF6666',' dark blue'))+
theme(plot.title=element_text(hjust = 0.5))
Surprisingly, the athletes with the most medals count are female.
athlete_dimensions <- all_data %>%
filter(Medal %in% c("Gold",'Silver','Bronze')) %>%
filter(!is.na(Height)) %>%
filter(!is.na(Weight)) %>%
distinct(Name, Sport,Sex,Height, Weight)
ave_dimensions<- athlete_dimensions %>%
group_by(Sport,Sex) %>%
summarise(Avg_Height = mean(Height)
, Ave_Weight = mean(Weight))
## `summarise()` has grouped output by 'Sport'. You can override using the
## `.groups` argument.
ggplot(data=ave_dimensions
, aes(x=Avg_Height, y=Ave_Weight, color =Sex)) +
theme_minimal()+
labs(x="Average Height(cm)", y="Average Weight", title="Average Height and Weight of Medal Winners per Sport") +
geom_point() +
theme(plot.title=element_text(hjust = 0.5))+
geom_text_repel(aes(label = Sport),
color = "gray20", size = 1.9)
Choose 2 of the plots you created above and add interactivity. One of
the plots needs to be written in plotly rather than just
using the ggplotly automation. Briefly describe to the
editor why interactivity in these visualization is particularly helpful
for a reader.
ggplotly(plot2)
fig = plot_ly(ave_dimensions, x=~Avg_Height, y=~Ave_Weight
, color=~Sex,colors = c("hotpink", "royalblue")
, type = "scatter", mode='markers'
, text = ~paste('Sport: ', Sport)) %>% layout(title = 'Average Height and Weight of Medal Winners per Sport')
fig
Prepare a selected data set and add a datatable to the
output. Make sure the columns are clearly labelled. Select the
appropriate options for the data table (e.g. search bar, sorting, column
filters etc.). Suggest to the editor which kind of information you would
like to provide in a data table in the online version of the article and
why.
table_data <- total_df %>%
filter(Season == "Winter", Medal %in% c("Gold",'Silver','Bronze')) %>%
group_by(Team ,Medal) %>%
summarise(Medal_Count = n_distinct(Year, Season, Medal, Event, Team),
Athlete_Count = n_distinct(Name)
) %>%
arrange((Team))
## `summarise()` has grouped output by 'Team'. You can override using the
## `.groups` argument.
datatable(totals_all_time, rownames = FALSE,
filter = list(position = "top"),
options = list(language = list(sSearch = "Filter:")))