#library(tidyverse)
library(dplyr)
#library(purrr)
library(ggplot2)
# Download the medal data from its OSF link into a data frame.
dataset <- read.csv(file = "https://osf.io/download/xq8us/")
# Print a per-column overview of the data (types, missing values, spread).
skimr::skim(dataset)
Name | dataset |
Number of rows | 1344 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 4 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
Host_country | 0 | 1 | 5 | 16 | 0 | 21 | 0 |
Host_city | 0 | 1 | 4 | 19 | 0 | 23 | 0 |
Country_Name | 0 | 1 | 3 | 32 | 0 | 157 | 0 |
Country_Code | 0 | 1 | 0 | 3 | 86 | 156 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Year | 0 | 1 | 1978.96 | 33.48 | 1896 | 1956 | 1988 | 2008 | 2020 | ▂▂▃▃▇ |
Gold | 0 | 1 | 4.07 | 8.45 | 0 | 0 | 1 | 4 | 83 | ▇▁▁▁▁ |
Silver | 0 | 1 | 4.04 | 7.10 | 0 | 0 | 2 | 4 | 78 | ▇▁▁▁▁ |
Bronze | 0 | 1 | 4.39 | 6.84 | 0 | 1 | 2 | 5 | 77 | ▇▁▁▁▁ |
- Let’s get familiar with our data. Create a table that shows the frequency of each country in the dataset.
# Tally how many rows each country contributes to the dataset.
tab1 <- summarise(group_by(dataset, Country_Name), Frequency = n())
# Show five randomly drawn rows of the tally as a preview.
tab1 %>% slice_sample(n = 5)
Country_Name | Frequency |
---|---|
France | 29 |
Switzerland | 28 |
Belarus | 7 |
Croatia | 8 |
Argentina | 20 |
- Next, let’s look at the representation of each country’s medal status. Create a table that shows the number of times each country has won a bronze, silver, and gold medal.
# Add up each of the three medal columns per country.
tab2 <- dataset %>%
  group_by(Country_Name) %>%
  summarise(across(c(Gold, Silver, Bronze), sum))
# Preview five randomly drawn countries.
tab2 %>% slice_sample(n = 5)
Country_Name | Gold | Silver | Bronze |
---|---|---|---|
Sri Lanka | 0 | 1 | 0 |
Lithuania | 6 | 7 | 13 |
Bahrain | 2 | 2 | 0 |
Ivory Coast | 1 | 1 | 2 |
Germany | 202 | 207 | 247 |
- Is there a difference in the number of silver medals a country earns if they are competing in their home country? *Note: please pick any one country you’d like to look at.*
# Total silver medals per country over the given rows, plus each country's
# share of all silver medals in those rows (percent, rounded to 2 decimals).
#
# games: data frame with `Country_Name` and `Silver` columns.
# Returns one row per country with columns Country_Name, Silver, Percent_Win.
summarise_silver <- function(games) {
  games %>%
    group_by(Country_Name) %>%
    summarise(Silver = sum(Silver)) %>%
    mutate(Percent_Win = round(Silver / sum(Silver) * 100, 2))
}

# `home`: rows from Games hosted by the United States.
# `away`: rows from Games hosted by any other country.
# NOTE(review): both tables contain every country present in those rows, not
# only the United States -- confirm this is the intended comparison.
home <- summarise_silver(filter(dataset, Host_country == "United States"))
away <- summarise_silver(filter(dataset, Host_country != "United States"))

# Shapiro-Wilk normality check on the per-country silver totals in `home`.
shapiro.test(home$Silver)
##
## Shapiro-Wilk normality test
##
## data: home$Silver
## W = 0.30633, p-value < 2.2e-16
# Shapiro-Wilk normality check on the per-country silver totals in `away`.
shapiro.test(away$Silver)
##
## Shapiro-Wilk normality test
##
## data: away$Silver
## W = 0.45531, p-value < 2.2e-16
# Histograms of the per-country silver totals in each group.
hist(home$Silver)
hist(away$Silver)
# Two-sample Wilcoxon rank-sum test comparing the silver totals of the two
# groups (presumably chosen because both Shapiro-Wilk tests reject normality).
wilcox1=wilcox.test(home$Silver,away$Silver)
# Yes: the test reports a difference in silver-medal totals between Games
# hosted by the United States and Games hosted elsewhere (p-value = 0.000254).
# NOTE(review): `home` and `away` are per-country totals summed over different
# sets of Games and cover all countries, so this may not isolate a single
# country's home advantage -- confirm before relying on the conclusion.
print(wilcox1)
##
## Wilcoxon rank sum test with continuity correction
##
## data: home$Silver and away$Silver
## W = 5164.5, p-value = 0.000254
## alternative hypothesis: true location shift is not equal to 0
- Create a visualization that shows the United States’ gold medal count over time.
# Keep only the rows recorded for the United States.
USA_visual <- filter(dataset, Country_Name == "United States")

# Line chart of gold medals by year, with a centred title and subtitle.
ggplot(USA_visual, aes(x = Year, y = Gold)) +
  geom_line(lwd = 1, col = "darkred") +
  labs(
    title = "The Roller Coaster of the American Quest for Gold",
    subtitle = "Examining the trajectory of the fight for the ultimate prize",
    x = "Years",
    y = "Number of Gold Medals",
    fill = " "
  ) +
  # theme_bw() must precede theme() so the centring tweaks are not overwritten.
  theme_bw() +
  theme(
    plot.title = element_text(hjust = .5),
    plot.subtitle = element_text(hjust = .5)
  )