Loading libraries and getting the data
library(tidyverse)
library(ggthemes)
# Download the FIFA audience CSV from the TidyTuesday repository and store
# the confederation column as a factor so summary() tabulates its levels.
data <- read.csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-06-12/week11_fifa_audience.csv"
) %>%
  mutate(confederation = factor(confederation))

# Per-column overview of the loaded data.
summary(data)
## X country confederation population_share
## Min. : 1.0 Length:191 AFC :43 Min. : 0.0000
## 1st Qu.: 48.5 Class :character CAF :50 1st Qu.: 0.0000
## Median : 96.0 Mode :character CONCACAF:30 Median : 0.1000
## Mean : 96.0 CONMEBOL:10 Mean : 0.5225
## 3rd Qu.:143.5 OFC :12 3rd Qu.: 0.3500
## Max. :191.0 UEFA :46 Max. :19.5000
## tv_audience_share gdp_weighted_share
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.100 Median : 0.0000
## Mean : 0.523 Mean : 0.5204
## 3rd Qu.: 0.300 3rd Qu.: 0.3000
## Max. :14.800 Max. :11.3000
Summarizing the data based on gdp_weighted_share and confederation
# Bar chart of the total GDP-weighted share held by each confederation.
# The previous version sorted by gdp_weighted_share and then dropped that
# column before plotting, so geom_bar() only counted the number of countries
# per confederation and the GDP-weighted share never reached the plot.
# Here the share is summed per confederation and drawn with geom_col(), which
# uses the supplied y value as the bar height.
data %>%
  group_by(confederation) %>%
  summarise(gdp_weighted_share = sum(gdp_weighted_share)) %>%
  ggplot(aes(confederation, gdp_weighted_share, fill = confederation)) +
  geom_col()

Finding which countries have the highest and lowest viewership
# Frequency table: how many countries report each distinct TV audience share.
data %>%
  arrange(tv_audience_share) %>%
  group_by(tv_audience_share) %>%
  count()
## # A tibble: 29 x 2
## # Groups: tv_audience_share [29]
## tv_audience_share n
## <dbl> <int>
## 1 0 67
## 2 0.1 46
## 3 0.2 19
## 4 0.3 15
## 5 0.4 6
## 6 0.5 7
## 7 0.6 2
## 8 0.7 3
## 9 0.8 2
## 10 0.9 1
## # ... with 19 more rows
# Countries listed from the largest to the smallest TV audience share; the
# result stays grouped by tv_audience_share.
data %>%
  arrange(desc(tv_audience_share)) %>%
  group_by(tv_audience_share)
## # A tibble: 191 x 6
## # Groups: tv_audience_share [29]
## X country confederation population_share tv_audience_sha~
## <int> <chr> <fct> <dbl> <dbl>
## 1 3 China AFC 19.5 14.8
## 2 5 Brazil CONMEBOL 2.8 7.1
## 3 12 Indone~ AFC 3.5 6.7
## 4 2 Japan AFC 1.9 4.9
## 5 1 United~ CONCACAF 4.5 4.3
## 6 13 Mexico CONCACAF 1.7 3.2
## 7 9 Russia UEFA 2.1 3.1
## 8 4 Germany UEFA 1.2 2.9
## 9 27 Nigeria CAF 2.3 2.6
## 10 30 Vietnam AFC 1.3 2.6
## # ... with 181 more rows, and 1 more variable: gdp_weighted_share <dbl>
Finding which countries have the highest and lowest population
# Frequency table: how many countries report each distinct population share.
data %>%
  arrange(population_share) %>%
  group_by(population_share) %>%
  count()
## # A tibble: 26 x 2
## # Groups: population_share [26]
## population_share n
## <dbl> <int>
## 1 0 61
## 2 0.1 49
## 3 0.2 21
## 4 0.3 12
## 5 0.4 9
## 6 0.5 6
## 7 0.6 3
## 8 0.7 6
## 9 0.8 1
## 10 0.9 4
## # ... with 16 more rows
# Countries listed from the largest to the smallest population share; the
# result stays grouped by population_share.
data %>%
  arrange(desc(population_share)) %>%
  group_by(population_share)
## # A tibble: 191 x 6
## # Groups: population_share [26]
## X country confederation population_share tv_audience_sha~
## <int> <chr> <fct> <dbl> <dbl>
## 1 3 China AFC 19.5 14.8
## 2 39 India AFC 17.6 2
## 3 1 United~ CONCACAF 4.5 4.3
## 4 12 Indone~ AFC 3.5 6.7
## 5 5 Brazil CONMEBOL 2.8 7.1
## 6 75 Pakist~ AFC 2.5 0.4
## 7 27 Nigeria CAF 2.3 2.6
## 8 114 Bangla~ AFC 2.2 0.1
## 9 9 Russia UEFA 2.1 3.1
## 10 2 Japan AFC 1.9 4.9
## # ... with 181 more rows, and 1 more variable: gdp_weighted_share <dbl>
Finding which countries have the highest and lowest GDP share
# Frequency table: how many countries report each distinct GDP-weighted share.
data %>%
  arrange(gdp_weighted_share) %>%
  group_by(gdp_weighted_share) %>%
  count()
## # A tibble: 28 x 2
## # Groups: gdp_weighted_share [28]
## gdp_weighted_share n
## <dbl> <int>
## 1 0 101
## 2 0.1 30
## 3 0.2 6
## 4 0.3 7
## 5 0.4 5
## 6 0.5 7
## 7 0.6 6
## 8 0.7 5
## 9 0.8 1
## 10 0.9 2
## # ... with 18 more rows
# Countries listed from the largest to the smallest GDP-weighted share; the
# result stays grouped by gdp_weighted_share.
data %>%
  arrange(desc(gdp_weighted_share)) %>%
  group_by(gdp_weighted_share)
## # A tibble: 191 x 6
## # Groups: gdp_weighted_share [28]
## X country confederation population_share tv_audience_sha~
## <int> <chr> <fct> <dbl> <dbl>
## 1 1 United~ CONCACAF 4.5 4.3
## 2 2 Japan AFC 1.9 4.9
## 3 3 China AFC 19.5 14.8
## 4 4 Germany UEFA 1.2 2.9
## 5 5 Brazil CONMEBOL 2.8 7.1
## 6 6 United~ UEFA 0.9 2.1
## 7 7 Italy UEFA 0.9 2.1
## 8 8 France UEFA 0.9 2
## 9 9 Russia UEFA 2.1 3.1
## 10 10 Spain UEFA 0.7 1.8
## # ... with 181 more rows, and 1 more variable: gdp_weighted_share <dbl>
# Frequency polygons of gdp_weighted_share on a density scale, one coloured
# line per confederation, using bins of width 1 and the Tufte theme.
ggplot(data, aes(x = gdp_weighted_share, y = ..density..)) +
  geom_freqpoly(aes(colour = confederation), binwidth = 1) +
  theme_tufte() +
  ggtitle("Distribution of GDP shares based on confederations")

Finding the relationships between population share, TV audience share, and GDP-weighted share
# Correlation between population share and TV audience share (cor() default
# method).
with(data, cor(population_share, tv_audience_share))
## [1] 0.7313239
# Scatter plot of TV audience share against population share, one point per
# country, coloured by confederation.
# The colour mapping has to sit inside aes(): previously it was passed as a
# separate argument to ggplot(), where it is not treated as an aesthetic, so
# the points were never coloured by confederation.
ggplot(
  data,
  aes(x = population_share, y = tv_audience_share, color = confederation)
) +
  geom_point()

# Correlation between population share and GDP-weighted share (cor() default
# method).
with(data, cor(population_share, gdp_weighted_share))
## [1] 0.4472681
Countries that contribute the most to the values collected and other summaries
# Keep only the countries that are above the overall mean on all three
# measures at once: population, GDP-weighted and TV audience share.
# Comma-separated filter() conditions are combined with a logical AND.
majorCountries <- data %>%
  filter(
    population_share > mean(population_share),
    gdp_weighted_share > mean(gdp_weighted_share),
    tv_audience_share > mean(tv_audience_share)
  )

# Show the selection as a tibble for compact printing.
majorCountries %>%
  as_tibble()
## # A tibble: 22 x 6
## X country confederation population_share tv_audience_sha~
## <int> <chr> <fct> <dbl> <dbl>
## 1 1 United~ CONCACAF 4.5 4.3
## 2 2 Japan AFC 1.9 4.9
## 3 3 China AFC 19.5 14.8
## 4 4 Germany UEFA 1.2 2.9
## 5 5 Brazil CONMEBOL 2.8 7.1
## 6 6 United~ UEFA 0.9 2.1
## 7 7 Italy UEFA 0.9 2.1
## 8 8 France UEFA 0.9 2
## 9 9 Russia UEFA 2.1 3.1
## 10 10 Spain UEFA 0.7 1.8
## # ... with 12 more rows, and 1 more variable: gdp_weighted_share <dbl>
# Average population, TV audience and GDP-weighted share of the selected
# major countries, computed per confederation.
majorCountries %>%
  group_by(confederation) %>%
  summarise(
    avgPopulation = mean(population_share),
    avgTV = mean(tv_audience_share),
    avgGDP = mean(gdp_weighted_share)
  )
## # A tibble: 5 x 4
## confederation avgPopulation avgTV avgGDP
## <fct> <dbl> <dbl> <dbl>
## 1 AFC 4.14 4.84 3.59
## 2 CAF 1.5 1.95 0.75
## 3 CONCACAF 3.1 3.75 6.95
## 4 CONMEBOL 1.37 3.4 2.63
## 5 UEFA 1.05 2.19 3.55
# geom_col(position = position_dodge2(width = 0.9, preserve = "single"))