Import data

# Read the Excel workbook that holds the raw data set.
data <- read_excel("../00_data/myData.xlsx")

# Working copy with overall_score coerced to numeric
# (presumably it was read in as text -- confirm against the workbook).
data_clean <- mutate(data, overall_score = as.numeric(overall_score))

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of rows per region.
ggplot(data, aes(x = region)) +
  geom_bar()

# Tabulate the number of rows in each region.
count(data, region)
## # A tibble: 7 × 2
##   region                         n
##   <chr>                      <int>
## 1 East Asia & Pacific          740
## 2 Europe & Central Asia       1160
## 3 Latin America & Caribbean    840
## 4 Middle East & North Africa   420
## 5 North America                 60
## 6 South Asia                   160
## 7 Sub-Saharan Africa           960
# Frequency polygons of data_use_score, one coloured line per region.
# The bin width is set explicitly instead of relying on the stat_bin()
# default of 30 bins, which ggplot2 flags with a "Pick better value" message.
ggplot(data = data, mapping = aes(x = data_use_score, colour = region)) +
  geom_freqpoly(binwidth = 10)

# Histogram of data_use_score across all rows.
# binwidth is chosen explicitly rather than accepting the stat_bin() default
# of 30 bins; boundary = 0 aligns the bin edges on multiples of 10.
ggplot(data = data) +
  geom_histogram(
    mapping = aes(x = data_use_score),
    binwidth = 10,
    boundary = 0
  )

Typical values

Unusual values

# Histogram of population in bins of one million. The y-axis is zoomed to
# counts of 0-10 so that sparsely populated bins remain visible;
# coord_cartesian() zooms without discarding any observations.
ggplot(data, aes(x = population)) +
  geom_histogram(binwidth = 1e6) +
  coord_cartesian(ylim = c(0, 10))

# Histogram of 2023 population, rescaled to millions.
# The axis label states the unit, since the values no longer match the raw
# `population` column, and the bin count is given explicitly (same 30 bins
# that stat_bin() would otherwise pick with a message).
data %>%
  filter(year == 2023) %>%
  mutate(population = population / 1000000) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = population), bins = 30) +
  labs(x = "Population (millions)")

Missing Values

# Keep data_use_score only where it lies within 0-100; any other value
# becomes NA so the row is kept but the score is treated as missing.
data2 <- mutate(
  data,
  data_use_score = ifelse(
    data_use_score >= 0 & data_use_score <= 100,
    data_use_score,
    NA
  )
)

# Distribution of the cleaned score within each region.
ggplot(data2, aes(region, data_use_score)) +
  geom_boxplot()

Covariation

A categorical and continuous variable

# Frequency polygons of the cleaned score in 10-point bins, one line per region.
ggplot(data2, aes(x = data_use_score, colour = region)) +
  geom_freqpoly(binwidth = 10)

Two categorical variables

# Heat map of row counts by region and score band.
# data_use_score is continuous, so counting its raw values yields one tile per
# distinct score. Binning it into 10-point bands first gives a genuine
# categorical-by-categorical table whose counts are comparable across tiles.
data2 %>%
  filter(!is.na(data_use_score)) %>%
  mutate(score_band = cut_width(data_use_score, width = 10, boundary = 0)) %>%
  count(region, score_band) %>%
  ggplot(aes(x = region, y = score_band)) +
    geom_tile(aes(fill = n))

Two continuous variables

# Coerce year and data_services_score to numeric, then keep 2015 onward.
final_data <- data_clean %>%
  mutate(across(c(year, data_services_score), as.numeric)) %>%
  filter(year >= 2015)

# Scatter plot of data_services_score (x) against year (y); points are
# semi-transparent so overlapping observations show up as darker areas.
ggplot(final_data, aes(data_services_score, year)) +
  geom_point(alpha = 0.3)

Patterns and models