Import data
# Load the raw workbook from the project's data folder.
data <- read_excel("../00_data/myData.xlsx")

# Keep a copy in which overall_score is stored as a number; as.numeric()
# turns any entry it cannot parse into NA.
data_clean <- mutate(data, overall_score = as.numeric(overall_score))
Introduction
Questions
Variation
Visualizing distributions
# Bar chart: number of rows per region.
data %>%
  ggplot(aes(x = region)) +
  geom_bar()

# Same counts as the bar chart, as a table.
count(data, region)
## # A tibble: 7 × 2
## region n
## <chr> <int>
## 1 East Asia & Pacific 740
## 2 Europe & Central Asia 1160
## 3 Latin America & Caribbean 840
## 4 Middle East & North Africa 420
## 5 North America 60
## 6 South Asia 160
## 7 Sub-Saharan Africa 960
# Frequency polygons of data_use_score, one coloured line per region.
# No binwidth is given, so stat_bin() falls back to its default of 30 bins.
data %>%
  ggplot() +
  geom_freqpoly(aes(x = data_use_score, colour = region))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Histogram of data_use_score across all rows (default 30 bins).
data %>%
  ggplot(aes(x = data_use_score)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values
Unusual values
# Histogram of population in bins of one million. coord_cartesian() zooms the
# y-axis to 0-10 without dropping data, so bins with only a few rows are visible.
data %>%
  ggplot(aes(x = population)) +
  geom_histogram(binwidth = 1e6) +
  coord_cartesian(ylim = c(0, 10))

# Histogram of population, expressed in millions, restricted to year 2023.
# Rescaling and filtering touch different columns, so their order is immaterial.
data %>%
  mutate(population = population / 1e6) %>%
  filter(year == 2023) %>%
  ggplot(aes(x = population)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing values
# Copy of data in which any data_use_score outside the closed range [0, 100]
# is replaced by NA; in-range scores are kept unchanged.
data2 <- data %>%
  mutate(
    data_use_score = ifelse(
      data_use_score >= 0 & data_use_score <= 100,
      data_use_score,
      NA
    )
  )
# Box plot of the cleaned data_use_score for each region.
data2 %>%
  ggplot(mapping = aes(x = region, y = data_use_score)) +
  geom_boxplot()

Covariation
A categorical and continuous variable
# Frequency polygons of data_use_score per region, using bins 10 units wide.
data2 %>%
  ggplot(aes(x = data_use_score, colour = region)) +
  geom_freqpoly(binwidth = 10)

Two categorical variables
# Tile plot of how many rows fall into each region x score-band cell.
# data_use_score is continuous, so counting its exact values typically yields
# a sparse grid of tiny counts. The score is therefore binned into bands of
# width 10 first, which makes both axes categorical as this section intends.
data2 %>%
  # A missing score cannot be assigned to a band; drop those rows explicitly.
  filter(!is.na(data_use_score)) %>%
  # boundary = 0 aligns the band edges on multiples of 10 (0, 10, 20, ...).
  mutate(score_band = cut_width(data_use_score, width = 10, boundary = 0)) %>%
  count(region, score_band) %>%
  ggplot(aes(x = region, y = score_band)) +
  geom_tile(aes(fill = n)) +
  labs(y = "data_use_score (bands of 10)")

Two continuous variables
# Prepare the scatter-plot data: make sure year and data_services_score are
# numeric, then keep only rows from 2015 onward.
final_data <- data_clean %>%
  mutate(year = as.numeric(year)) %>%
  mutate(data_services_score = as.numeric(data_services_score)) %>%
  filter(year >= 2015)

# Scatter plot of year against data_services_score; alpha = 0.3 makes
# overlapping points visible as darker areas.
final_data %>%
  ggplot(mapping = aes(x = data_services_score, y = year)) +
  geom_point(alpha = 0.3)

Patterns and models