1 Question 1

Import data and organize it:

# Read the GDP-per-capita extract and reduce it to the snapshot years used by
# the Question 1 panels. max(TIME_PERIOD) is evaluated inside group_by(), so
# each REF_AREA contributes its own most recent year in addition to 1960,
# 1980 and 2000.
gdp_per_capita <- read_csv("q1gdp.csv") %>%
  select(
    REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL,
    TIME_PERIOD, UNIT_MEASURE, OBS_VALUE
  ) %>%
  group_by(REF_AREA) %>%
  filter(TIME_PERIOD %in% c(1960, 1980, 2000, max(TIME_PERIOD))) %>%
  ungroup()

# Country-level attributes taken from the attribute table of countriesCoarse.
country_meta <- select(
  countriesCoarse@data,
  NAME, ISO3, TYPE, POP_EST, LAT, LON, GDP_MD_EST, continent, REGION, LLDC
)

# Attach the attributes to the GDP rows; REF_AREA is matched against ISO3.
merged_data <- merge(
  x = gdp_per_capita,
  y = country_meta,
  by.x = "REF_AREA",
  by.y = "ISO3"
)

Graph:

formula1 <- y ~ x

# Build one Question 1 panel: GDP per capita (log2 x axis) against latitude,
# points coloured by continent, with a black linear fit and its R^2 label.
#   year_data  - rows of merged_data to plot
#   plot_title - panel title
plot_gdp_vs_latitude <- function(year_data, plot_title) {
  ggplot(data = year_data, aes(x = OBS_VALUE, y = LAT)) +
    geom_point(aes(color = continent)) +
    geom_smooth(method = "lm", color = "black", formula = formula1) +
    # after_stat() replaces the deprecated ..rr.label.. notation.
    stat_poly_eq(aes(label = after_stat(rr.label)),
                 formula = formula1, parse = TRUE, size = 5) +
    scale_x_continuous(trans = "log2") +
    xlab("GDP Per Capita") +
    ylab("Latitude") +
    labs(title = plot_title) +
    ylim(-70, 70)
}

p1 <- plot_gdp_vs_latitude(filter(merged_data, TIME_PERIOD == 1960), "1960")
p2 <- plot_gdp_vs_latitude(filter(merged_data, TIME_PERIOD == 1980), "1980")
p3 <- plot_gdp_vs_latitude(filter(merged_data, TIME_PERIOD == 2000), "2000")

# "Latest Year" means each country's own most recent observation, matching the
# grouped max(TIME_PERIOD) used when gdp_per_capita was filtered. An ungrouped
# max() here would keep only countries reporting in the single global latest
# year and silently drop the rest.
latest_by_country <- merged_data %>%
  group_by(REF_AREA) %>%
  filter(TIME_PERIOD == max(TIME_PERIOD)) %>%
  ungroup()

p4 <- plot_gdp_vs_latitude(latest_by_country, "Latest Year")

grid.arrange(p1, p2, p3, p4, ncol = 2)

2 Question 2

Import data and organize it:

# Fixed-broadband subscriptions: read the extract, keep the identifying columns
# and the observed value, then attach country attributes (REF_AREA == ISO3).
internet <- select(
  read_csv("fixedbroadband100.csv"),
  REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL, TIME_PERIOD, OBS_VALUE
)

# Note: this overwrites the merged_data object built for Question 1.
merged_data <- merge(
  x = internet,
  y = country_meta,
  by.x = "REF_AREA",
  by.y = "ISO3"
)

# African rows only (REGION == "Africa"), reduced to each country's most
# recent TIME_PERIOD.
# NOTE(review): the boxplot and t-test that follow read merged_data (all
# regions, all years), not this subset -- confirm which data set is intended.
internet_latest_africa <- ungroup(
  filter(
    group_by(filter(merged_data, REGION == "Africa"), REF_AREA),
    TIME_PERIOD == max(TIME_PERIOD)
  )
)

Graph:

# Distribution of broadband subscriptions by LLDC status. coord_cartesian()
# zooms the y axis to 0-50 without removing observations from the box stats.
ggplot(data = merged_data, mapping = aes(LLDC, OBS_VALUE)) +
  geom_boxplot() +
  labs(y = "Fixed broadband subscriptions (per 100 people)") +
  coord_cartesian(ylim = c(0, 50))

T-Test for statistical significance

# T-Test

# Two-sample t-test of OBS_VALUE between the LLDC groups in merged_data.
t_test <- t.test(formula = OBS_VALUE ~ LLDC, data = merged_data)
print(t_test)
## 
##  Welch Two Sample t-test
## 
## data:  OBS_VALUE by LLDC
## t = -28.126, df = 1877.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group LLDC and group other is not equal to 0
## 95 percent confidence interval:
##  -10.048756  -8.738684
## sample estimates:
##  mean in group LLDC mean in group other 
##            3.018114           12.411834

Generate a year-by-year table of average broadband prevalence in African LLDCs vs. non-LLDCs

# Mean broadband subscriptions per year and LLDC status, African rows only.
# na.rm = TRUE keeps a single missing OBS_VALUE from turning a whole year's
# mean into NA; .groups = "drop" returns an ungrouped table instead of one
# still grouped by TIME_PERIOD.
internet_over_time <- merged_data %>%
  filter(continent == "Africa") %>%
  group_by(TIME_PERIOD, LLDC) %>%
  summarise(mean = mean(OBS_VALUE, na.rm = TRUE), .groups = "drop")

Graph it:

# One line per LLDC group; x-axis breaks are taken from every year present in
# merged_data and the tick labels are tilted 45 degrees.
ggplot(
  data = internet_over_time,
  mapping = aes(x = TIME_PERIOD, y = mean, group = LLDC)
) +
  geom_line(aes(color = LLDC)) +
  scale_x_continuous(breaks = unique(merged_data$TIME_PERIOD)) +
  labs(x = "", y = "Fixed broadband subscriptions (per 100 people)") +
  theme(axis.text.x = element_text(angle = 45))

3 Question 3

Import data and organize it:

# Life-expectancy extract, sorted by REF_AREA, with the "CAF" rows excluded.
# NOTE(review): the reason for excluding CAF is not recorded here -- confirm.
LifeExp <- read.csv("LifeExp.csv") %>%
  filter(REF_AREA != "CAF") %>%
  select(REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL, TIME_PERIOD, OBS_VALUE) %>%
  arrange(REF_AREA)

Solution:

# Change in life expectancy versus the previous observation of the same
# REF_AREA. order_by = TIME_PERIOD makes "previous" mean the preceding year
# regardless of the row order in the CSV (LifeExp is only sorted by REF_AREA).
# The first observation of each REF_AREA has no predecessor and gets NA.
LifeExp_Ranked <- LifeExp %>%
  group_by(REF_AREA) %>%
  mutate(le_delta = OBS_VALUE - lag(OBS_VALUE, order_by = TIME_PERIOD)) %>%
  ungroup()

# Rows with any NA removed, then sorted by le_delta ascending, so the largest
# year-on-year drops come first.
LifeExp_World_Top <- arrange(na.omit(LifeExp_Ranked), le_delta)
head(LifeExp_World_Top, n = 6)
## # A tibble: 6 × 6
##   REF_AREA REF_AREA_LABEL         INDICATOR_LABEL TIME_PERIOD OBS_VALUE le_delta
##   <chr>    <chr>                  <chr>                 <int>     <dbl>    <dbl>
## 1 RWA      Rwanda                 Life expectanc…        1994      12.2    -30.8
## 2 KHM      Cambodia               Life expectanc…        1975      12.8    -26.5
## 3 SOM      Somalia                Life expectanc…        1991      25.4    -21.5
## 4 LBN      Lebanon                Life expectanc…        1976      36.4    -21.3
## 5 BIH      Bosnia and Herzegovina Life expectanc…        1992      52.0    -19.8
## 6 BDI      Burundi                Life expectanc…        1972      25.8    -17.4
# Rows with any NA removed, then sorted by le_delta descending, so the largest
# year-on-year gains come first.
LifeExp_World_Bottom <- arrange(na.omit(LifeExp_Ranked), desc(le_delta))
head(LifeExp_World_Bottom, n = 6)
## # A tibble: 6 × 6
##   REF_AREA REF_AREA_LABEL INDICATOR_LABEL         TIME_PERIOD OBS_VALUE le_delta
##   <chr>    <chr>          <chr>                         <int>     <dbl>    <dbl>
## 1 KHM      Cambodia       Life expectancy at bir…        1979      41.1     29.5
## 2 RWA      Rwanda         Life expectancy at bir…        1995      41.5     29.4
## 3 SOM      Somalia        Life expectancy at bir…        1993      50.6     25.5
## 4 SSD      South Sudan    Life expectancy at bir…        1999      44.5     25.5
## 5 LBN      Lebanon        Life expectancy at bir…        1977      60.3     23.9
## 6 BGD      Bangladesh     Life expectancy at bir…        1972      49.6     23.1

Over time in SSF:

# All SSF and WLD rows. %in% tests set membership row by row; the previous
# `REF_AREA == c("SSF", "WLD")` recycled the two-element vector along the
# column and therefore kept only alternating rows of each series.
LifeExp_SSF_World <- LifeExp %>%
  filter(REF_AREA %in% c("SSF", "WLD"))

# Same-year gap WLD - SSF. Within each TIME_PERIOD the rows are ordered
# SSF then WLD, so lag() on the WLD row returns that year's SSF value.
# With both series complete, no TIME_PERIOD shift is needed to pair them up.
# A year that has only one of the two series yields diff = NA.
LifeExp_SSF_World_Diff <- LifeExp_SSF_World %>%
  arrange(TIME_PERIOD, REF_AREA) %>%
  group_by(TIME_PERIOD) %>%
  mutate(diff = OBS_VALUE - lag(OBS_VALUE)) %>%
  ungroup() %>%
  filter(REF_AREA == "WLD")

Graph it:

# Life expectancy over time, one coloured line per REF_AREA.
ggplot(
  data = LifeExp_SSF_World,
  mapping = aes(x = TIME_PERIOD, y = OBS_VALUE, group = REF_AREA)
) +
  geom_line(aes(color = REF_AREA)) +
  labs(x = "", y = "Life Expectancy at Birth") +
  theme(axis.text.x = element_text(angle = 45))

# Bar per year showing the stored diff value as-is (geom_col() draws bar
# heights directly from y, the same as geom_bar(stat = "identity")).
ggplot(data = LifeExp_SSF_World_Diff, mapping = aes(TIME_PERIOD, diff)) +
  geom_col() +
  labs(x = "", y = "Delta in Life Expectancy (WLD - SSF)") +
  theme(axis.text.x = element_text(angle = 45))

4 Question 4

# Long-format inputs restricted to the four columns needed for reshaping.
LE <- select(
  read_csv("LifeExp.csv"),
  REF_AREA, INDICATOR_LABEL, TIME_PERIOD, OBS_VALUE
)

gdp_per_capita <- select(
  read_csv("q1gdp.csv"),
  REF_AREA, INDICATOR_LABEL, TIME_PERIOD, OBS_VALUE
)

# Stack both indicators, keep MYS / BGD / KOR, and spread to one row per
# REF_AREA and TIME_PERIOD with one column per INDICATOR_LABEL.
whichcomesfirst <- rbind(LE, gdp_per_capita) %>%
  filter(REF_AREA %in% c("MYS", "BGD", "KOR")) %>%
  dcast(
    formula = REF_AREA + TIME_PERIOD ~ INDICATOR_LABEL,
    value.var = "OBS_VALUE"
  ) %>%
  arrange(REF_AREA)

Graph:

# Animated scatter: one state per TIME_PERIOD, with earlier positions left
# behind as a trail. Rendered to a GIF through gifski.
anim <- ggplot(
  data = whichcomesfirst,
  mapping = aes(
    x = `Life expectancy at birth, total (years)`,
    y = `GDP per capita (current US$)`,
    color = REF_AREA
  )
) +
  geom_point() +
  transition_states(
    states = TIME_PERIOD,
    transition_length = 10,
    state_length = 10
  ) +
  shadow_trail(distance = 0.05)

animate(anim, renderer = gifski_renderer())

5 Question 5

# Read one indicator extract, keep the standard columns, keep only rows whose
# UNIT_MEASURE equals `unit_code`, and then keep the rows of the most recent
# TIME_PERIOD remaining in the file (a single global maximum, not per country).
#   path      - CSV file to read
#   unit_code - UNIT_MEASURE value to keep, e.g. "0_TO_100" or "1_TO_7"
read_latest_indicator <- function(path, unit_code) {
  read.csv(path) %>%
    select(
      REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL,
      TIME_PERIOD, UNIT_MEASURE, OBS_VALUE
    ) %>%
    filter(UNIT_MEASURE == unit_code) %>%
    filter(TIME_PERIOD == max(TIME_PERIOD))
}

GII <- read_latest_indicator("innovation.csv", "0_TO_100")

# Note that this indicator is on a 1-to-7 scale, whereas the previous one is
# 0 to 100 -- a useful example for spotting small details like this.
Stability <- read_latest_indicator("stability.csv", "1_TO_7")

Legal <- read_latest_indicator("legal.csv", "1_TO_7")

Barriers <- read_latest_indicator("Barriers.csv", "1_TO_7")

Institutions <- read_latest_indicator("institutions.csv", "0_TO_100")

# Stack the five indicator tables and spread them to one row per REF_AREA and
# TIME_PERIOD, one column per INDICATOR_LABEL, sorted by REF_AREA.
Master <- arrange(
  dcast(
    rbind(GII, Stability, Legal, Barriers, Institutions),
    REF_AREA + TIME_PERIOD ~ INDICATOR_LABEL,
    value.var = "OBS_VALUE"
  ),
  REF_AREA
)

# Attach country attributes and derive an income band from GDP per capita
# (GDP_MD_EST / POP_EST * 1e6).
# The bands are contiguous: case_when() takes the first matching condition, so
# each upper bound alone is enough. The previous paired bounds (< 1145 then
# >= 1146, and so on) left the intervals [1145, 1146), [4515, 4516) and
# [14005, 14006) unmatched, giving those countries an NA income level.
# A missing GDP_per_Capita matches no condition and stays NA.
# NOTE(review): check whether POP_EST / GDP_MD_EST use a negative sentinel for
# missing values; such rows would need to be set to NA before the division.
Master <- merge(Master, country_meta, by.x = "REF_AREA", by.y = "ISO3") %>%
  mutate(
    GDP_per_Capita = GDP_MD_EST / POP_EST * 1e6,
    Income_Level = case_when(
      GDP_per_Capita <= 1145 ~ "Low Income",
      GDP_per_Capita <= 4515 ~ "Lower-Middle Income",
      GDP_per_Capita <= 14005 ~ "Upper-Middle Income",
      GDP_per_Capita > 14005 ~ "High Income"
    ),
    Income_Level = factor(
      Income_Level,
      levels = c(
        "Low Income", "Lower-Middle Income",
        "Upper-Middle Income", "High Income"
      )
    )
  )

Graph:

formula1 <- y ~ x
formula2 <- y ~ poly(x, 2)

# Scatter one Master column against the innovation-capability column, points
# coloured by REGION, with an lm fit (no confidence band) and its R^2 label.
#   x_col       - name of the Master column for the x axis
#   fit_formula - model formula used by both the fit line and the R^2 label
plot_vs_innovation <- function(x_col, fit_formula) {
  innovation_col <- "GCI 4.0: 12th pillar: Innovation capability"
  ggplot(Master, aes(.data[[x_col]], .data[[innovation_col]])) +
    geom_point(aes(color = REGION)) +
    geom_smooth(method = "lm", se = FALSE, formula = fit_formula) +
    # after_stat() replaces the deprecated ..rr.label.. notation.
    stat_poly_eq(aes(label = after_stat(rr.label)),
                 formula = fit_formula, parse = TRUE, size = 5) +
    # Axis titles are the column names, as with the bare-column mapping.
    labs(x = x_col, y = innovation_col)
}

p1 <- plot_vs_innovation(
  "GCI 4.0: Government ensuring policy stability (1-7)", formula1
)

p2 <- plot_vs_innovation(
  "GCI 4.0: Efficiency of legal framework to settle disputes (1-7)", formula2
)

p3 <- plot_vs_innovation(
  "GCI 4.0: Prevalence of non-tariff barriers (1-7)", formula2
)

p4 <- plot_vs_innovation("GCI 4.0: 1st pillar: Institutions", formula1)

grid.arrange(p1, p2, p3, p4, ncol = 1)

#Income disaggregation

# Innovation capability against one governance indicator, faceted and coloured
# by Income_Level, with a per-facet lm fit (confidence band shown) and R^2.
# Point size is mapped to POP_EST so that the "Population" size scale and the
# caption ("Size represents Population Estimate") describe what is drawn;
# previously no size aesthetic was mapped and every point had the same size.
#   x_col      - name of the Master column for the x axis
#   plot_title - plot title
#   x_label    - x-axis title
plot_innovation_by_income <- function(x_col, plot_title, x_label) {
  innovation_col <- "GCI 4.0: 12th pillar: Innovation capability"
  ggplot(
    Master,
    aes(.data[[x_col]], .data[[innovation_col]], color = Income_Level)
  ) +
    geom_point(aes(size = POP_EST), alpha = 0.6) +
    geom_smooth(method = "lm", se = TRUE) +
    facet_wrap(~Income_Level, scales = "free_x") +
    scale_color_viridis_d(name = "Income Group") +
    scale_size_continuous(labels = comma, name = "Population") +
    theme_minimal(base_size = 14) +
    labs(
      title = plot_title,
      subtitle = "Analysis faceted by World Bank Income Level Classifications",
      x = x_label,
      y = "12th Pillar: Innovation Capability",
      caption = "Source: GCI 4.0 | Note: Size represents Population Estimate"
    ) +
    theme(
      legend.position = "bottom",
      panel.grid.minor = element_blank(),
      strip.background = element_rect(fill = "#f0f0f0", color = NA),
      strip.text = element_text(face = "bold")
    ) +
    # after_stat() replaces the deprecated ..rr.label.. notation.
    stat_poly_eq(aes(label = after_stat(rr.label)),
                 formula = formula1, parse = TRUE, size = 5)
}

plot_innovation_by_income(
  "GCI 4.0: Government ensuring policy stability (1-7)",
  "Policy Stability vs. Innovation Capability",
  "Government Ensuring Policy Stability (1-7)"
)
## `geom_smooth()` using formula = 'y ~ x'

plot_innovation_by_income(
  "GCI 4.0: Efficiency of legal framework to settle disputes (1-7)",
  "Efficiency of legal framework to settle disputes vs. Innovation Capability",
  "Efficiency of legal framework to settle disputes"
)
## `geom_smooth()` using formula = 'y ~ x'

# Axis label typo ("barriersuj") corrected.
plot_innovation_by_income(
  "GCI 4.0: Prevalence of non-tariff barriers (1-7)",
  "Prevalence of non-tariff barriers vs. Innovation Capability",
  "Prevalence of non-tariff barriers"
)
## `geom_smooth()` using formula = 'y ~ x'

# The Institutions rows are the ones kept with UNIT_MEASURE == "0_TO_100", so
# the axis is labelled 0-100 rather than 1-7.
plot_innovation_by_income(
  "GCI 4.0: 1st pillar: Institutions",
  "Institutions vs. Innovation Capability",
  "1st pillar: Institutions (0-100)"
)
## `geom_smooth()` using formula = 'y ~ x'

6 Question 6

# High-technology exports: keep the standard columns and, for every REF_AREA,
# only the rows of its most recent TIME_PERIOD.
exp <- read.csv("techexp.csv") %>%
  select(all_of(c(
    "REF_AREA", "REF_AREA_LABEL", "INDICATOR_LABEL",
    "TIME_PERIOD", "UNIT_MEASURE", "OBS_VALUE"
  ))) %>%
  group_by(REF_AREA) %>%
  filter(TIME_PERIOD == max(TIME_PERIOD)) %>%
  ungroup()

# GDP per capita: keep the standard columns and, for every REF_AREA, only the
# rows of its most recent TIME_PERIOD.
gdp_per_capita <- read.csv("q1gdp.csv") %>%
  select(all_of(c(
    "REF_AREA", "REF_AREA_LABEL", "INDICATOR_LABEL",
    "TIME_PERIOD", "UNIT_MEASURE", "OBS_VALUE"
  ))) %>%
  group_by(REF_AREA) %>%
  filter(TIME_PERIOD == max(TIME_PERIOD)) %>%
  ungroup()

# Join exports (.x columns) with GDP (.y columns) on REF_AREA, attach country
# attributes (REF_AREA == ISO3), and give the plotted columns readable names.
exp <- rename(
  merge(
    merge(exp, gdp_per_capita, by = "REF_AREA"),
    country_meta,
    by.x = "REF_AREA",
    by.y = "ISO3"
  ),
  `Country Name` = REF_AREA_LABEL.y,
  `High-technology exports (% of manufactured exports)` = OBS_VALUE.x,
  `GDP per Capita` = OBS_VALUE.y
)

Graph:

# Bubble plot: high-technology export share against GDP per capita (log10 x),
# coloured by GDP per capita, bubble size = population, red linear trend line.
p <- ggplot(
  data = exp,
  aes(
    x = `GDP per Capita`,
    y = `High-technology exports (% of manufactured exports)`,
    color = `GDP per Capita`
  )
) +
  geom_smooth(method = "lm", color = "red", fullrange = TRUE, se = FALSE) +
  scale_color_viridis_c() +
  scale_x_log10() +
  # size: POP_EST is divided by 1e6 so the legend title "Population (M)" is
  #   accurate; size is mapped on the point layer only, the layer that draws
  #   bubbles.
  # text: mapped to the `Country Name` column (backticks). The quoted form
  #   "Country Name" mapped the literal string to every point. ggplot2 warns
  #   about the unknown `text` aesthetic; it is consumed by ggplotly() below.
  geom_point(
    aes(size = POP_EST / 1e6, text = `Country Name`),
    alpha = 0.6
  ) +
  scale_size(range = c(1.4, 19), name = "Population (M)")

#You need to show them in the tutorial exactly how to generate a bubble plot

# tooltip names the aesthetic to display, so "text" shows the country name.
pp <- ggplotly(p, tooltip = "text")
## `geom_smooth()` using formula = 'y ~ x'
pp

7 Question 7

# Financial-inclusion extract: keep only the all-"Total" breakdown rows for
# ages 15 and over, then each REF_AREA's most recent TIME_PERIOD.
fin <- read.csv("fininc.csv") %>%
  filter(
    SEX_LABEL == "Total",
    AGE_LABEL == "15 years old and over",
    URBANISATION_LABEL == "Total",
    COMP_BREAKDOWN_1_LABEL == "Total",
    COMP_BREAKDOWN_2_LABEL == "Total",
    COMP_BREAKDOWN_3_LABEL == "Total"
  ) %>%
  select(
    REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL,
    TIME_PERIOD, UNIT_MEASURE, OBS_VALUE
  ) %>%
  group_by(REF_AREA) %>%
  filter(TIME_PERIOD == max(TIME_PERIOD)) %>%
  ungroup()

# Join the indicator rows to the country map on ISO3 codes held in REF_AREA.
mapdata <- joinCountryData2Map(
  dF = fin,
  joinCode = "ISO3",
  nameJoinColumn = "REF_AREA",
  nameCountryColumn = "REF_AREA_LABEL"
)
## 158 codes from your data successfully matched countries in the map
## 11 codes from your data failed to match with a country code in the map
## 85 codes from the map weren't represented in your data
# Eight-colour red-yellow-green palette.
# NOTE(review): the map call below uses colourPalette = "heat", not cp.
cp <- brewer.pal(n = 8, name = "RdYlGn")

Map:

# Leave an 8-line bottom margin, then draw OBS_VALUE for the Africa region
# in 10 log-spaced categories with the "heat" palette and black borders.
par(mar = c(8, 0, 0, 0))
mapParams <- mapCountryData(
  mapToPlot = mapdata,
  nameColumnToPlot = "OBS_VALUE",
  mapRegion = "Africa",
  catMethod = "logFixedWidth",
  numCats = 10,
  colourPalette = "heat",
  borderCol = "black",
  addLegend = TRUE,
  mapTitle = ""
)

Over time:

# Full time series (all years) of the all-"Total" breakdown rows for KEN and
# SSA, with the label column renamed for use as the plot legend title.
fin <- read.csv("fininc.csv") %>%
  filter(
    SEX_LABEL == "Total",
    AGE_LABEL == "15 years old and over",
    URBANISATION_LABEL == "Total",
    COMP_BREAKDOWN_1_LABEL == "Total",
    COMP_BREAKDOWN_2_LABEL == "Total",
    COMP_BREAKDOWN_3_LABEL == "Total",
    REF_AREA %in% c("KEN", "SSA")
  ) %>%
  rename(`Country/Region` = REF_AREA_LABEL)

# Line plus point markers per Country/Region over TIME_PERIOD.
ggplot(
  data = fin,
  mapping = aes(TIME_PERIOD, OBS_VALUE, color = `Country/Region`)
) +
  geom_line() +
  geom_point() +
  labs(x = "", y = "% Borrowed any money using a mobile money account")