Import data and organize it:
# Load the GDP-per-capita extract and, country by country, keep only the
# benchmark years 1960, 1980 and 2000 plus that country's own most recent year.
gdp_per_capita <- read_csv("q1gdp.csv") %>%
  select(
    REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL,
    TIME_PERIOD, UNIT_MEASURE, OBS_VALUE
  ) %>%
  group_by(REF_AREA) %>%
  # max() is evaluated inside each REF_AREA group, i.e. per country.
  filter(TIME_PERIOD %in% c(1960, 1980, 2000, max(TIME_PERIOD))) %>%
  ungroup()
# Country-level attributes taken from the attribute table (`@data`) of the
# `countriesCoarse` spatial object.
country_meta <- select(
  countriesCoarse@data,
  NAME, ISO3, TYPE, POP_EST, LAT, LON, GDP_MD_EST, continent, REGION, LLDC
)

# Attach the attributes to the GDP rows. merge() with its defaults keeps only
# codes present in both tables (REF_AREA on the left, ISO3 on the right).
merged_data <- merge(
  x = gdp_per_capita,
  y = country_meta,
  by.x = "REF_AREA",
  by.y = "ISO3"
)
Graph:
# Linear model shared by the fitted line and the R^2 annotation.
formula1 <- y ~ x

# Build one scatter plot of GDP per capita (log2 x-axis) against latitude.
#   data  - subset of merged_data to draw
#   title - panel title
# Returns a ggplot object.
plot_gdp_latitude <- function(data, title) {
  ggplot(data = data, aes(x = OBS_VALUE, y = LAT)) +
    geom_point(aes(color = continent)) +
    geom_smooth(method = "lm", color = "black", formula = formula1) +
    stat_poly_eq(aes(label = paste(..rr.label.., sep = "~~~")),
                 formula = formula1, parse = TRUE, size = 5) +
    scale_x_continuous(trans = "log2") +
    xlab("GDP Per Capita") +
    ylab("Latitude") +
    labs(title = title) +
    ylim(-70, 70)
}

p1 <- plot_gdp_latitude(filter(merged_data, TIME_PERIOD == 1960), "1960")
p2 <- plot_gdp_latitude(filter(merged_data, TIME_PERIOD == 1980), "1980")
p3 <- plot_gdp_latitude(filter(merged_data, TIME_PERIOD == 2000), "2000")

# "Latest Year" means each country's own most recent observation, matching how
# gdp_per_capita was filtered. Comparing against the global maximum year would
# silently drop every country whose latest observation is older than that.
latest_per_country <- merged_data %>%
  group_by(REF_AREA) %>%
  filter(TIME_PERIOD == max(TIME_PERIOD)) %>%
  ungroup()
p4 <- plot_gdp_latitude(latest_per_country, "Latest Year")

grid.arrange(p1, p2, p3, p4, ncol = 2)
Import data and organize it:
# Load the fixed-broadband extract, keeping only the identifier, label, year
# and value columns.
internet <- read_csv("fixedbroadband100.csv") %>%
  select(REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL, TIME_PERIOD, OBS_VALUE)

# Attach country attributes. Note that this overwrites the earlier
# GDP-based merged_data.
merged_data <- merge(
  x = internet,
  y = country_meta,
  by.x = "REF_AREA",
  by.y = "ISO3"
)
# African rows only, reduced to each country's most recent year.
# NOTE(review): this object is not referenced again in the visible part of the
# script; the plots and the t-test below use merged_data.
internet_latest_africa <- merged_data %>%
  filter(REGION == "Africa") %>%
  group_by(REF_AREA) %>%
  # max() is evaluated per country because of the grouping above.
  filter(TIME_PERIOD == max(TIME_PERIOD)) %>%
  ungroup()
Graph:
# Distribution of broadband subscriptions by LLDC status.
# coord_cartesian() zooms the y-axis to 0-50 without discarding observations,
# so the box statistics are still computed on all rows.
ggplot(data = merged_data, mapping = aes(x = LLDC, y = OBS_VALUE)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 50)) +
  labs(y = "Fixed broadband subscriptions (per 100 people)")
T-Test for statistical significance
# Two-sample t-test comparing broadband subscriptions between the two LLDC
# groups, run on every row of merged_data.
t_test <- t.test(formula = OBS_VALUE ~ LLDC, data = merged_data)
print(t_test)
##
## Welch Two Sample t-test
##
## data: OBS_VALUE by LLDC
## t = -28.126, df = 1877.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group LLDC and group other is not equal to 0
## 95 percent confidence interval:
## -10.048756 -8.738684
## sample estimates:
## mean in group LLDC mean in group other
## 3.018114 12.411834
Generate YoY table of average broadband prevalence in all LLDCs vs. non-LLDCs
# Yearly mean broadband subscriptions for African countries, split by LLDC
# status. One output row per (TIME_PERIOD, LLDC) pair.
internet_over_time <- merged_data %>%
  filter(continent == "Africa") %>%
  group_by(TIME_PERIOD, LLDC) %>%
  summarise(
    # na.rm = TRUE: a single missing observation must not blank out the
    # average for an entire year/group.
    mean = mean(OBS_VALUE, na.rm = TRUE),
    # Return an ungrouped table instead of one still grouped by TIME_PERIOD.
    .groups = "drop"
  )
Graph it:
# One line per LLDC group showing the yearly mean over time.
ggplot(
  data = internet_over_time,
  mapping = aes(x = TIME_PERIOD, y = mean, group = LLDC)
) +
  geom_line(aes(color = LLDC)) +
  theme(axis.text.x = element_text(angle = 45)) +
  # Put a tick on every year that occurs in the merged data.
  scale_x_continuous(breaks = unique(merged_data$TIME_PERIOD)) +
  labs(
    x = "",
    y = "Fixed broadband subscriptions (per 100 people)"
  )
Import data and organize it:
# Load life-expectancy data, drop the rows coded "CAF", and sort by area code.
LifeExp <- read.csv("LifeExp.csv") %>%
  select(REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL, TIME_PERIOD, OBS_VALUE) %>%
  filter(REF_AREA != "CAF") %>%
  arrange(REF_AREA)
Solution:
# Change in life expectancy versus the previous observation of the same area.
# The first observation of each area has no predecessor, so le_delta is NA.
LifeExp_Ranked <- LifeExp %>%
  group_by(REF_AREA) %>%
  # order_by makes "previous" mean "previous in time" explicitly. LifeExp is
  # only sorted by REF_AREA, so without it the result would depend on the row
  # order of the CSV file.
  mutate(le_delta = OBS_VALUE - lag(OBS_VALUE, order_by = TIME_PERIOD)) %>%
  ungroup()
# Rows sorted by ascending le_delta, so the largest drops come first.
# na.omit() removes every row that has an NA in any column.
LifeExp_World_Top <- arrange(na.omit(LifeExp_Ranked), le_delta)
head(LifeExp_World_Top)
## # A tibble: 6 × 6
## REF_AREA REF_AREA_LABEL INDICATOR_LABEL TIME_PERIOD OBS_VALUE le_delta
## <chr> <chr> <chr> <int> <dbl> <dbl>
## 1 RWA Rwanda Life expectanc… 1994 12.2 -30.8
## 2 KHM Cambodia Life expectanc… 1975 12.8 -26.5
## 3 SOM Somalia Life expectanc… 1991 25.4 -21.5
## 4 LBN Lebanon Life expectanc… 1976 36.4 -21.3
## 5 BIH Bosnia and Herzegovina Life expectanc… 1992 52.0 -19.8
## 6 BDI Burundi Life expectanc… 1972 25.8 -17.4
# Rows sorted by descending le_delta, so the largest gains come first.
# na.omit() removes every row that has an NA in any column.
LifeExp_World_Bottom <- arrange(na.omit(LifeExp_Ranked), desc(le_delta))
head(LifeExp_World_Bottom)
## # A tibble: 6 × 6
## REF_AREA REF_AREA_LABEL INDICATOR_LABEL TIME_PERIOD OBS_VALUE le_delta
## <chr> <chr> <chr> <int> <dbl> <dbl>
## 1 KHM Cambodia Life expectancy at bir… 1979 41.1 29.5
## 2 RWA Rwanda Life expectancy at bir… 1995 41.5 29.4
## 3 SOM Somalia Life expectancy at bir… 1993 50.6 25.5
## 4 SSD South Sudan Life expectancy at bir… 1999 44.5 25.5
## 5 LBN Lebanon Life expectancy at bir… 1977 60.3 23.9
## 6 BGD Bangladesh Life expectancy at bir… 1972 49.6 23.1
Over time in SSF:
# Per-year gap in life expectancy between the "WLD" and "SSF" aggregates
# (WLD minus SSF). One output row per year, carried on the WLD row.
LifeExp_SSF_World_Diff <- LifeExp %>%
  # %in% tests set membership. `==` against a length-2 vector recycles it down
  # the rows and keeps only alternating observations, which is why the two
  # series previously had to be shifted by a year to line up.
  filter(REF_AREA %in% c("SSF", "WLD")) %>%
  # Within each year put SSF before WLD so that lag() on the WLD row returns
  # the SSF value of the same year.
  arrange(TIME_PERIOD, REF_AREA) %>%
  group_by(TIME_PERIOD) %>%
  mutate(diff = OBS_VALUE - lag(OBS_VALUE)) %>%
  ungroup() %>%
  # Only the WLD rows carry the difference; it is NA when SSF has no value
  # for that year.
  filter(REF_AREA == "WLD")
# All observations for the "SSF" and "WLD" aggregates.
# %in% is required here: `== c("SSF", "WLD")` recycles the vector down the rows
# and would keep only every other observation of each series.
LifeExp_SSF_World <- LifeExp %>%
  filter(REF_AREA %in% c("SSF", "WLD"))
Graph it
# Life expectancy over time, one coloured line per aggregate.
ggplot(
  data = LifeExp_SSF_World,
  mapping = aes(x = TIME_PERIOD, y = OBS_VALUE, group = REF_AREA)
) +
  geom_line(aes(color = REF_AREA)) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(x = "", y = "Life Expectancy at Birth")
# Bar chart of the yearly WLD - SSF gap.
# geom_col() draws bar heights straight from y, same as
# geom_bar(stat = "identity").
ggplot(data = LifeExp_SSF_World_Diff, mapping = aes(x = TIME_PERIOD, y = diff)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(x = "", y = "Delta in Life Expectancy (WLD - SSF)")
# Reload both indicators with just the columns needed for reshaping.
# Note: this replaces the earlier, year-filtered gdp_per_capita.
gdp_per_capita <- select(
  read_csv("q1gdp.csv"),
  REF_AREA, INDICATOR_LABEL, TIME_PERIOD, OBS_VALUE
)
LE <- select(
  read_csv("LifeExp.csv"),
  REF_AREA, INDICATOR_LABEL, TIME_PERIOD, OBS_VALUE
)

# Stack the two indicators, keep three countries, then spread to one row per
# (country, year) with one column per indicator label.
whichcomesfirst <- rbind(LE, gdp_per_capita) %>%
  filter(REF_AREA %in% c("MYS", "BGD", "KOR")) %>%
  dcast(REF_AREA + TIME_PERIOD ~ INDICATOR_LABEL, value.var = "OBS_VALUE") %>%
  arrange(REF_AREA)
Graph:
# Animated scatter: life expectancy (x) against GDP per capita (y), one frame
# state per TIME_PERIOD, with a trail of earlier positions left behind.
anim <- ggplot(
  data = whichcomesfirst,
  mapping = aes(
    x = `Life expectancy at birth, total (years)`,
    y = `GDP per capita (current US$)`,
    color = REF_AREA
  )
) +
  geom_point() +
  transition_states(TIME_PERIOD, transition_length = 10, state_length = 10) +
  shadow_trail(0.05)

# Render the animation as a GIF.
animate(anim, renderer = gifski_renderer())
# Read one indicator file and keep the rows reported in `unit_code` for the
# most recent year.
#   path      - CSV file to read
#   unit_code - UNIT_MEASURE value to keep
# The two filters are deliberately sequential: the latest year is determined
# AFTER restricting to the requested unit.
read_latest_indicator <- function(path, unit_code) {
  read.csv(path) %>%
    select(REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL, TIME_PERIOD, UNIT_MEASURE, OBS_VALUE) %>%
    filter(UNIT_MEASURE == unit_code) %>%
    filter(TIME_PERIOD == max(TIME_PERIOD))
}

GII <- read_latest_indicator("innovation.csv", "0_TO_100")
# Note the unit: this indicator is on a 1-to-7 scale, while the innovation
# index above is on a 0-to-100 scale. Worth calling out in the tutorial as an
# example of a small detail readers should learn to spot.
Stability <- read_latest_indicator("stability.csv", "1_TO_7")
Legal <- read_latest_indicator("legal.csv", "1_TO_7")
Barriers <- read_latest_indicator("Barriers.csv", "1_TO_7")
Institutions <- read_latest_indicator("institutions.csv", "0_TO_100")
# Stack the five indicators, then spread them so each (country, year) row has
# one column per indicator label.
Master <- rbind(GII, Stability, Legal, Barriers, Institutions) %>%
  dcast(REF_AREA + TIME_PERIOD ~ INDICATOR_LABEL, value.var = "OBS_VALUE") %>%
  arrange(REF_AREA)
# Attach country attributes, derive GDP per capita and bucket it into four
# ordered income levels.
income_levels <- c(
  "Low Income", "Lower-Middle Income", "Upper-Middle Income", "High Income"
)
Master <- merge(Master, country_meta, by.x = "REF_AREA", by.y = "ISO3") %>%
  mutate(
    # Scale GDP_MD_EST by 1e6 before dividing by the population estimate.
    # NOTE(review): assumes GDP_MD_EST is in millions and POP_EST is a head
    # count - confirm against the map data documentation.
    GDP_per_Capita = GDP_MD_EST / POP_EST * 1e6,
    # Contiguous brackets: every non-missing value lands in exactly one level.
    # The previous bounds (< 1145 / >= 1146, ...) left one-unit gaps, so values
    # such as 1145.5 were classified as NA. Clauses are evaluated in order, so
    # each one only needs its upper bound. A missing GDP_per_Capita stays NA.
    Income_Level = case_when(
      GDP_per_Capita <= 1145 ~ "Low Income",
      GDP_per_Capita <= 4515 ~ "Lower-Middle Income",
      GDP_per_Capita <= 14005 ~ "Upper-Middle Income",
      GDP_per_Capita > 14005 ~ "High Income"
    ),
    Income_Level = factor(Income_Level, levels = income_levels)
  )
Graph:
# Model formulas shared by the fitted curves and their R^2 annotations:
# formula1 is a straight line, formula2 a second-degree polynomial.
formula1 <- y ~ x
formula2 <- y ~ poly(x, 2)

# Policy stability vs. innovation capability (linear fit).
p1 <- ggplot(
  data = Master,
  mapping = aes(
    x = `GCI 4.0: Government ensuring policy stability (1-7)`,
    y = `GCI 4.0: 12th pillar: Innovation capability`
  )
) +
  geom_point(aes(color = REGION)) +
  geom_smooth(method = "lm", se = FALSE, formula = formula1) +
  stat_poly_eq(aes(label = paste(..rr.label.., sep = "~~~")),
               formula = formula1, parse = TRUE, size = 5)

# Legal-framework efficiency vs. innovation capability (quadratic fit).
p2 <- ggplot(
  data = Master,
  mapping = aes(
    x = `GCI 4.0: Efficiency of legal framework to settle disputes (1-7)`,
    y = `GCI 4.0: 12th pillar: Innovation capability`
  )
) +
  geom_point(aes(color = REGION)) +
  geom_smooth(method = "lm", se = FALSE, formula = formula2) +
  stat_poly_eq(aes(label = paste(..rr.label.., sep = "~~~")),
               formula = formula2, parse = TRUE, size = 5)

# Non-tariff barriers vs. innovation capability (quadratic fit).
p3 <- ggplot(
  data = Master,
  mapping = aes(
    x = `GCI 4.0: Prevalence of non-tariff barriers (1-7)`,
    y = `GCI 4.0: 12th pillar: Innovation capability`
  )
) +
  geom_point(aes(color = REGION)) +
  geom_smooth(method = "lm", se = FALSE, formula = formula2) +
  stat_poly_eq(aes(label = paste(..rr.label.., sep = "~~~")),
               formula = formula2, parse = TRUE, size = 5)

# Institutions pillar vs. innovation capability (linear fit).
p4 <- ggplot(
  data = Master,
  mapping = aes(
    x = `GCI 4.0: 1st pillar: Institutions`,
    y = `GCI 4.0: 12th pillar: Innovation capability`
  )
) +
  geom_point(aes(color = REGION)) +
  geom_smooth(method = "lm", se = FALSE, formula = formula1) +
  stat_poly_eq(aes(label = paste(..rr.label.., sep = "~~~")),
               formula = formula1, parse = TRUE, size = 5)

# Stack the four panels in a single column.
grid.arrange(p1, p2, p3, p4, ncol = 1)
#Income disaggregation
# Policy stability vs. innovation capability, one facet per income level,
# with a linear fit and its R^2 in every facet.
ggplot(
  Master,
  aes(
    `GCI 4.0: Government ensuring policy stability (1-7)`,
    `GCI 4.0: 12th pillar: Innovation capability`,
    color = Income_Level
  )
) +
  # Map point size to population so that the size scale and the caption below
  # ("Size represents Population Estimate") describe what is actually drawn.
  geom_point(aes(size = POP_EST), alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  facet_wrap(~Income_Level, scales = "free_x") +
  scale_color_viridis_d(name = "Income Group") +
  scale_size_continuous(labels = comma, name = "Population") +
  theme_minimal(base_size = 14) +
  labs(
    title = "Policy Stability vs. Innovation Capability",
    subtitle = "Analysis faceted by World Bank Income Level Classifications",
    x = "Government Ensuring Policy Stability (1-7)",
    y = "12th Pillar: Innovation Capability",
    caption = "Source: GCI 4.0 | Note: Size represents Population Estimate"
  ) +
  theme(
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "#f0f0f0", color = NA),
    strip.text = element_text(face = "bold")
  ) +
  stat_poly_eq(aes(label = paste(..rr.label.., sep = "~~~")),
               formula = formula1, parse = TRUE, size = 5)
## `geom_smooth()` using formula = 'y ~ x'
# Legal-framework efficiency vs. innovation capability, one facet per income
# level, with a linear fit and its R^2 in every facet.
ggplot(
  Master,
  aes(
    `GCI 4.0: Efficiency of legal framework to settle disputes (1-7)`,
    `GCI 4.0: 12th pillar: Innovation capability`,
    color = Income_Level
  )
) +
  # Map point size to population so that the size scale and the caption below
  # ("Size represents Population Estimate") describe what is actually drawn.
  geom_point(aes(size = POP_EST), alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  facet_wrap(~Income_Level, scales = "free_x") +
  scale_color_viridis_d(name = "Income Group") +
  scale_size_continuous(labels = comma, name = "Population") +
  theme_minimal(base_size = 14) +
  labs(
    title = "Efficiency of legal framework to settle disputes vs. Innovation Capability",
    subtitle = "Analysis faceted by World Bank Income Level Classifications",
    x = "Efficiency of legal framework to settle disputes",
    y = "12th Pillar: Innovation Capability",
    caption = "Source: GCI 4.0 | Note: Size represents Population Estimate"
  ) +
  theme(
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "#f0f0f0", color = NA),
    strip.text = element_text(face = "bold")
  ) +
  stat_poly_eq(aes(label = paste(..rr.label.., sep = "~~~")),
               formula = formula1, parse = TRUE, size = 5)
## `geom_smooth()` using formula = 'y ~ x'
# Non-tariff barriers vs. innovation capability, one facet per income level,
# with a linear fit and its R^2 in every facet.
ggplot(
  Master,
  aes(
    `GCI 4.0: Prevalence of non-tariff barriers (1-7)`,
    `GCI 4.0: 12th pillar: Innovation capability`,
    color = Income_Level
  )
) +
  # Map point size to population so that the size scale and the caption below
  # ("Size represents Population Estimate") describe what is actually drawn.
  geom_point(aes(size = POP_EST), alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  facet_wrap(~Income_Level, scales = "free_x") +
  scale_color_viridis_d(name = "Income Group") +
  scale_size_continuous(labels = comma, name = "Population") +
  theme_minimal(base_size = 14) +
  labs(
    title = "Prevalence of non-tariff barriers vs. Innovation Capability",
    subtitle = "Analysis faceted by World Bank Income Level Classifications",
    # Axis label typo fixed (previously ended in a stray "uj").
    x = "Prevalence of non-tariff barriers",
    y = "12th Pillar: Innovation Capability",
    caption = "Source: GCI 4.0 | Note: Size represents Population Estimate"
  ) +
  theme(
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "#f0f0f0", color = NA),
    strip.text = element_text(face = "bold")
  ) +
  stat_poly_eq(aes(label = paste(..rr.label.., sep = "~~~")),
               formula = formula1, parse = TRUE, size = 5)
## `geom_smooth()` using formula = 'y ~ x'
# Institutions pillar vs. innovation capability, one facet per income level,
# with a linear fit and its R^2 in every facet.
ggplot(
  Master,
  aes(
    `GCI 4.0: 1st pillar: Institutions`,
    `GCI 4.0: 12th pillar: Innovation capability`,
    color = Income_Level
  )
) +
  # Map point size to population so that the size scale and the caption below
  # ("Size represents Population Estimate") describe what is actually drawn.
  geom_point(aes(size = POP_EST), alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  facet_wrap(~Income_Level, scales = "free_x") +
  scale_color_viridis_d(name = "Income Group") +
  scale_size_continuous(labels = comma, name = "Population") +
  theme_minimal(base_size = 14) +
  labs(
    title = "Institutions vs. Innovation Capability",
    subtitle = "Analysis faceted by World Bank Income Level Classifications",
    # The Institutions series is loaded with UNIT_MEASURE "0_TO_100", so the
    # axis is labelled 0-100 rather than 1-7.
    x = "1st pillar: Institutions (0-100)",
    y = "12th Pillar: Innovation Capability",
    caption = "Source: GCI 4.0 | Note: Size represents Population Estimate"
  ) +
  theme(
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "#f0f0f0", color = NA),
    strip.text = element_text(face = "bold")
  ) +
  stat_poly_eq(aes(label = paste(..rr.label.., sep = "~~~")),
               formula = formula1, parse = TRUE, size = 5)
## `geom_smooth()` using formula = 'y ~ x'
# High-technology exports: each country's most recent observation.
exp <- read.csv("techexp.csv") %>%
  select(REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL, TIME_PERIOD, UNIT_MEASURE, OBS_VALUE) %>%
  group_by(REF_AREA) %>%
  filter(TIME_PERIOD == max(TIME_PERIOD)) %>%
  ungroup()

# GDP per capita: each country's most recent observation.
# Note: this replaces any earlier gdp_per_capita object.
gdp_per_capita <- read.csv("q1gdp.csv") %>%
  select(REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL, TIME_PERIOD, UNIT_MEASURE, OBS_VALUE) %>%
  group_by(REF_AREA) %>%
  filter(TIME_PERIOD == max(TIME_PERIOD)) %>%
  ungroup()

# Join exports (left, columns suffixed ".x") with GDP (right, ".y") and the
# country attributes, then give the plotted columns readable names.
exp <- exp %>%
  merge(y = gdp_per_capita, by = "REF_AREA") %>%
  merge(y = country_meta, by.x = "REF_AREA", by.y = "ISO3") %>%
  rename(
    `High-technology exports (% of manufactured exports)` = OBS_VALUE.x,
    `GDP per Capita` = OBS_VALUE.y,
    `Country Name` = REF_AREA_LABEL.y
  )
Graph:
# Bubble plot of high-technology exports against GDP per capita (log10 x-axis).
# Bubble colour follows GDP per capita and bubble size follows population.
p <- ggplot(
  data = exp,
  aes(
    x = `GDP per Capita`,
    y = `High-technology exports (% of manufactured exports)`,
    color = `GDP per Capita`,
    # Convert to millions so the values match the "Population (M)" legend.
    # NOTE(review): assumes POP_EST is a raw head count - confirm.
    size = POP_EST / 1e6
  )
) +
  geom_smooth(method = "lm", color = "red", fullrange = TRUE, se = FALSE) +
  scale_color_viridis_c() +
  scale_x_log10() +
  # `text` must reference the column (backticks), not the quoted string
  # "Country Name". A quoted string would label every point with that literal
  # text. ggplot2 itself ignores `text`; plotly reads it for the tooltip.
  geom_point(alpha = 0.6, aes(text = `Country Name`)) +
  scale_size(range = c(1.4, 19), name = "Population (M)")
#You need to show them in the tutorial exactly how to generate a bubble plot
# `tooltip` takes aesthetic names, so "text" selects the country name mapped
# in geom_point() above.
pp <- ggplotly(p, tooltip = "text")
## `geom_smooth()` using formula = 'y ~ x'
pp
# Load the financial-inclusion extract, keep only the headline series (the
# "Total" level of every breakdown, ages 15+), and reduce it to each country's
# most recent year.
fin <- read.csv('fininc.csv') %>%
  filter(
    SEX_LABEL == "Total",
    AGE_LABEL == "15 years old and over",
    URBANISATION_LABEL == "Total",
    COMP_BREAKDOWN_1_LABEL == "Total",
    COMP_BREAKDOWN_2_LABEL == "Total",
    COMP_BREAKDOWN_3_LABEL == "Total"
  ) %>%
  select(REF_AREA, REF_AREA_LABEL, INDICATOR_LABEL, TIME_PERIOD, UNIT_MEASURE, OBS_VALUE) %>%
  group_by(REF_AREA) %>%
  filter(TIME_PERIOD == max(TIME_PERIOD)) %>%
  ungroup()
# Join the indicator rows to the country map, matching REF_AREA against the
# map's ISO3 codes.
mapdata <- joinCountryData2Map(
  fin,
  joinCode = "ISO3",
  nameJoinColumn = "REF_AREA",
  nameCountryColumn = "REF_AREA_LABEL"
)
## 158 codes from your data successfully matched countries in the map
## 11 codes from your data failed to match with a country code in the map
## 85 codes from the map weren't represented in your data
# Eight-colour 'RdYlGn' palette.
# NOTE(review): `cp` is not referenced by the visible code (the map below uses
# the "heat" palette) - confirm whether it is still needed.
cp <- brewer.pal(n = 8, name = 'RdYlGn')
Map:
# Set the plot margins: 8 lines at the bottom, none on the other three sides.
par(mar = c(8, 0, 0, 0))

# Choropleth of OBS_VALUE for the Africa region, with ten log-spaced
# fixed-width categories.
mapParams <- mapCountryData(
  mapdata,
  nameColumnToPlot = "OBS_VALUE",
  addLegend = TRUE,
  mapTitle = "",
  mapRegion = "Africa",
  catMethod = "logFixedWidth",
  numCats = 10,
  colourPalette = "heat",
  borderCol = "black"
)
Over time:
# Reload the financial-inclusion extract, this time keeping every year of the
# headline series for the two areas coded "KEN" and "SSA".
# Note: this overwrites the latest-year `fin` built earlier.
fin <- read.csv('fininc.csv') %>%
  filter(
    SEX_LABEL == "Total",
    AGE_LABEL == "15 years old and over",
    URBANISATION_LABEL == "Total",
    COMP_BREAKDOWN_1_LABEL == "Total",
    COMP_BREAKDOWN_2_LABEL == "Total",
    COMP_BREAKDOWN_3_LABEL == "Total",
    REF_AREA %in% c("KEN", "SSA")
  ) %>%
  rename(`Country/Region` = REF_AREA_LABEL)
# Time series of the indicator, one coloured line (with point markers) per
# country/region.
ggplot(
  data = fin,
  mapping = aes(x = TIME_PERIOD, y = OBS_VALUE, color = `Country/Region`)
) +
  geom_line() +
  geom_point() +
  labs(
    x = "",
    y = "% Borrowed any money using a mobile money account"
  )