# Life expectancy data set link
# Use a square 6 x 6 plot size so the figures come out at a comfortable size
options(
  repr.plot.height = 6,
  repr.plot.width = 6
)
# Loading packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
# Read the UN life expectancy table from disk and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this script anywhere else.
life_expectancy <- read.csv(
  file = "C:/Users/maiam/Dropbox/PROFESSIONAL DEVELOPMENT/DATA SCIENCE/01_R/Visualizing Inequalities in Life Expectancy//UNdata.csv"
)
head(life_expectancy)
## Country.or.Area Subgroup Year
## 1 Afghanistan Female 2000-2005
## 2 Afghanistan Female 1995-2000
## 3 Afghanistan Female 1990-1995
## 4 Afghanistan Female 1985-1990
## 5 Afghanistan Male 2000-2005
## 6 Afghanistan Male 1995-2000
## Source Unit
## 1 UNPD_World Population Prospects_2006 (International estimate) Years
## 2 UNPD_World Population Prospects_2006 (International estimate) Years
## 3 UNPD_World Population Prospects_2006 (International estimate) Years
## 4 UNPD_World Population Prospects_2006 (International estimate) Years
## 5 UNPD_World Population Prospects_2006 (International estimate) Years
## 6 UNPD_World Population Prospects_2006 (International estimate) Years
## Value Value.Footnotes
## 1 42 NA
## 2 42 NA
## 3 42 NA
## 4 41 NA
## 5 42 NA
## 6 42 NA
# Keep only the 2000-2005 period, drop everything except country, sex and
# value, then spread the sex subgroup into one column per subgroup so that
# each country occupies a single row
subdata <- life_expectancy %>%
  filter(Year == "2000-2005") %>%
  select(Country.or.Area, Subgroup, Value) %>%
  spread(key = Subgroup, value = Value)

# Preview the reshaped table
head(subdata)
## Country.or.Area Female Male
## 1 Afghanistan 42 42
## 2 Albania 79 73
## 3 Algeria 72 70
## 4 Angola 43 39
## 5 Argentina 78 71
## 6 Armenia 75 68
# Scatter plot of male (x axis) against female (y axis) life expectancy
ggplot(data = subdata, mapping = aes(x = Male, y = Female)) +
  geom_point()

# Same scatter plot with a y = x reference line; both axes are zoomed to the
# 35-85 range via the coordinate system
ggplot(data = subdata, mapping = aes(x = Male, y = Female)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0) +
  coord_cartesian(
    xlim = c(35, 85),
    ylim = c(35, 85)
  )

# Styled version of the scatter plot: translucent green points, a dashed
# y = x reference line, axes limited to 35-85, and title/subtitle/caption
ggplot(data = subdata, mapping = aes(x = Male, y = Female)) +
  geom_point(
    shape = 21,
    size = 5,
    alpha = .55,
    fill = "chartreuse3",
    colour = "white"
  ) +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  labs(
    x = "Males",
    y = "Females",
    title = "Life Expectancy at Birth by Country",
    subtitle = "Years. Period: 2000-2005. Average.",
    caption = "Source: United Nations Statistics Division"
  ) +
  scale_x_continuous(limits = c(35, 85)) +
  scale_y_continuous(limits = c(35, 85))

# Subsetting data to obtain countries of interest.
# arrange() sorts in ascending order, so `top_male` holds the three rows with
# the smallest Male - Female value and `top_female` the three rows with the
# smallest Female - Male value.
top_male <- subdata %>%
  arrange(Male - Female) %>%
  head(3)
top_female <- subdata %>%
  arrange(Female - Male) %>%
  head(3)

# Adding text to the previous plot to label countries of interest.
# Each geom_text() layer is given its own subset as `data`, so only those
# countries are labelled; the label aesthetic is inherited from ggplot().
ggplot(subdata, aes(x = Male, y = Female, label = Country.or.Area)) +
  geom_point(
    colour = "white", fill = "chartreuse3",
    shape = 21, alpha = .55, size = 5
  ) +
  geom_abline(intercept = 0, slope = 1, linetype = 2) +
  scale_x_continuous(limits = c(35, 85)) +
  scale_y_continuous(limits = c(35, 85)) +
  labs(
    title = "Life Expectancy at Birth by Country",
    subtitle = "Years. Period: 2000-2005. Average.",
    caption = "Source: United Nations Statistics Division",
    x = "Males",
    y = "Females"
  ) +
  geom_text(data = top_male) +
  geom_text(data = top_female) +
  theme_bw()

# Keep the two periods being compared, fold subgroup and period into a single
# key (hyphens replaced by underscores, e.g. "Female_2000_2005"), spread that
# key into one column per subgroup/period, and compute the change in life
# expectancy between the two periods for each sex
subdata2 <- life_expectancy %>%
  filter(Year %in% c("1985-1990", "2000-2005")) %>%
  mutate(Sub_Year = gsub("-", "_", paste(Subgroup, Year, sep = "_"))) %>%
  select(-Subgroup, -Year) %>%
  spread(key = Sub_Year, value = Value) %>%
  mutate(
    diff_Female = Female_2000_2005 - Female_1985_1990,
    diff_Male = Male_2000_2005 - Male_1985_1990
  )

# Preview the result
head(subdata2)
## Country.or.Area
## 1 Afghanistan
## 2 Albania
## 3 Algeria
## 4 Angola
## 5 Argentina
## 6 Armenia
## Source Unit
## 1 UNPD_World Population Prospects_2006 (International estimate) Years
## 2 UNPD_World Population Prospects_2006 (International estimate) Years
## 3 UNPD_World Population Prospects_2006 (International estimate) Years
## 4 UNPD_World Population Prospects_2006 (International estimate) Years
## 5 UNPD_World Population Prospects_2006 (International estimate) Years
## 6 UNPD_World Population Prospects_2006 (International estimate) Years
## Value.Footnotes Female_1985_1990 Female_2000_2005 Male_1985_1990
## 1 NA 41 42 41
## 2 NA 75 79 69
## 3 NA 67 72 65
## 4 NA 42 43 38
## 5 NA 75 78 68
## 6 NA 71 75 66
## Male_2000_2005 diff_Female diff_Male
## 1 42 1 1
## 2 73 4 4
## 3 70 5 5
## 4 39 1 1
## 5 71 3 3
## 6 68 4 2
# First version of the change plot: male change on x, female change on y,
# with a dashed y = x reference line, symmetric -25..25 axes and labels
ggplot(
  data = subdata2,
  mapping = aes(x = diff_Male, y = diff_Female, label = Country.or.Area)
) +
  geom_point(
    shape = 21,
    size = 5,
    alpha = .55,
    fill = "chartreuse3",
    colour = "white"
  ) +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  theme_bw() +
  labs(
    x = "Males",
    y = "Females",
    title = "Life Expectancy at Birth by Country in Years",
    subtitle = "Difference between 1985-1990 and 2000-2005. Average.",
    caption = "Source: United Nations Statistics Division"
  ) +
  scale_x_continuous(limits = c(-25, 25)) +
  scale_y_continuous(limits = c(-25, 25))

# Change plot with solid vertical and horizontal lines through zero, which
# separate increases from decreases on each axis
ggplot(
  data = subdata2,
  mapping = aes(x = diff_Male, y = diff_Female, label = Country.or.Area)
) +
  geom_point(
    shape = 21,
    size = 5,
    alpha = .55,
    fill = "chartreuse3",
    colour = "white"
  ) +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  labs(
    x = "Males",
    y = "Females",
    title = "Life Expectancy at Birth by Country",
    subtitle = "Years. Difference between 1985-1990 and 2000-2005. Average.",
    caption = "Source: United Nations Statistics Division"
  ) +
  scale_x_continuous(limits = c(-25, 25)) +
  scale_y_continuous(limits = c(-25, 25))

# Subsetting data to obtain countries of interest.
# arrange() sorts in ascending order, so `top` holds the three rows with the
# smallest combined change (diff_Male + diff_Female) and `bottom` the three
# rows with the largest combined change.
top <- subdata2 %>%
  arrange(diff_Male + diff_Female) %>%
  head(3)
# arrange() always places NA last, so rows with a missing difference are
# dropped first; otherwise tail(3) would select them instead of the rows
# with the largest combined change.
bottom <- subdata2 %>%
  filter(!is.na(diff_Male + diff_Female)) %>%
  arrange(diff_Male + diff_Female) %>%
  tail(3)

# Adding text to the previous plot to label countries of interest.
# Each geom_text() layer is given its own subset as `data`; the label
# aesthetic is inherited from ggplot(), so only those rows are labelled.
ggplot(subdata2, aes(x = diff_Male, y = diff_Female, label = Country.or.Area)) +
  geom_point(
    colour = "white", fill = "chartreuse3",
    shape = 21, alpha = .55, size = 5
  ) +
  geom_abline(intercept = 0, slope = 1, linetype = 2) +
  scale_x_continuous(limits = c(-25, 25)) +
  scale_y_continuous(limits = c(-25, 25)) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_vline(xintercept = 0, linetype = 2) +
  labs(
    title = "Life Expectancy at Birth by Country",
    subtitle = "Years. Difference between 1985-1990 and 2000-2005. Average.",
    caption = "Source: United Nations Statistics Division",
    x = "Males",
    y = "Females"
  ) +
  geom_text(data = top) +
  geom_text(data = bottom) +
  theme_bw()
