# life_expectancy data set link

# Set the repr plot width and height options (both 6) for rendered figures
options(
  repr.plot.width = 6,
  repr.plot.height = 6
)

# Loading packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
# Read the UN life expectancy table from the local CSV file
life_expectancy <- read.csv(
  file = "C:/Users/maiam/Dropbox/PROFESSIONAL DEVELOPMENT/DATA SCIENCE/01_R/Visualizing Inequalities in Life Expectancy//UNdata.csv"
)

# Preview the first six rows of the raw table
head(life_expectancy, n = 6L)
##   Country.or.Area Subgroup      Year
## 1     Afghanistan   Female 2000-2005
## 2     Afghanistan   Female 1995-2000
## 3     Afghanistan   Female 1990-1995
## 4     Afghanistan   Female 1985-1990
## 5     Afghanistan     Male 2000-2005
## 6     Afghanistan     Male 1995-2000
##                                                          Source  Unit
## 1 UNPD_World Population Prospects_2006 (International estimate) Years
## 2 UNPD_World Population Prospects_2006 (International estimate) Years
## 3 UNPD_World Population Prospects_2006 (International estimate) Years
## 4 UNPD_World Population Prospects_2006 (International estimate) Years
## 5 UNPD_World Population Prospects_2006 (International estimate) Years
## 6 UNPD_World Population Prospects_2006 (International estimate) Years
##   Value Value.Footnotes
## 1    42              NA
## 2    42              NA
## 3    42              NA
## 4    41              NA
## 5    42              NA
## 6    42              NA
# Keep the 2000-2005 rows, reduce them to country, subgroup and value,
# then spread the subgroups so each one becomes its own column
subdata <- life_expectancy %>%
  filter(Year == "2000-2005") %>%
  select(Country.or.Area, Subgroup, Value) %>%
  spread(key = Subgroup, value = Value)

# Preview the first six rows of the reshaped table
head(subdata, n = 6L)
##   Country.or.Area Female Male
## 1     Afghanistan     42   42
## 2         Albania     79   73
## 3         Algeria     72   70
## 4          Angola     43   39
## 5       Argentina     78   71
## 6         Armenia     75   68
# Scatter plot of female (y) against male (x) life expectancy
ggplot(data = subdata, mapping = aes(x = Male, y = Female)) +
  geom_point()

# Same scatter plot with a y = x reference line and both axes shown from 35 to 85
ggplot(data = subdata, mapping = aes(x = Male, y = Female)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0) +
  coord_cartesian(xlim = c(35, 85), ylim = c(35, 85))

# Styled version of the scatter plot: filled semi-transparent points,
# a dashed y = x reference line, fixed axis limits, and titles
ggplot(data = subdata, mapping = aes(x = Male, y = Female)) +
  geom_point(
    shape = 21,
    size = 5,
    alpha = 0.55,
    fill = "chartreuse3",
    color = "white"
  ) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  scale_x_continuous(limits = c(35, 85)) +
  scale_y_continuous(limits = c(35, 85)) +
  labs(
    x = "Males",
    y = "Females",
    title = "Life Expectancy at Birth by Country",
    subtitle = "Years. Period: 2000-2005. Average.",
    caption = "Source: United Nations Statistics Division"
  )

# Subsetting data to obtain countries of interest.
# arrange() sorts in ascending order, so:
#   top_male   - the three rows with the smallest (Male - Female) value
#   top_female - the three rows with the smallest (Female - Male) value
top_male <- subdata %>%
  arrange(Male - Female) %>%
  head(3)
top_female <- subdata %>%
  arrange(Female - Male) %>%
  head(3)

# Adding text to the previous plot to label countries of interest.
# The label aesthetic is declared once in ggplot(); each geom_text() layer is
# given its own subset so that only those countries are labelled instead of
# every point in subdata.
ggplot(subdata, aes(x = Male, y = Female, label = Country.or.Area)) +
  geom_point(colour = "white", fill = "chartreuse3", shape = 21,
             alpha = .55, size = 5) +
  geom_abline(intercept = 0, slope = 1, linetype = 2) +
  scale_x_continuous(limits = c(35, 85)) +
  scale_y_continuous(limits = c(35, 85)) +
  labs(title = "Life Expectancy at Birth by Country",
       subtitle = "Years. Period: 2000-2005. Average.",
       caption = "Source: United Nations Statistics Division",
       x = "Males",
       y = "Females") +
  geom_text(data = top_male) +
  geom_text(data = top_female) +
  theme_bw()

# Keep the 1985-1990 and 2000-2005 rows, build one column per
# subgroup/period combination (e.g. Female_1985_1990), and compute the
# change between the two periods for each sex
subdata2 <- life_expectancy %>%
  filter(Year %in% c("1985-1990", "2000-2005")) %>%
  mutate(Sub_Year = gsub("-", "_", paste(Subgroup, Year, sep = "_"))) %>%
  select(-Subgroup, -Year) %>%
  spread(key = Sub_Year, value = Value) %>%
  mutate(
    diff_Female = Female_2000_2005 - Female_1985_1990,
    diff_Male = Male_2000_2005 - Male_1985_1990
  )

# Preview the first six rows of the reshaped table
head(subdata2, n = 6L)
##   Country.or.Area
## 1     Afghanistan
## 2         Albania
## 3         Algeria
## 4          Angola
## 5       Argentina
## 6         Armenia
##                                                          Source  Unit
## 1 UNPD_World Population Prospects_2006 (International estimate) Years
## 2 UNPD_World Population Prospects_2006 (International estimate) Years
## 3 UNPD_World Population Prospects_2006 (International estimate) Years
## 4 UNPD_World Population Prospects_2006 (International estimate) Years
## 5 UNPD_World Population Prospects_2006 (International estimate) Years
## 6 UNPD_World Population Prospects_2006 (International estimate) Years
##   Value.Footnotes Female_1985_1990 Female_2000_2005 Male_1985_1990
## 1              NA               41               42             41
## 2              NA               75               79             69
## 3              NA               67               72             65
## 4              NA               42               43             38
## 5              NA               75               78             68
## 6              NA               71               75             66
##   Male_2000_2005 diff_Female diff_Male
## 1             42           1         1
## 2             73           4         4
## 3             70           5         5
## 4             39           1         1
## 5             71           3         3
## 6             68           4         2
# First version of the difference plot: change for females (y) against change
# for males (x), with a dashed y = x reference line, fixed axes and titles
ggplot(
  data = subdata2,
  mapping = aes(x = diff_Male, y = diff_Female, label = Country.or.Area)
) +
  geom_point(
    shape = 21,
    size = 5,
    alpha = 0.55,
    fill = "chartreuse3",
    color = "white"
  ) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  scale_x_continuous(limits = c(-25, 25)) +
  scale_y_continuous(limits = c(-25, 25)) +
  labs(
    x = "Males",
    y = "Females",
    title = "Life Expectancy at Birth by Country in Years",
    subtitle = "Difference between 1985-1990 and 2000-2005. Average.",
    caption = "Source: United Nations Statistics Division"
  ) +
  theme_bw()

# Difference plot with solid vertical and horizontal lines through zero
# in addition to the dashed y = x reference line
ggplot(
  data = subdata2,
  mapping = aes(x = diff_Male, y = diff_Female, label = Country.or.Area)
) +
  geom_point(
    shape = 21,
    size = 5,
    alpha = 0.55,
    fill = "chartreuse3",
    color = "white"
  ) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  scale_x_continuous(limits = c(-25, 25)) +
  scale_y_continuous(limits = c(-25, 25)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  labs(
    x = "Males",
    y = "Females",
    title = "Life Expectancy at Birth by Country",
    subtitle = "Years. Difference between 1985-1990 and 2000-2005. Average.",
    caption = "Source: United Nations Statistics Division"
  ) +
  theme_bw()

# Subsetting data to obtain countries of interest.
# Countries are ranked by the combined change (diff_Male + diff_Female) in
# ascending order:
#   top    - the three rows with the smallest combined change
#   bottom - the three rows with the largest combined change
# arrange() always places missing values last, so rows with a missing combined
# change are dropped before ranking; otherwise tail(3) would return those rows
# instead of the largest changes.
ranked <- subdata2 %>%
  filter(!is.na(diff_Male + diff_Female)) %>%
  arrange(diff_Male + diff_Female)
top <- head(ranked, 3)
bottom <- tail(ranked, 3)

# Adding text to the previous plot to label countries of interest.
# The label aesthetic is inherited from ggplot(); each geom_text() layer only
# receives the rows that should be labelled.
ggplot(subdata2, aes(x = diff_Male, y = diff_Female, label = Country.or.Area)) +
  geom_point(colour = "white", fill = "chartreuse3", shape = 21,
             alpha = .55, size = 5) +
  geom_abline(intercept = 0, slope = 1, linetype = 2) +
  scale_x_continuous(limits = c(-25, 25)) +
  scale_y_continuous(limits = c(-25, 25)) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_vline(xintercept = 0, linetype = 2) +
  labs(title = "Life Expectancy at Birth by Country",
       subtitle = "Years. Difference between 1985-1990 and 2000-2005. Average.",
       caption = "Source: United Nations Statistics Division",
       x = "Males",
       y = "Females") +
  geom_text(data = top) +
  geom_text(data = bottom) +
  theme_bw()