R Markdown

******* FIRST PART *********

setwd("C:/Users/23043/Dropbox/UDLAP/Cursos/2022 Primavera/Pensiones y SS/Datos")


library(doBy)
## Warning: package 'doBy' was built under R version 4.1.2
library(ggplot2)
library(reshape2)

pop<-read.csv("poblacion.csv")

Lets explore the variables we have:

names(pop)
## [1] "ANIO"      "ENTIDAD"   "CVE_GEO"   "EDAD"      "SEXO"      "POBLACION"
head(pop)
##   ANIO            ENTIDAD CVE_GEO EDAD    SEXO POBLACION
## 1 1950 República Mexicana       0    0 Hombres    572103
## 2 1950 República Mexicana       0    0 Mujeres    559162
## 3 1950 República Mexicana       0    1 Hombres    514540
## 4 1950 República Mexicana       0    1 Mujeres    505269
## 5 1950 República Mexicana       0    2 Hombres    478546
## 6 1950 República Mexicana       0    2 Mujeres    469322

For this example, lets work with information at the national level, for this, we have to restrict our data to those rows where the code for the state is 0.

pop<-pop[which(pop$CVE_GEO==0),]

Vamos a explorar un poco la poblacion en distintos momentos. Primero estimaremos las proporciones de hombres y de mujeres en cada edad y anio.

poptot<-summaryBy(POBLACION ~ ANIO, FUN=sum,data=pop)

pop<-merge(pop,poptot,by="ANIO")
remove(poptot)

pop$prop<-ifelse(pop$SEXO=="Hombres",
                 -1*pop$POBLACION/pop$POBLACION.sum,
                 pop$POBLACION/pop$POBLACION.sum)

Ahora vamos a dibujar cuatro piramides, para: - 1950 - 1995 - 2020 - 2050

pop$lon<-0
data_pyr<-pop

p<-ggplot(data=subset(data_pyr,ANIO==1950),
          mapping=aes(x=EDAD,ymin=lon,ymax=prop*100, fill=SEXO))

p + geom_ribbon(alpha=0.5)+
  scale_y_continuous(labels=abs,limits=max(data_pyr$prop,na.rm=T)*c(-1.1,1.1)*100)+
  scale_x_continuous(breaks=seq(0,100,10))+
  scale_fill_manual(values=c("red","blue"))+
  guides(fill=guide_legend(reverse=T))+
  labs(x="Age",y="Percent of Population",
       title="Age distribution of the population of Mexico: 1950",
       subtitle="Age is top-coded at 100",
       caption="Data: CONAPO",
       fill="Sex")+
  theme_minimal()+
  theme(legend.position="bottom",
        plot.title=element_text(size=rel(0.8), face="bold"),
        plot.subtitle=element_text(size=rel(0.8)),
        plot.caption=element_text(size=rel(0.8)),
        axis.text.y=element_text(size=rel(0.8)),
        axis.text.x=element_text(size=rel(0.8)))+
  coord_flip() 

La segunda

p<-ggplot(data=subset(data_pyr,ANIO==1995),
          mapping=aes(x=EDAD,ymin=lon,ymax=prop*100, fill=SEXO))

p + geom_ribbon(alpha=0.5)+
  scale_y_continuous(labels=abs,limits=max(data_pyr$prop,na.rm=T)*c(-1.1,1.1)*100)+
  scale_x_continuous(breaks=seq(0,100,10))+
  scale_fill_manual(values=c("red","blue"))+
  guides(fill=guide_legend(reverse=T))+
  labs(x="Age",y="Percent of Population",
       title="Age distribution of the population of Mexico: 1995",
       subtitle="Age is top-coded at 100",
       caption="Data: CONAPO",
       fill="Sex")+
  theme_minimal()+
  theme(legend.position="bottom",
        plot.title=element_text(size=rel(0.8), face="bold"),
        plot.subtitle=element_text(size=rel(0.8)),
        plot.caption=element_text(size=rel(0.8)),
        axis.text.y=element_text(size=rel(0.8)),
        axis.text.x=element_text(size=rel(0.8)))+
  coord_flip() 

La tercera

p<-ggplot(data=subset(data_pyr,ANIO==2020),
          mapping=aes(x=EDAD,ymin=lon,ymax=prop*100, fill=SEXO))

p + geom_ribbon(alpha=0.5)+
  scale_y_continuous(labels=abs,limits=max(data_pyr$prop,na.rm=T)*c(-1.1,1.1)*100)+
  scale_x_continuous(breaks=seq(0,100,10))+
  scale_fill_manual(values=c("red","blue"))+
  guides(fill=guide_legend(reverse=T))+
  labs(x="Age",y="Percent of Population",
       title="Age distribution of the population of Mexico: 2020",
       subtitle="Age is top-coded at 100",
       caption="Data: CONAPO",
       fill="Sex")+
  theme_minimal()+
  theme(legend.position="bottom",
        plot.title=element_text(size=rel(0.8), face="bold"),
        plot.subtitle=element_text(size=rel(0.8)),
        plot.caption=element_text(size=rel(0.8)),
        axis.text.y=element_text(size=rel(0.8)),
        axis.text.x=element_text(size=rel(0.8)))+
  coord_flip() 

La cuarta

p<-ggplot(data=subset(data_pyr,ANIO==2050),
          mapping=aes(x=EDAD,ymin=lon,ymax=prop*100, fill=SEXO))

p + geom_ribbon(alpha=0.5)+
  scale_y_continuous(labels=abs,limits=max(data_pyr$prop,na.rm=T)*c(-1.1,1.1)*100)+
  scale_x_continuous(breaks=seq(0,100,10))+
  scale_fill_manual(values=c("red","blue"))+
  guides(fill=guide_legend(reverse=T))+
  labs(x="Age",y="Percent of Population",
       title="Age distribution of the population of Mexico: 2050",
       subtitle="Age is top-coded at 100",
       caption="Data: CONAPO",
       fill="Sex")+
  theme_minimal()+
  theme(legend.position="bottom",
        plot.title=element_text(size=rel(0.8), face="bold"),
        plot.subtitle=element_text(size=rel(0.8)),
        plot.caption=element_text(size=rel(0.8)),
        axis.text.y=element_text(size=rel(0.8)),
        axis.text.x=element_text(size=rel(0.8)))+
  coord_flip() 

remove(pop,p,data_pyr)

******* FIRST PART *********

In this second part we will use a different dataset. Now we will use the dem_ind, a dataset including basic demographic indicators of interest.

data<-read.csv("ind_dem.csv")

names(data)
##  [1] "RENGLON"       "ANIO"          "ENTIDAD"       "CVE_GEO"      
##  [5] "CRE_NAT"       "CRE_SOC"       "CRE_TOT"       "DEF"          
##  [9] "EDAD_MED"      "EMI_EST"       "EMI_INT"       "EVH"          
## [13] "EVM"           "EV"            "HOM_MIT_AÑO"   "IND_ENV"      
## [17] "INM_EST"       "INM_INT"       "MIG_NET_EST"   "MIG_NET_INT"  
## [21] "MUJ_MIT_AÑO"   "NAC"           "POB_MIT_AÑO"   "MUJ_12_14"    
## [25] "MUJ_15_17"     "MUJ_15_19"     "MUJ_15_29"     "MUJ_15_49"    
## [29] "MUJ_18_24"     "MUJ_20_24"     "MUJ_3_5"       "MUJ_30_64"    
## [33] "MUJ_6_11"      "MUJ_65_MAS"    "HOM_12_14"     "HOM_15_17"    
## [37] "HOM_15_19"     "HOM_15_29"     "HOM_15_49"     "HOM_18_24"    
## [41] "HOM_20_24"     "HOM_3_5"       "HOM_30_64"     "HOM_6_11"     
## [45] "HOM_65_MAS"    "POB_12_14"     "POB_15_17"     "POB_15_19"    
## [49] "POB_15_29"     "POB_15_49"     "POB_18_24"     "POB_20_24"    
## [53] "POB_3_5"       "POB_30_64"     "POB_6_11"      "POB_65_MAS"   
## [57] "RAZ_DEP_ADU"   "RAZ_DEP_INF"   "RAZ_DEP"       "T_BRU_MOR"    
## [61] "T_BRU_NAT"     "T_CRE_NAT"     "T_CRE_SOC"     "T_CRE_TOT"    
## [65] "T_EMI_EST"     "T_INM_EST"     "T_MIG_NET_EST" "T_MIG_NET_INT"
## [69] "TMIH"          "TMIM"          "TMI"           "TEF_ADO"      
## [73] "TGF"

I will select Mexico

mex<- data[which(data$CVE_GEO==0),]

Lets start with the percentage of population ages 65 and older:

mex$pp65m<- mex$POB_65_MAS/mex$POB_MIT_AÑO*100

ggplot(mex,aes(x=ANIO,y=pp65m))+geom_line()+
  labs(title="Mexico:% Population 65 and older, 1950-2050",
       caption="Data: CONAPO")+
       xlab("Year")+ylab("% Population")

Now, growth rates of population by age groups. Lets estimate this for ages <15, 15-64 and 65+

# Growth rates for <15
mex$Pob<-mex$POB_MIT_AÑO-(mex$POB_15_29+mex$POB_30_64)
mex$r1<-0
for (i in 2:100){
  mex[i,76]<-log(mex[i,75]/mex[i-1,75])*100
}
# Growth rates for 15-64
mex$Pob<-mex$POB_15_29+mex$POB_30_64
mex$r2<-0
for (i in 2:100){
  mex[i,77]<-log(mex[i,75]/mex[i-1,75])*100
}
# Growth rates for 15-64
mex$Pob<-mex$POB_65_MAS
mex$r3<-0
for (i in 2:100){
  mex[i,78]<-log(mex[i,75]/mex[i-1,75])*100
}

mex_g<-mex[,c("ANIO","r1","r2","r3")]
mex_g<-melt(mex_g,id="ANIO")
mex_g$variable2<-0
mex_g$variable2<-ifelse(mex_g$variable=="r1","P0-14",
              ifelse(mex_g$variable=="r2","P15-64","P65+"))
ggplot(mex_g,aes(x=ANIO,y=value,color=variable2))+
  geom_line()+
  labs(title="Mexico:% Growth rates by age groups, 1951-2050",
       subtitle="Exponential growth",
       caption="Data: CONAPO",
       x="Year",y="Rate(%)",
       colour="Age groups")+xlim(1951,2049)
## Warning: Removed 6 row(s) containing missing values (geom_path).

Now we estimate the dependency ratio:

# Dependency ratio
mex$P0_14<-mex$POB_MIT_AÑO-(mex$POB_15_29+mex$POB_30_64+
                              mex$POB_65_MAS)
mex$P15_64<-mex$POB_15_29+mex$POB_30_64
mex$P65mas<- mex$POB_65_MAS

mex$TDR<-((mex$P0_14+mex$P65mas)/mex$P15_64)*100

ggplot(mex)+geom_line(aes(x=ANIO,y=TDR),colour="red")+
  labs(title="Mexico: Dependency Ratio, 1950-2050",
       caption="Data: CONAPO")+
  xlab("Year")+ylab("Ratio")

And the index of ageing:

mex$IA<-(mex$P65mas/mex$P0_14)*100

ggplot(mex)+geom_line(aes(x=ANIO,y=IA),colour="red")+
  labs(title="Mexico: Ageing Index, 1950-2050",
       caption="Data: CONAPO")+
  xlab("Year")+ylab("Index")

Now, lets explore some other associations between variables. For this, we will return to our original dataset “data” and create a new variable with labels that we will use later:

remove(mex,mex_g)

data$lab<-NA
data$lab<-ifelse(data$CVE_GEO==0,"NAC",data$lab)
data$lab<-ifelse(data$CVE_GEO==1,"AGS",data$lab)
data$lab<-ifelse(data$CVE_GEO==2,"BCN",data$lab)
data$lab<-ifelse(data$CVE_GEO==3,"BCS",data$lab)
data$lab<-ifelse(data$CVE_GEO==4,"CAM",data$lab)
data$lab<-ifelse(data$CVE_GEO==5,"COA",data$lab)
data$lab<-ifelse(data$CVE_GEO==6,"COL",data$lab)
data$lab<-ifelse(data$CVE_GEO==7,"CHI",data$lab)
data$lab<-ifelse(data$CVE_GEO==8,"CHH",data$lab)
data$lab<-ifelse(data$CVE_GEO==9,"CMX",data$lab)
data$lab<-ifelse(data$CVE_GEO==10,"DGO",data$lab)
data$lab<-ifelse(data$CVE_GEO==11,"GTO",data$lab)
data$lab<-ifelse(data$CVE_GEO==12,"GRO",data$lab)
data$lab<-ifelse(data$CVE_GEO==13,"HGO",data$lab)
data$lab<-ifelse(data$CVE_GEO==14,"JAL",data$lab)
data$lab<-ifelse(data$CVE_GEO==15,"MEX",data$lab)
data$lab<-ifelse(data$CVE_GEO==16,"MIC",data$lab)
data$lab<-ifelse(data$CVE_GEO==17,"MOR",data$lab)
data$lab<-ifelse(data$CVE_GEO==18,"NAY",data$lab)
data$lab<-ifelse(data$CVE_GEO==19,"NLN",data$lab)
data$lab<-ifelse(data$CVE_GEO==20,"OAX",data$lab)
data$lab<-ifelse(data$CVE_GEO==21,"PUE",data$lab)
data$lab<-ifelse(data$CVE_GEO==22,"QRT",data$lab)
data$lab<-ifelse(data$CVE_GEO==23,"QTR",data$lab)
data$lab<-ifelse(data$CVE_GEO==24,"SLP",data$lab)
data$lab<-ifelse(data$CVE_GEO==25,"SIN",data$lab)
data$lab<-ifelse(data$CVE_GEO==26,"SON",data$lab)
data$lab<-ifelse(data$CVE_GEO==27,"TAB",data$lab)
data$lab<-ifelse(data$CVE_GEO==28,"TAM",data$lab)
data$lab<-ifelse(data$CVE_GEO==29,"TLX",data$lab)
data$lab<-ifelse(data$CVE_GEO==30,"VER",data$lab)
data$lab<-ifelse(data$CVE_GEO==31,"YUC",data$lab)
data$lab<-ifelse(data$CVE_GEO==32,"ZAC",data$lab)

data$P0_14<-data$POB_MIT_AÑO-(data$POB_15_29+data$POB_30_64+
                              data$POB_65_MAS)
data$P15_64<-data$POB_15_29+data$POB_30_64
data$P65mas<- data$POB_65_MAS

data$AI<- (data$P65mas/data$P0_14)*100

data$TDR<-((data$P0_14+data$P65mas)/data$P15_64)*100

Now, lets explore some associations between the life expectancy and TFR in 1970:

# select for 1970
data1970<-data[which(data$ANIO==1970),]

ggplot(data1970,aes(x=EV,y=TGF))+geom_point()

We can make this plot look better

# select for 1970
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.1.2
ggplot(data1970,aes(x=EV,y=TGF))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Life expectancy at birth and 
       Total Fertility Rate, 1970",
       caption="Data: CONAPO")+
  xlab("Life Expectancy")+ylab("Total Fertility Rate")

Now, lets plot the TDR and TFR

ggplot(data1970,aes(x=TDR,y=TGF))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Dependency Ratio and 
       Total Fertility Rate, 1970",
       caption="Data: CONAPO")+
  xlab("Dependency Ratio")+ylab("Total Fertility Rate")

Now, lets plot the Ageing Index and TFR

ggplot(data1970,aes(x=AI,y=TGF))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Ageing Index and 
       Total Fertility Rate, 1970",
       caption="Data: CONAPO")+
  xlab("Ageing Index")+ylab("Total Fertility Rate")

We repeat for 2000 and 2050

data2000<-data[which(data$ANIO==2000),]

ggplot(data2000,aes(x=EV,y=TGF))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Life expectancy at birth and 
       Total Fertility Rate, 2000",
       caption="Data: CONAPO")+
  xlab("Life Expectancy")+ylab("Total Fertility Rate")

ggplot(data2000,aes(x=TDR,y=TGF))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Dependency Ratio and 
       Total Fertility Rate, 2000",
       caption="Data: CONAPO")+
  xlab("Dependency Ratio")+ylab("Total Fertility Rate")

ggplot(data2000,aes(x=AI,y=TGF))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Ageing Index and 
       Total Fertility Rate, 2000",
       caption="Data: CONAPO")+
  xlab("Ageing Index")+ylab("Total Fertility Rate")

data2050<-data[which(data$ANIO==2050),]

ggplot(data2050,aes(x=EV,y=TGF))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Life expectancy at birth and 
       Total Fertility Rate, 2050",
       caption="Data: CONAPO")+
  xlab("Life Expectancy")+ylab("Total Fertility Rate")

ggplot(data2050,aes(x=TDR,y=TGF))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Dependency Ratio and 
       Total Fertility Rate, 2050",
       caption="Data: CONAPO")+
  xlab("Dependency Ratio")+ylab("Total Fertility Rate")

ggplot(data2050,aes(x=AI,y=TGF))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Ageing Index and 
       Total Fertility Rate, 2050",
       caption="Data: CONAPO")+
  xlab("Ageing Index")+ylab("Total Fertility Rate")
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ggplot(data1970,aes(x=TDR,y=AI))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Dependency Ratio and Ageing INdex, 1970",
       caption="Data: CONAPO")+
  xlab("Dependency Ratio")+ylab("Ageing Index")

ggplot(data2000,aes(x=TDR,y=AI))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Dependency Ratio and Ageing INdex, 2000",
       caption="Data: CONAPO")+
  xlab("Dependency Ratio")+ylab("Ageing Index")

ggplot(data2050,aes(x=TDR,y=AI))+geom_point()+
  geom_text_repel(aes(label=lab),size=2)+
  labs(title="Dependency Ratio and Ageing INdex, 2050",
       caption="Data: CONAPO")+
  xlab("Dependency Ratio")+ylab("Ageing Index")
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps