******* FIRST PART *********
setwd("C:/Users/23043/Dropbox/UDLAP/Cursos/2022 Primavera/Pensiones y SS/Datos")
library(doBy)
## Warning: package 'doBy' was built under R version 4.1.2
library(ggplot2)
library(reshape2)
pop<-read.csv("poblacion.csv")
Lets explore the variables we have:
names(pop)
## [1] "ANIO" "ENTIDAD" "CVE_GEO" "EDAD" "SEXO" "POBLACION"
head(pop)
## ANIO ENTIDAD CVE_GEO EDAD SEXO POBLACION
## 1 1950 República Mexicana 0 0 Hombres 572103
## 2 1950 República Mexicana 0 0 Mujeres 559162
## 3 1950 República Mexicana 0 1 Hombres 514540
## 4 1950 República Mexicana 0 1 Mujeres 505269
## 5 1950 República Mexicana 0 2 Hombres 478546
## 6 1950 República Mexicana 0 2 Mujeres 469322
For this example, lets work with information at the national level, for this, we have to restrict our data to those rows where the code for the state is 0.
pop<-pop[which(pop$CVE_GEO==0),]
Vamos a explorar un poco la poblacion en distintos momentos. Primero estimaremos las proporciones de hombres y de mujeres en cada edad y anio.
poptot<-summaryBy(POBLACION ~ ANIO, FUN=sum,data=pop)
pop<-merge(pop,poptot,by="ANIO")
remove(poptot)
pop$prop<-ifelse(pop$SEXO=="Hombres",
-1*pop$POBLACION/pop$POBLACION.sum,
pop$POBLACION/pop$POBLACION.sum)
Ahora vamos a dibujar cuatro piramides, para: - 1950 - 1995 - 2020 - 2050
pop$lon<-0
data_pyr<-pop
p<-ggplot(data=subset(data_pyr,ANIO==1950),
mapping=aes(x=EDAD,ymin=lon,ymax=prop*100, fill=SEXO))
p + geom_ribbon(alpha=0.5)+
scale_y_continuous(labels=abs,limits=max(data_pyr$prop,na.rm=T)*c(-1.1,1.1)*100)+
scale_x_continuous(breaks=seq(0,100,10))+
scale_fill_manual(values=c("red","blue"))+
guides(fill=guide_legend(reverse=T))+
labs(x="Age",y="Percent of Population",
title="Age distribution of the population of Mexico: 1950",
subtitle="Age is top-coded at 100",
caption="Data: CONAPO",
fill="Sex")+
theme_minimal()+
theme(legend.position="bottom",
plot.title=element_text(size=rel(0.8), face="bold"),
plot.subtitle=element_text(size=rel(0.8)),
plot.caption=element_text(size=rel(0.8)),
axis.text.y=element_text(size=rel(0.8)),
axis.text.x=element_text(size=rel(0.8)))+
coord_flip()
La segunda
p<-ggplot(data=subset(data_pyr,ANIO==1995),
mapping=aes(x=EDAD,ymin=lon,ymax=prop*100, fill=SEXO))
p + geom_ribbon(alpha=0.5)+
scale_y_continuous(labels=abs,limits=max(data_pyr$prop,na.rm=T)*c(-1.1,1.1)*100)+
scale_x_continuous(breaks=seq(0,100,10))+
scale_fill_manual(values=c("red","blue"))+
guides(fill=guide_legend(reverse=T))+
labs(x="Age",y="Percent of Population",
title="Age distribution of the population of Mexico: 1995",
subtitle="Age is top-coded at 100",
caption="Data: CONAPO",
fill="Sex")+
theme_minimal()+
theme(legend.position="bottom",
plot.title=element_text(size=rel(0.8), face="bold"),
plot.subtitle=element_text(size=rel(0.8)),
plot.caption=element_text(size=rel(0.8)),
axis.text.y=element_text(size=rel(0.8)),
axis.text.x=element_text(size=rel(0.8)))+
coord_flip()
La tercera
p<-ggplot(data=subset(data_pyr,ANIO==2020),
mapping=aes(x=EDAD,ymin=lon,ymax=prop*100, fill=SEXO))
p + geom_ribbon(alpha=0.5)+
scale_y_continuous(labels=abs,limits=max(data_pyr$prop,na.rm=T)*c(-1.1,1.1)*100)+
scale_x_continuous(breaks=seq(0,100,10))+
scale_fill_manual(values=c("red","blue"))+
guides(fill=guide_legend(reverse=T))+
labs(x="Age",y="Percent of Population",
title="Age distribution of the population of Mexico: 2020",
subtitle="Age is top-coded at 100",
caption="Data: CONAPO",
fill="Sex")+
theme_minimal()+
theme(legend.position="bottom",
plot.title=element_text(size=rel(0.8), face="bold"),
plot.subtitle=element_text(size=rel(0.8)),
plot.caption=element_text(size=rel(0.8)),
axis.text.y=element_text(size=rel(0.8)),
axis.text.x=element_text(size=rel(0.8)))+
coord_flip()
La cuarta
p<-ggplot(data=subset(data_pyr,ANIO==2050),
mapping=aes(x=EDAD,ymin=lon,ymax=prop*100, fill=SEXO))
p + geom_ribbon(alpha=0.5)+
scale_y_continuous(labels=abs,limits=max(data_pyr$prop,na.rm=T)*c(-1.1,1.1)*100)+
scale_x_continuous(breaks=seq(0,100,10))+
scale_fill_manual(values=c("red","blue"))+
guides(fill=guide_legend(reverse=T))+
labs(x="Age",y="Percent of Population",
title="Age distribution of the population of Mexico: 2050",
subtitle="Age is top-coded at 100",
caption="Data: CONAPO",
fill="Sex")+
theme_minimal()+
theme(legend.position="bottom",
plot.title=element_text(size=rel(0.8), face="bold"),
plot.subtitle=element_text(size=rel(0.8)),
plot.caption=element_text(size=rel(0.8)),
axis.text.y=element_text(size=rel(0.8)),
axis.text.x=element_text(size=rel(0.8)))+
coord_flip()
remove(pop,p,data_pyr)
******* FIRST PART *********
In this second part we will use a different dataset. Now we will use the dem_ind, a dataset including basic demographic indicators of interest.
data<-read.csv("ind_dem.csv")
names(data)
## [1] "RENGLON" "ANIO" "ENTIDAD" "CVE_GEO"
## [5] "CRE_NAT" "CRE_SOC" "CRE_TOT" "DEF"
## [9] "EDAD_MED" "EMI_EST" "EMI_INT" "EVH"
## [13] "EVM" "EV" "HOM_MIT_AÑO" "IND_ENV"
## [17] "INM_EST" "INM_INT" "MIG_NET_EST" "MIG_NET_INT"
## [21] "MUJ_MIT_AÑO" "NAC" "POB_MIT_AÑO" "MUJ_12_14"
## [25] "MUJ_15_17" "MUJ_15_19" "MUJ_15_29" "MUJ_15_49"
## [29] "MUJ_18_24" "MUJ_20_24" "MUJ_3_5" "MUJ_30_64"
## [33] "MUJ_6_11" "MUJ_65_MAS" "HOM_12_14" "HOM_15_17"
## [37] "HOM_15_19" "HOM_15_29" "HOM_15_49" "HOM_18_24"
## [41] "HOM_20_24" "HOM_3_5" "HOM_30_64" "HOM_6_11"
## [45] "HOM_65_MAS" "POB_12_14" "POB_15_17" "POB_15_19"
## [49] "POB_15_29" "POB_15_49" "POB_18_24" "POB_20_24"
## [53] "POB_3_5" "POB_30_64" "POB_6_11" "POB_65_MAS"
## [57] "RAZ_DEP_ADU" "RAZ_DEP_INF" "RAZ_DEP" "T_BRU_MOR"
## [61] "T_BRU_NAT" "T_CRE_NAT" "T_CRE_SOC" "T_CRE_TOT"
## [65] "T_EMI_EST" "T_INM_EST" "T_MIG_NET_EST" "T_MIG_NET_INT"
## [69] "TMIH" "TMIM" "TMI" "TEF_ADO"
## [73] "TGF"
I will select Mexico
mex<- data[which(data$CVE_GEO==0),]
Lets start with the percentage of population ages 65 and older:
mex$pp65m<- mex$POB_65_MAS/mex$POB_MIT_AÑO*100
ggplot(mex,aes(x=ANIO,y=pp65m))+geom_line()+
labs(title="Mexico:% Population 65 and older, 1950-2050",
caption="Data: CONAPO")+
xlab("Year")+ylab("% Population")
Now, growth rates of population by age groups. Lets estimate this for ages <15, 15-64 and 65+
# Growth rates for <15
mex$Pob<-mex$POB_MIT_AÑO-(mex$POB_15_29+mex$POB_30_64)
mex$r1<-0
for (i in 2:100){
mex[i,76]<-log(mex[i,75]/mex[i-1,75])*100
}
# Growth rates for 15-64
mex$Pob<-mex$POB_15_29+mex$POB_30_64
mex$r2<-0
for (i in 2:100){
mex[i,77]<-log(mex[i,75]/mex[i-1,75])*100
}
# Growth rates for 15-64
mex$Pob<-mex$POB_65_MAS
mex$r3<-0
for (i in 2:100){
mex[i,78]<-log(mex[i,75]/mex[i-1,75])*100
}
mex_g<-mex[,c("ANIO","r1","r2","r3")]
mex_g<-melt(mex_g,id="ANIO")
mex_g$variable2<-0
mex_g$variable2<-ifelse(mex_g$variable=="r1","P0-14",
ifelse(mex_g$variable=="r2","P15-64","P65+"))
ggplot(mex_g,aes(x=ANIO,y=value,color=variable2))+
geom_line()+
labs(title="Mexico:% Growth rates by age groups, 1951-2050",
subtitle="Exponential growth",
caption="Data: CONAPO",
x="Year",y="Rate(%)",
colour="Age groups")+xlim(1951,2049)
## Warning: Removed 6 row(s) containing missing values (geom_path).
Now we estimate the dependency ratio:
# Dependency ratio
mex$P0_14<-mex$POB_MIT_AÑO-(mex$POB_15_29+mex$POB_30_64+
mex$POB_65_MAS)
mex$P15_64<-mex$POB_15_29+mex$POB_30_64
mex$P65mas<- mex$POB_65_MAS
mex$TDR<-((mex$P0_14+mex$P65mas)/mex$P15_64)*100
ggplot(mex)+geom_line(aes(x=ANIO,y=TDR),colour="red")+
labs(title="Mexico: Dependency Ratio, 1950-2050",
caption="Data: CONAPO")+
xlab("Year")+ylab("Ratio")
And the index of ageing:
mex$IA<-(mex$P65mas/mex$P0_14)*100
ggplot(mex)+geom_line(aes(x=ANIO,y=IA),colour="red")+
labs(title="Mexico: Ageing Index, 1950-2050",
caption="Data: CONAPO")+
xlab("Year")+ylab("Index")
Now, lets explore some other associations between variables. For this, we will return to our original dataset “data” and create a new variable with labels that we will use later:
remove(mex,mex_g)
data$lab<-NA
data$lab<-ifelse(data$CVE_GEO==0,"NAC",data$lab)
data$lab<-ifelse(data$CVE_GEO==1,"AGS",data$lab)
data$lab<-ifelse(data$CVE_GEO==2,"BCN",data$lab)
data$lab<-ifelse(data$CVE_GEO==3,"BCS",data$lab)
data$lab<-ifelse(data$CVE_GEO==4,"CAM",data$lab)
data$lab<-ifelse(data$CVE_GEO==5,"COA",data$lab)
data$lab<-ifelse(data$CVE_GEO==6,"COL",data$lab)
data$lab<-ifelse(data$CVE_GEO==7,"CHI",data$lab)
data$lab<-ifelse(data$CVE_GEO==8,"CHH",data$lab)
data$lab<-ifelse(data$CVE_GEO==9,"CMX",data$lab)
data$lab<-ifelse(data$CVE_GEO==10,"DGO",data$lab)
data$lab<-ifelse(data$CVE_GEO==11,"GTO",data$lab)
data$lab<-ifelse(data$CVE_GEO==12,"GRO",data$lab)
data$lab<-ifelse(data$CVE_GEO==13,"HGO",data$lab)
data$lab<-ifelse(data$CVE_GEO==14,"JAL",data$lab)
data$lab<-ifelse(data$CVE_GEO==15,"MEX",data$lab)
data$lab<-ifelse(data$CVE_GEO==16,"MIC",data$lab)
data$lab<-ifelse(data$CVE_GEO==17,"MOR",data$lab)
data$lab<-ifelse(data$CVE_GEO==18,"NAY",data$lab)
data$lab<-ifelse(data$CVE_GEO==19,"NLN",data$lab)
data$lab<-ifelse(data$CVE_GEO==20,"OAX",data$lab)
data$lab<-ifelse(data$CVE_GEO==21,"PUE",data$lab)
data$lab<-ifelse(data$CVE_GEO==22,"QRT",data$lab)
data$lab<-ifelse(data$CVE_GEO==23,"QTR",data$lab)
data$lab<-ifelse(data$CVE_GEO==24,"SLP",data$lab)
data$lab<-ifelse(data$CVE_GEO==25,"SIN",data$lab)
data$lab<-ifelse(data$CVE_GEO==26,"SON",data$lab)
data$lab<-ifelse(data$CVE_GEO==27,"TAB",data$lab)
data$lab<-ifelse(data$CVE_GEO==28,"TAM",data$lab)
data$lab<-ifelse(data$CVE_GEO==29,"TLX",data$lab)
data$lab<-ifelse(data$CVE_GEO==30,"VER",data$lab)
data$lab<-ifelse(data$CVE_GEO==31,"YUC",data$lab)
data$lab<-ifelse(data$CVE_GEO==32,"ZAC",data$lab)
data$P0_14<-data$POB_MIT_AÑO-(data$POB_15_29+data$POB_30_64+
data$POB_65_MAS)
data$P15_64<-data$POB_15_29+data$POB_30_64
data$P65mas<- data$POB_65_MAS
data$AI<- (data$P65mas/data$P0_14)*100
data$TDR<-((data$P0_14+data$P65mas)/data$P15_64)*100
Now, lets explore some associations between the life expectancy and TFR in 1970:
# select for 1970
data1970<-data[which(data$ANIO==1970),]
ggplot(data1970,aes(x=EV,y=TGF))+geom_point()
We can make this plot look better
# select for 1970
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.1.2
ggplot(data1970,aes(x=EV,y=TGF))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Life expectancy at birth and
Total Fertility Rate, 1970",
caption="Data: CONAPO")+
xlab("Life Expectancy")+ylab("Total Fertility Rate")
Now, lets plot the TDR and TFR
ggplot(data1970,aes(x=TDR,y=TGF))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Dependency Ratio and
Total Fertility Rate, 1970",
caption="Data: CONAPO")+
xlab("Dependency Ratio")+ylab("Total Fertility Rate")
Now, lets plot the Ageing Index and TFR
ggplot(data1970,aes(x=AI,y=TGF))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Ageing Index and
Total Fertility Rate, 1970",
caption="Data: CONAPO")+
xlab("Ageing Index")+ylab("Total Fertility Rate")
We repeat for 2000 and 2050
data2000<-data[which(data$ANIO==2000),]
ggplot(data2000,aes(x=EV,y=TGF))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Life expectancy at birth and
Total Fertility Rate, 2000",
caption="Data: CONAPO")+
xlab("Life Expectancy")+ylab("Total Fertility Rate")
ggplot(data2000,aes(x=TDR,y=TGF))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Dependency Ratio and
Total Fertility Rate, 2000",
caption="Data: CONAPO")+
xlab("Dependency Ratio")+ylab("Total Fertility Rate")
ggplot(data2000,aes(x=AI,y=TGF))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Ageing Index and
Total Fertility Rate, 2000",
caption="Data: CONAPO")+
xlab("Ageing Index")+ylab("Total Fertility Rate")
data2050<-data[which(data$ANIO==2050),]
ggplot(data2050,aes(x=EV,y=TGF))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Life expectancy at birth and
Total Fertility Rate, 2050",
caption="Data: CONAPO")+
xlab("Life Expectancy")+ylab("Total Fertility Rate")
ggplot(data2050,aes(x=TDR,y=TGF))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Dependency Ratio and
Total Fertility Rate, 2050",
caption="Data: CONAPO")+
xlab("Dependency Ratio")+ylab("Total Fertility Rate")
ggplot(data2050,aes(x=AI,y=TGF))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Ageing Index and
Total Fertility Rate, 2050",
caption="Data: CONAPO")+
xlab("Ageing Index")+ylab("Total Fertility Rate")
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggplot(data1970,aes(x=TDR,y=AI))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Dependency Ratio and Ageing INdex, 1970",
caption="Data: CONAPO")+
xlab("Dependency Ratio")+ylab("Ageing Index")
ggplot(data2000,aes(x=TDR,y=AI))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Dependency Ratio and Ageing INdex, 2000",
caption="Data: CONAPO")+
xlab("Dependency Ratio")+ylab("Ageing Index")
ggplot(data2050,aes(x=TDR,y=AI))+geom_point()+
geom_text_repel(aes(label=lab),size=2)+
labs(title="Dependency Ratio and Ageing INdex, 2050",
caption="Data: CONAPO")+
xlab("Dependency Ratio")+ylab("Ageing Index")
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps