Download the dataset from ISTAT (source: http://dati.istat.it/Index.aspx?DataSetCode=DCIS_MORTALITA1&Lang=en#) and read it:
# Attach readr with library() so that a missing package stops the script
# immediately; require() would only return FALSE and let the script continue.
library(readr)

# Load the ISTAT export from the working directory with base read.csv(), which
# yields the dotted column names used further down (e.g. `Biometric.functions`,
# `Age.and.age.class`).
IT_life_exp <- read.csv("bio_regions.csv")

# Preview the first rows to confirm the file was parsed as expected.
head(IT_life_exp)
## ITTER107 Territory TIPO_DATO15 Biometric.functions SEXISTAT1 Gender ETA1
## 1 ITC4 Lombardia SURVIVORS survivors - lx 2 females Y15-19
## 2 ITC4 Lombardia SURVIVORS survivors - lx 2 females Y15-19
## 3 ITC4 Lombardia SURVIVORS survivors - lx 9 total Y15-19
## 4 ITC4 Lombardia SURVIVORS survivors - lx 9 total Y15-19
## 5 ITC4 Lombardia DEATHS deaths - dx 2 females Y15-19
## 6 ITC4 Lombardia DEATHS deaths - dx 2 females Y15-19
## Age.and.age.class TIME Select.time Value Flag.Codes Flags
## 1 15-19 years 2010 2010 99651 NA NA
## 2 15-19 years 2019 2019 99652 NA NA
## 3 15-19 years 2010 2010 99603 NA NA
## 4 15-19 years 2019 2019 99610 NA NA
## 5 15-19 years 2010 2010 75 NA NA
## 6 15-19 years 2019 2019 49 NA NA
Select the columns of interest, filter the observations to analyse, and build a new dataset to be paired:
# library() raises an error when dplyr is not installed, whereas require()
# would only return FALSE and postpone the failure to the first dplyr call.
library(dplyr)

# Keep only the columns needed for the analysis. The dplyr verbs are
# namespace-qualified so they keep working when another attached package
# masks a verb of the same name.
my_data <- IT_life_exp %>%
  dplyr::select(
    Territory,
    TIPO_DATO15,
    Biometric.functions,
    Gender,
    Age.and.age.class,
    TIME,
    Value
  )

# Life-expectancy rows for Gender == "total"; `Value` is renamed to `lifeExp`
# directly inside select(), giving the columns Territory, TIME, lifeExp.
my_data_le <- my_data %>%
  dplyr::filter(
    Gender == "total",
    Biometric.functions == "life expectancy - ex"
  ) %>%
  dplyr::select(Territory, TIME, lifeExp = Value)

head(my_data_le)
## Territory TIME lifeExp
## 1 Lombardia 2010 67.317
## 2 Lombardia 2019 68.987
## 3 Provincia Autonoma Bolzano / Bozen 2010 68.057
## 4 Provincia Autonoma Bolzano / Bozen 2019 69.273
## 5 Provincia Autonoma Trento 2010 67.784
## 6 Provincia Autonoma Trento 2019 69.645
Check the observations by grouping them and counting their frequencies:
# Attach plyr with library() so a missing package is a hard error.
# NOTE: plyr is attached after dplyr, so it masks dplyr verbs of the same name
# (e.g. mutate(), summarise(), count()); dplyr calls must be written dplyr::.
library(plyr)

# Frequency of each biometric function. The call is namespace-qualified because
# dplyr also exports count(), and that one expects a data frame, not a vector.
plyr::count(my_data$Biometric.functions)
## x freq
## 1 deaths - dx 3024
## 2 life expectancy - ex 3024
## 3 probability of death (per thousand) - qx 3024
## 4 projection probability - Px 3024
## 5 survivors - lx 3024
## 6 years lived - Lx 3024
# Number of rows per territory. Qualified with plyr:: so the vector-based
# count() is used regardless of whether plyr or dplyr was attached last.
plyr::count(my_data$Territory)
## x freq
## 1 Abruzzo 864
## 2 Basilicata 864
## 3 Calabria 864
## 4 Campania 864
## 5 Emilia-Romagna 864
## 6 Friuli-Venezia Giulia 864
## 7 Lazio 864
## 8 Liguria 864
## 9 Lombardia 864
## 10 Marche 864
## 11 Molise 864
## 12 Piemonte 864
## 13 Provincia Autonoma Bolzano / Bozen 864
## 14 Provincia Autonoma Trento 864
## 15 Puglia 864
## 16 Sardegna 864
## 17 Sicilia 864
## 18 Toscana 864
## 19 Umbria 864
## 20 Valle d'Aosta / Vallée d'Aoste 864
## 21 Veneto 864
Check and set the structure of key elements as needed:
# Show the first six rows of the life-expectancy subset.
head(x = my_data_le, n = 6L)
## Territory TIME lifeExp
## 1 Lombardia 2010 67.317
## 2 Lombardia 2019 68.987
## 3 Provincia Autonoma Bolzano / Bozen 2010 68.057
## 4 Provincia Autonoma Bolzano / Bozen 2019 69.273
## 5 Provincia Autonoma Trento 2010 67.784
## 6 Provincia Autonoma Trento 2019 69.645
# Inspect column types and dimensions before renaming and pairing.
str(object = my_data_le)
## 'data.frame': 1008 obs. of 3 variables:
## $ Territory: Factor w/ 21 levels "Abruzzo","Basilicata",..: 9 9 13 13 14 14 21 21 3 3 ...
## $ TIME : int 2010 2019 2010 2019 2010 2019 2010 2019 2010 2019 ...
## $ lifeExp : num 67.3 69 68.1 69.3 67.8 ...
# Optional factor-to-character conversion, left disabled. If it is re-enabled
# it has to run after the rename below, because the column is still called
# `Territory` at this point:
# my_data_le$territory <- as.character(my_data_le$territory)

# Lower-case every column name: Territory, TIME, lifeExp become
# territory, time, lifeexp.
colnames(my_data_le) <- tolower(colnames(my_data_le))
Pair the dataset by the time factor:
# Pairing relies on the rows arriving as consecutive (first year, second year)
# couples, so the row count must be even; fail early with a clear message
# instead of a length-mismatch error inside mutate().
stopifnot(nrow(my_data_le) %% 2 == 0)

# `paired` gives every two consecutive rows the same id (1, 1, 2, 2, ...).
# seq_len() is used instead of 1:(n / 2) so an empty data frame yields an empty
# id vector rather than c(1, 0). `time` becomes a factor so it can be used as a
# discrete variable.
my_data_paired <- my_data_le %>%
  dplyr::mutate(
    paired = rep(seq_len(dplyr::n() %/% 2), each = 2),
    time = factor(time)
  )
Plot the life expectancy values across the age categories between 2010 and 2019:
# library() stops with an error if ggplot2 is missing; require() would only
# return FALSE and let the plotting call fail later.
library(ggplot2)

# Dumbbell-style chart: one row per territory (ordered by reorder() on
# lifeexp), a grey segment joining the two rows of each pair, and points
# coloured by year. The grouping needed for the segments comes from
# aes(group = paired), so no dplyr group_by() is required before ggplot().
my_data_paired %>%
  ggplot(aes(x = lifeexp, y = reorder(territory, lifeexp))) +
  geom_line(aes(group = paired), color = "grey") +
  geom_point(aes(color = time), size = 2) +
  labs(x = "Life Expectancy", y = "Region")