# Correlation and linear model for trend in time series
# Data used : https://data.worldbank.org/country/AL
# We have used the yearly data for Albania for 5 indices (Total population, life expectancy, unemployment rate, CPi and inflation)
# library used
library(readxl)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# First let us transform the data into a time series object using the ts() function
# data are saved into an excel file named Te_dhena_MSC
Te_dhena_MSC <- read_excel("~/FSHN-Sem 1-2020-2021/Seri Kohore dhe zbatime MSc -2/Te dhena-MSC.xlsx")
Te_dhena_MSC
## # A tibble: 27 x 6
## Viti `Life exp` Popullsia `Papunesia total` Inflacioni CPI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1992 71.8 3247039 26.5 226. 16.7
## 2 1993 71.9 3227287 22.3 85.0 30.8
## 3 1994 72.0 3207536 18.4 22.6 37.8
## 4 1995 72.2 3187784 12.9 7.79 40.8
## 5 1996 72.5 3168033 12.3 12.7 45.9
## 6 1997 72.8 3148281 14.9 33.2 61.2
## 7 1998 73.2 3128530 17.7 20.6 73.8
## 8 1999 73.6 3108778 18.4 0.389 74.1
## 9 2000 74.0 3089027 16.8 0.0500 74.1
## 10 2001 74.3 3060173 16.4 3.11 76.4
## # ... with 17 more rows
Pop.ts=ts(Te_dhena_MSC$Popullsia,start=1992,frequency=1) # total population
Pop.ts
## Time Series:
## Start = 1992
## End = 2018
## Frequency = 1
## [1] 3247039 3227287 3207536 3187784 3168033 3148281 3128530 3108778 3089027
## [10] 3060173 3051010 3039616 3026939 3011487 2992547 2970017 2947314 2927519
## [19] 2913021 2905195 2900401 2895092 2889104 2880703 2876101 2873457 2866376
Exp.ts=ts(Te_dhena_MSC$'Life exp',start=1992,frequency=1)# life expectancy
Exp.ts
## Time Series:
## Start = 1992
## End = 2018
## Frequency = 1
## [1] 71.802 71.860 71.992 72.205 72.495 72.838 73.208 73.587 73.955 74.288
## [11] 74.579 74.828 75.039 75.228 75.423 75.646 75.912 76.221 76.562 76.914
## [21] 77.252 77.554 77.813 78.025 78.194 78.333 78.458
Unemp.ts=ts(Te_dhena_MSC$`Papunesia total`,start=1992,frequency=1)# Unemployment
Unemp.ts
## Time Series:
## Start = 1992
## End = 2018
## Frequency = 1
## [1] 26.5000 22.3000 18.4000 12.9000 12.3000 14.9000 17.7000 18.4000 16.8000
## [10] 16.4000 15.8000 15.0000 14.4000 14.1000 13.8000 15.9663 13.0599 13.6739
## [19] 14.0860 13.4809 13.3759 15.8659 18.0548 17.1930 15.4176 13.6158 12.3038
CPI.ts=ts(Te_dhena_MSC$CPI,start=1992,frequency=1) # CPI- consumer price index
CPI.ts
## Time Series:
## Start = 1992
## End = 2018
## Frequency = 1
## [1] 16.7 30.8 37.8 40.8 45.9 61.2 73.8 74.1 74.1 76.4 82.4 82.8
## [13] 84.7 86.7 88.7 91.3 94.4 96.5 100.0 103.4 105.5 107.6 109.3 111.4
## [25] 112.8 115.1 117.4
Infl.ts=ts(Te_dhena_MSC$Inflacioni,start=1992,frequency=1) # Inflation
Infl.ts
## Time Series:
## Start = 1992
## End = 2018
## Frequency = 1
## [1] 226.00542125 85.00475124 22.56505269 7.79321854 12.72547781
## [6] 33.18027438 20.64285887 0.38943765 0.05001814 3.10758827
## [11] 7.77052583 0.48400261 2.28001917 2.36658196 2.37072832
## [16] 2.93268248 3.36313757 2.23139683 3.62233541 3.42912325
## [21] 2.03159594 1.93761755 1.62586504 1.89617403 1.27543168
## [26] 1.98666133 2.02805963
# Time series graphical representation
par(mfrow=c(2,3))# we can view all the graphs in one graphical window
plot(Pop.ts,main="Total population , 1992-2018",col="red",xlab="Year",ylab="Population")
plot(Exp.ts,main="Life expectancy , 1992-2018",col="red",xlab="Year",ylab="Years")
plot(Unemp.ts,main="Unemployment rate , 1992-2018",col="red",xlab="Year",ylab="Rate")
plot(CPI.ts,main="CPI , 1992-2018",col="red",xlab="Year",ylab="CPI")
plot(Infl.ts,main="Inflation rate , 1992-2018",col="red",xlab="Year",ylab="Rate")
#
# let us now analyse the correlation among the indices
M=data.frame(Pop.ts,Exp.ts,Unemp.ts,CPI.ts,Infl.ts)# data frame
cor(M)# correlation matrix
## Pop.ts Exp.ts Unemp.ts CPI.ts Infl.ts
## Pop.ts 1.0000000 -0.9881067 0.5207413 -0.9740044 0.5514574
## Exp.ts -0.9881067 1.0000000 -0.4502540 0.9590836 -0.4874743
## Unemp.ts 0.5207413 -0.4502540 1.0000000 -0.5570282 0.7962605
## CPI.ts -0.9740044 0.9590836 -0.5570282 1.0000000 -0.6536084
## Infl.ts 0.5514574 -0.4874743 0.7962605 -0.6536084 1.0000000
# We notice that total population and CPI are strongly (negatively) correlated;Life expectancy and CPI are strongly (positively) correlated
library(corrplot)# library used for graphical display of correlation
## corrplot 0.84 loaded
par(mfrow=c(2,2))# different graphical presentation of the correlation matrix

corrplot(cor(M), method="circle")
corrplot(cor(M), method="pie")
corrplot(cor(M), method="color")
corrplot(cor(M), method="number")

# We can also use the "type" argument to decide on which part of the matrix to show (since they are symetric)
corrplot(cor(M), type ="upper")
corrplot(cor(M), type="lower")
corrplot(cor(M), type="full")
# We can also use a hierarchial cluster to a better view of the correlated variables using argument "order"
par(mfrow=c(1,2))

corrplot(cor(M), method="color",order="hclust",type="upper")
corrplot(cor(M), method="number",type="full",order="hclust")

# if you want to use different colors for the graphics you can use the function colorRampPalette()
Ngjy<- colorRampPalette(c("red", "white", "blue"))(20)# color vector
corrplot(cor(M), type="upper", order="hclust", col=Ngjy)
corrplot(cor(M), type="upper", order="hclust", col=c("black", "white"),bg="lightblue")# ndryshojme background

# Another color package which may be used is RColorBrewer
library(RColorBrewer)
corrplot(cor(M), type="upper", order="hclust", col=brewer.pal(n=8, name="RdBu"))
#
# Computing the p-value of correlations
#### Source code: http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram
# mat : is a matrix of data
# ... : further arguments to pass to the native R cor.test function
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
#
#
# p-value matrix for autocorrelations
p.mat <- cor.mtest(cor(M))# p-value matrix
head(p.mat[, 1:5])# in our case we have 5 variables
## Pop.ts Exp.ts Unemp.ts CPI.ts Infl.ts
## Pop.ts 0.000000e+00 1.421007e-05 0.034450905 6.935964e-05 0.028048547
## Exp.ts 1.421007e-05 0.000000e+00 0.042105055 2.491695e-04 0.034666585
## Unemp.ts 3.445091e-02 4.210505e-02 0.000000000 2.658922e-02 0.003301761
## CPI.ts 6.935964e-05 2.491695e-04 0.026589216 0.000000e+00 0.018890169
## Infl.ts 2.804855e-02 3.466658e-02 0.003301761 1.889017e-02 0.000000000
#################################################################
# Correlation matrix and confidence level (observe the changes when substitute sig.level with 0.005 and 0.01))
corrplot(cor(M), type="upper", order="hclust",p.mat = p.mat, sig.level = 0.05)

corrplot(cor(M), type="upper", order="hclust", p.mat = p.mat, sig.level = 0.05, insig = "blank")
corrplot(cor(M), type="upper", order="hclust",p.mat = p.mat, sig.level = 0.01)

corrplot(cor(M), type="upper", order="hclust", p.mat = p.mat, sig.level = 0.01, insig = "blank")
#
par(mfrow=c(1,1))

####################################################
# let now understand the linear trend by using the lm() function
#We first create a time vector t and save it as a time series object
t=ts(seq(1992,2018),start=1992,end=2018,frequency=1)
t
## Time Series:
## Start = 1992
## End = 2018
## Frequency = 1
## [1] 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
plot(Exp.ts,col="blue",type="l",main="Life expectancy",xlab="Year",ylab="Age in years",lwd=2)
Mod.1=lm(Exp.ts~t)# linear model to understand trend in life expectancy
summary(Mod.1)# summary of the linear model
##
## Call:
## lm(formula = Exp.ts ~ t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31652 -0.10275 -0.00851 0.13500 0.21251
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.772e+02 7.544e+00 -63.26 <2e-16 ***
## t 2.755e-01 3.762e-03 73.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1523 on 25 degrees of freedom
## Multiple R-squared: 0.9954, Adjusted R-squared: 0.9952
## F-statistic: 5362 on 1 and 25 DF, p-value: < 2.2e-16
# Comments: we observe that Life expectancy has a linear inceasing trend.And a linear regression model will be a good prediction model
# The R-square is 0.9952 , and also the other information p-value of the explained variable (time) and the model p-value are satisfactory and significant
abline(Mod.1,col="red",lwd=2)
text(1995,75,"R-squared=0.9952")
text(2000,76,"Equation:Life_exp=-4.772e+02+2.755e-01*time")
legend(2007,74,c("Life expectancy","Linear model"),fill=c("blue","red"),box.col = "white")

###############################
#
#
#
#
# Next step: analyse the other variables and build a step-wise linear model!
# Thank you for sharing!
# Any comment is appreciated!
# Contact: Eralda Gjika eralda.dhamo@fshn.edu.al
# LinkedIn: https://www.linkedin.com/in/eralda-dhamo-gjika-71879128/