# Correlation and linear model for trend in time series
# Data used : https://data.worldbank.org/country/AL  
# We have used the yearly data for Albania for 5 indices (Total population, life expectancy, unemployment rate, CPi and inflation)
# library used 
library(readxl)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# First let us transform the data into a time series object using the ts() function 
# data are saved into an excel file named Te_dhena_MSC
Te_dhena_MSC <- read_excel("~/FSHN-Sem 1-2020-2021/Seri Kohore dhe zbatime MSc -2/Te dhena-MSC.xlsx")
Te_dhena_MSC
## # A tibble: 27 x 6
##     Viti `Life exp` Popullsia `Papunesia total` Inflacioni   CPI
##    <dbl>      <dbl>     <dbl>             <dbl>      <dbl> <dbl>
##  1  1992       71.8   3247039              26.5   226.      16.7
##  2  1993       71.9   3227287              22.3    85.0     30.8
##  3  1994       72.0   3207536              18.4    22.6     37.8
##  4  1995       72.2   3187784              12.9     7.79    40.8
##  5  1996       72.5   3168033              12.3    12.7     45.9
##  6  1997       72.8   3148281              14.9    33.2     61.2
##  7  1998       73.2   3128530              17.7    20.6     73.8
##  8  1999       73.6   3108778              18.4     0.389   74.1
##  9  2000       74.0   3089027              16.8     0.0500  74.1
## 10  2001       74.3   3060173              16.4     3.11    76.4
## # ... with 17 more rows
Pop.ts=ts(Te_dhena_MSC$Popullsia,start=1992,frequency=1) # total population
Pop.ts
## Time Series:
## Start = 1992 
## End = 2018 
## Frequency = 1 
##  [1] 3247039 3227287 3207536 3187784 3168033 3148281 3128530 3108778 3089027
## [10] 3060173 3051010 3039616 3026939 3011487 2992547 2970017 2947314 2927519
## [19] 2913021 2905195 2900401 2895092 2889104 2880703 2876101 2873457 2866376
Exp.ts=ts(Te_dhena_MSC$'Life exp',start=1992,frequency=1)# life expectancy
Exp.ts
## Time Series:
## Start = 1992 
## End = 2018 
## Frequency = 1 
##  [1] 71.802 71.860 71.992 72.205 72.495 72.838 73.208 73.587 73.955 74.288
## [11] 74.579 74.828 75.039 75.228 75.423 75.646 75.912 76.221 76.562 76.914
## [21] 77.252 77.554 77.813 78.025 78.194 78.333 78.458
Unemp.ts=ts(Te_dhena_MSC$`Papunesia total`,start=1992,frequency=1)# Unemployment
Unemp.ts
## Time Series:
## Start = 1992 
## End = 2018 
## Frequency = 1 
##  [1] 26.5000 22.3000 18.4000 12.9000 12.3000 14.9000 17.7000 18.4000 16.8000
## [10] 16.4000 15.8000 15.0000 14.4000 14.1000 13.8000 15.9663 13.0599 13.6739
## [19] 14.0860 13.4809 13.3759 15.8659 18.0548 17.1930 15.4176 13.6158 12.3038
CPI.ts=ts(Te_dhena_MSC$CPI,start=1992,frequency=1) # CPI- consumer price index
CPI.ts
## Time Series:
## Start = 1992 
## End = 2018 
## Frequency = 1 
##  [1]  16.7  30.8  37.8  40.8  45.9  61.2  73.8  74.1  74.1  76.4  82.4  82.8
## [13]  84.7  86.7  88.7  91.3  94.4  96.5 100.0 103.4 105.5 107.6 109.3 111.4
## [25] 112.8 115.1 117.4
Infl.ts=ts(Te_dhena_MSC$Inflacioni,start=1992,frequency=1) # Inflation
Infl.ts
## Time Series:
## Start = 1992 
## End = 2018 
## Frequency = 1 
##  [1] 226.00542125  85.00475124  22.56505269   7.79321854  12.72547781
##  [6]  33.18027438  20.64285887   0.38943765   0.05001814   3.10758827
## [11]   7.77052583   0.48400261   2.28001917   2.36658196   2.37072832
## [16]   2.93268248   3.36313757   2.23139683   3.62233541   3.42912325
## [21]   2.03159594   1.93761755   1.62586504   1.89617403   1.27543168
## [26]   1.98666133   2.02805963
# Time series graphical representation
par(mfrow=c(2,3))# we can view all the graphs in one graphical window
plot(Pop.ts,main="Total population , 1992-2018",col="red",xlab="Year",ylab="Population")
plot(Exp.ts,main="Life expectancy , 1992-2018",col="red",xlab="Year",ylab="Years")
plot(Unemp.ts,main="Unemployment rate , 1992-2018",col="red",xlab="Year",ylab="Rate")
plot(CPI.ts,main="CPI , 1992-2018",col="red",xlab="Year",ylab="CPI")
plot(Infl.ts,main="Inflation rate , 1992-2018",col="red",xlab="Year",ylab="Rate")
# 
# let us now analyse the correlation among the indices
M=data.frame(Pop.ts,Exp.ts,Unemp.ts,CPI.ts,Infl.ts)# data frame
cor(M)# correlation matrix
##              Pop.ts     Exp.ts   Unemp.ts     CPI.ts    Infl.ts
## Pop.ts    1.0000000 -0.9881067  0.5207413 -0.9740044  0.5514574
## Exp.ts   -0.9881067  1.0000000 -0.4502540  0.9590836 -0.4874743
## Unemp.ts  0.5207413 -0.4502540  1.0000000 -0.5570282  0.7962605
## CPI.ts   -0.9740044  0.9590836 -0.5570282  1.0000000 -0.6536084
## Infl.ts   0.5514574 -0.4874743  0.7962605 -0.6536084  1.0000000
# We notice that total population and CPI are strongly (negatively) correlated;Life expectancy and CPI are strongly (positively) correlated 
library(corrplot)# library used for graphical display of correlation  
## corrplot 0.84 loaded
par(mfrow=c(2,2))# different graphical presentation of the correlation matrix

corrplot(cor(M), method="circle")
corrplot(cor(M), method="pie")
corrplot(cor(M), method="color")
corrplot(cor(M), method="number")

# We can also use the "type" argument to decide on which part of the matrix to show (since they are symetric)
corrplot(cor(M), type ="upper")
corrplot(cor(M), type="lower")
corrplot(cor(M), type="full")
# We can also use a hierarchial cluster to a better view of the correlated variables using argument "order"  
par(mfrow=c(1,2))

corrplot(cor(M), method="color",order="hclust",type="upper")
corrplot(cor(M), method="number",type="full",order="hclust")

# if you want to use different colors for the graphics you can use the function colorRampPalette()
Ngjy<- colorRampPalette(c("red", "white", "blue"))(20)# color vector
corrplot(cor(M), type="upper", order="hclust", col=Ngjy)
corrplot(cor(M), type="upper", order="hclust", col=c("black", "white"),bg="lightblue")# ndryshojme background

# Another color package which may be used is RColorBrewer
library(RColorBrewer)
corrplot(cor(M), type="upper", order="hclust", col=brewer.pal(n=8, name="RdBu"))
#
# Computing the p-value of correlations
#### Source code: http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram
# mat : is a matrix of data
# ... : further arguments to pass to the native R cor.test function
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat<- matrix(NA, n, n)
  diag(p.mat) <- 0
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      tmp <- cor.test(mat[, i], mat[, j], ...)
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
#
#
# p-value matrix for autocorrelations 
p.mat <- cor.mtest(cor(M))# p-value matrix
head(p.mat[, 1:5])# in our case we have 5 variables
##                Pop.ts       Exp.ts    Unemp.ts       CPI.ts     Infl.ts
## Pop.ts   0.000000e+00 1.421007e-05 0.034450905 6.935964e-05 0.028048547
## Exp.ts   1.421007e-05 0.000000e+00 0.042105055 2.491695e-04 0.034666585
## Unemp.ts 3.445091e-02 4.210505e-02 0.000000000 2.658922e-02 0.003301761
## CPI.ts   6.935964e-05 2.491695e-04 0.026589216 0.000000e+00 0.018890169
## Infl.ts  2.804855e-02 3.466658e-02 0.003301761 1.889017e-02 0.000000000
#################################################################
# Correlation matrix and confidence level (observe the changes when substitute sig.level with 0.005 and 0.01))
corrplot(cor(M), type="upper", order="hclust",p.mat = p.mat, sig.level = 0.05)

corrplot(cor(M), type="upper", order="hclust", p.mat = p.mat, sig.level = 0.05, insig = "blank")
corrplot(cor(M), type="upper", order="hclust",p.mat = p.mat, sig.level = 0.01)

corrplot(cor(M), type="upper", order="hclust", p.mat = p.mat, sig.level = 0.01, insig = "blank")
#
par(mfrow=c(1,1))

####################################################
# let now understand the linear trend by using the lm() function
#We first create a time vector t and save it as a time series object 
t=ts(seq(1992,2018),start=1992,end=2018,frequency=1)
t
## Time Series:
## Start = 1992 
## End = 2018 
## Frequency = 1 
##  [1] 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
plot(Exp.ts,col="blue",type="l",main="Life expectancy",xlab="Year",ylab="Age in years",lwd=2)
Mod.1=lm(Exp.ts~t)# linear model to understand trend in life expectancy
summary(Mod.1)# summary of the linear model
## 
## Call:
## lm(formula = Exp.ts ~ t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31652 -0.10275 -0.00851  0.13500  0.21251 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.772e+02  7.544e+00  -63.26   <2e-16 ***
## t            2.755e-01  3.762e-03   73.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1523 on 25 degrees of freedom
## Multiple R-squared:  0.9954, Adjusted R-squared:  0.9952 
## F-statistic:  5362 on 1 and 25 DF,  p-value: < 2.2e-16
# Comments: we observe that Life expectancy has a linear inceasing trend.And a linear regression model will be a good prediction model 
# The R-square is 0.9952 , and also the other information p-value of the explained variable (time) and the model p-value are satisfactory and significant 
abline(Mod.1,col="red",lwd=2)
text(1995,75,"R-squared=0.9952")
text(2000,76,"Equation:Life_exp=-4.772e+02+2.755e-01*time")
legend(2007,74,c("Life expectancy","Linear model"),fill=c("blue","red"),box.col = "white")

###############################
#
#
#
#
# Next step: analyse the other variables and build a step-wise linear model!
# Thank you for sharing!
# Any comment is appreciated!
# Contact: Eralda Gjika eralda.dhamo@fshn.edu.al 
# LinkedIn: https://www.linkedin.com/in/eralda-dhamo-gjika-71879128/