Applied Econometrics

Zhengyuan Gao

2019/02

A very applied course

Using R for Introductory Econometrics (in English, but free online: http://www.urfie.net/read.html)

Using R for Introductory Statistics (https://cran.r-project.org/doc/contrib/Verzani-SimpleR.pdf)


Outline

  1. [R] programming
  2. Another look at regressions
  3. Simple time series
  4. ARIMA models
  5. Generalized linear models

Evaluation

Econometrics now

This lesson on the [R] software covers:

  1. data organization

  2. operators

  3. various manipulations, accessing the documentation,

  4. graphics, programming and maintenance.

Why use [R]

But… Python?

R or Python

My first steps in R

Step 1: Download R (UNIX, Windows and macOS) from https://www.r-project.org/

Step 2: Download RStudio. RStudio is a free, open-source, cross-platform development environment for [R]. https://www.rstudio.com/

RStudio Desktop: to run the software locally

Step 3: Double-click the [R] icon on your desktop.

Step 4: In the console, type 1+1….
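
The console replies with the result:

1+1
## [1] 2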

4 panes

One pane: management

Files

One pane: the editor, for scripts/data/projects, etc.

One pane: the console, for R commands

Data manipulation with R commands

datair = datasets::airmiles # Assignment
# the <- sign also works:
# x <- datasets::airmiles  is the same thing.
x = 2; X = 1 # Variable names are case sensitive.
x       # Display the result
## [1] 2
X
## [1] 1
X = sin(2*pi/3)
Y = sqrt(X)
Z = X+Y
Z
## [1] 1.79663

Nature/type of the data

x = 1.5; y= 1; typeof(x) 
## [1] "double"
typeof(y)
## [1] "double"
x>y
## [1] TRUE
x==y
## [1] FALSE
is.numeric(x)
## [1] TRUE
is.integer(x)
## [1] FALSE
TRUE + FALSE*FALSE + TRUE
## [1] 2
x = "mon ami"; typeof(x)
## [1] "character"
as.integer("3")
## [1] 3
as.integer(3)
## [1] 3

Data structures: vectors

x = c(2,1,4); x
## [1] 2 1 4
x = seq(from=0, to=0.5, by=0.1); x
## [1] 0.0 0.1 0.2 0.3 0.4 0.5
x = 1:3; x
## [1] 1 2 3
names(x) = c("A","B","C"); x
## A B C 
## 1 2 3

Data structures: matrices

X = matrix(1:12, nrow=4, ncol=3, byrow=TRUE)
X
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9
## [4,]   10   11   12
X = matrix(1:12, nrow=4, ncol=3) # X = matrix(1:12, nrow=4, ncol=3 , byrow=FALSE) 
X
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12
class(X)
## [1] "matrix"
X = array(1:12, dim=c(2,2,3))
X
## , , 1
## 
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
## 
## , , 2
## 
##      [,1] [,2]
## [1,]    5    7
## [2,]    6    8
## 
## , , 3
## 
##      [,1] [,2]
## [1,]    9   11
## [2,]   10   12
class(X)
## [1] "array"

Data structures: list and data.frame

x = list(TRUE, 1:2, matrix(1:4, nrow=2), "Une chaîne")
x
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] 1 2
## 
## [[3]]
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
## 
## [[4]]
## [1] "Une chaîne"
x = list(vec=1:2, mat=matrix(1:4, nrow=2), chr="Une chaîne")
x
## $vec
## [1] 1 2
## 
## $mat
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
## 
## $chr
## [1] "Une chaîne"
y = list(vec=3:4, mat=matrix(5:8, nrow=2), chr="Une autre chaîne")
xy = list(x,y); xy
## [[1]]
## [[1]]$vec
## [1] 1 2
## 
## [[1]]$mat
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
## 
## [[1]]$chr
## [1] "Une chaîne"
## 
## 
## [[2]]
## [[2]]$vec
## [1] 3 4
## 
## [[2]]$mat
##      [,1] [,2]
## [1,]    5    7
## [2,]    6    8
## 
## [[2]]$chr
## [1] "Une autre chaîne"
donnes = data.frame(Sexe = c("F","F","F"), Taille=c(1.75, 1.7, 1.6),
          Poids = c(65, 60, 55),
          row.names=c("Mandy", "May", "Maya"))
donnes
##       Sexe Taille Poids
## Mandy    F   1.75    65
## May      F   1.70    60
## Maya     F   1.60    55
str(donnes)
## 'data.frame':    3 obs. of  3 variables:
##  $ Sexe  : Factor w/ 1 level "F": 1 1 1
##  $ Taille: num  1.75 1.7 1.6
##  $ Poids : num  65 60 55

Data structures: others

x = factor(c("bleu","rough","bleu","rough"))
x
## [1] bleu  rough bleu  rough
## Levels: bleu rough
dates = c("01/02/19","27/02/19")
dates = as.Date(dates, "%d/%m/%y")
dates
## [1] "2019-02-01" "2019-02-27"
class(dates)
## [1] "Date"
y = ts(1:10, frequency = 4, start = c(2017,1))
y
##      Qtr1 Qtr2 Qtr3 Qtr4
## 2017    1    2    3    4
## 2018    5    6    7    8
## 2019    9   10

Operations

x = cbind(1:4,5:8) # Try rbind(1:4,5:8) to bind rows instead
x
##      [,1] [,2]
## [1,]    1    5
## [2,]    2    6
## [3,]    3    7
## [4,]    4    8
x = data.frame(ID=1:2, sexe = c("F", "F"), Poids = c(65, 60))
y = data.frame(ID=1:2, sexe = c("F", "F"), Taille = c(175, 170))
xy = cbind(x,y); xy # the ID and sexe columns are duplicated
##   ID sexe Poids ID sexe Taille
## 1  1    F    65  1    F    175
## 2  2    F    60  2    F    170
xy = merge(x,y); xy
##   ID sexe Poids Taille
## 1  1    F    65    175
## 2  2    F    60    170
vec = c(1,4,6)
vec[2]
## [1] 4
vec[-2] # all elements except the second
## [1] 1 6
vec[2:3]
## [1] 4 6
vec[-c(1,3)]
## [1] 4
vec[c(T,F,F)] # logical mask
## [1] 1
vec>5
## [1] FALSE FALSE  TRUE
vec[vec>5] # extraction with a logical mask
## [1] 6
x = matrix(1:12, nrow=4, ncol=3, byrow=TRUE)
x
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9
## [4,]   10   11   12
x[2,3] # ligne 2, colonne 3
## [1] 6
x[,1]  # take all rows and only column 1.
## [1]  1  4  7 10
x[c(1,4), ] # take all columns and rows 1 and 4.
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]   10   11   12
x[3, -c(1,3)]
## [1] 8
xlog = matrix(c(TRUE,FALSE), nrow=4, ncol=3)
xlog
##       [,1]  [,2]  [,3]
## [1,]  TRUE  TRUE  TRUE
## [2,] FALSE FALSE FALSE
## [3,]  TRUE  TRUE  TRUE
## [4,] FALSE FALSE FALSE
x[xlog]
## [1] 1 7 2 8 3 9
which(x>5, arr.ind = TRUE) 
##      row col
## [1,]   3   1
## [2,]   4   1
## [3,]   3   2
## [4,]   4   2
## [5,]   2   3
## [6,]   3   3
## [7,]   4   3
x[x>5] = 0
x
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    0
## [3,]    0    0    0
## [4,]    0    0    0

Creating functions

function(parameter list) {function body}

BMI = function(poids, taille) {
  BMI = poids/taille^2 * 10000
  names(BMI) = "L'indice de mass corporelle"
  return(BMI)
}

BMI(65, 175)
## L'indice de mass corporelle 
##                    21.22449
BMI(55, 160)
## L'indice de mass corporelle 
##                    21.48438
bonjour = function(nom){ cat("Bonjour", nom, "!")}
bonjour
## function(nom){ cat("Bonjour", nom, "!")}
bonjour("le monde")
## Bonjour le monde !

Economic data in [R]

library(wooldridge)
data("airfare")
names(airfare) # the variables
##  [1] "year"    "id"      "dist"    "passen"  "fare"    "bmktshr" "ldist"  
##  [8] "y98"     "y99"     "y00"     "lfare"   "ldistsq" "concen"  "lpassen"
head(airfare, n=2)
##   year id dist passen fare bmktshr    ldist y98 y99 y00    lfare  ldistsq
## 1 1997  1  528    152  106  0.8386 6.269096   0   0   0 4.663439 39.30157
## 2 1998  1  528    265  106  0.8133 6.269096   1   0   0 4.663439 39.30157
##   concen lpassen
## 1 0.8386 5.02388
## 2 0.8133 5.57973
colMeans(airfare)
##         year           id         dist       passen         fare 
## 1998.5000000  575.0000000  989.7449956  636.8241950  178.7967798 
##      bmktshr        ldist          y98          y99          y00 
##    0.6101149    6.6964816    0.2500000    0.2500000    0.2500000 
##        lfare      ldistsq       concen      lpassen 
##    5.0956005   45.2774707    0.6101149    6.0170272
summary(airfare$fare)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    37.0   123.0   168.0   178.8   225.0   522.0
plot(density(airfare$fare))

The simple linear regression model
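
The model fitted below is \[\mbox{wage} = \beta_0 + \beta_1 \mbox{educ} + \varepsilon.\]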

A regression

data("wage1"); names(wage1)
##  [1] "wage"     "educ"     "exper"    "tenure"   "nonwhite" "female"  
##  [7] "married"  "numdep"   "smsa"     "northcen" "south"    "west"    
## [13] "construc" "ndurman"  "trcommpu" "trade"    "services" "profserv"
## [19] "profocc"  "clerocc"  "servocc"  "lwage"    "expersq"  "tenursq"
lm(wage ~ educ, data=wage1) # ou lm(wage1$wage ~ wage1$educ)
## 
## Call:
## lm(formula = wage ~ educ, data = wage1)
## 
## Coefficients:
## (Intercept)         educ  
##     -0.9049       0.5414
lm(wage ~ 0 + educ, data=wage1) # ou lm(wage1$wage ~ 0 + wage1$educ)
## 
## Call:
## lm(formula = wage ~ 0 + educ, data = wage1)
## 
## Coefficients:
##   educ  
## 0.4727

A regression

wage_reg = lm(wage ~ educ, data=wage1) 
# store the regression result for reuse
coef(wage_reg)
## (Intercept)        educ 
##  -0.9048516   0.5413593
# the estimates of beta_0 and beta_1
plot(wage1$educ, wage1$wage) # educ on the x-axis, to match the fitted line
abline(wage_reg) # the regression line

Your own regression function
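
The function below implements the usual OLS formulas \[\hat{\beta}_1 = \frac{\widehat{\mbox{Cov}}(x,y)}{\widehat{\mbox{Var}}(x)}, \qquad \hat{\beta}_0 = \bar{y} - \hat{\beta}_1 \bar{x}.\]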

reg.simple = function(y, x){
  beta1est = cov(x,y)/var(x)
  beta0est = mean(y) - beta1est*mean(x)
  return(list(beta1=beta1est, beta0=beta0est, cor=cor(x,y)))
}

beta = reg.simple(wage1$wage, wage1$educ)
beta
## $beta1
## [1] 0.5413593
## 
## $beta0
## [1] -0.9048516
## 
## $cor
## [1] 0.4059033
plot(wage1$educ, wage1$wage) # educ on the x-axis, to match the fitted line
abline(a=beta$beta0,b=beta$beta1)

Properties of the estimates
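
The code below checks two equivalent expressions for the coefficient of determination: \[R^2 = \frac{\widehat{\mbox{Var}}(\hat{y})}{\widehat{\mbox{Var}}(y)} = 1 - \frac{\widehat{\mbox{Var}}(\hat{e})}{\widehat{\mbox{Var}}(y)}.\]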

wage_hat = fitted(wage_reg); e_hat = resid(wage_reg)
mean(e_hat)
## [1] -1.19498e-16
mean(wage_hat)
## [1] 5.896103
var(wage_hat)/var(wage1$wage)
## [1] 0.1647575
1 - var(e_hat)/var(wage1$wage)
## [1] 0.1647575

Hypothesis tests on the parameters

summary(wage_reg)
## 
## Call:
## lm(formula = wage ~ educ, data = wage1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3396 -2.1501 -0.9674  1.1921 16.6085 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.90485    0.68497  -1.321    0.187    
## educ         0.54136    0.05325  10.167   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.378 on 524 degrees of freedom
## Multiple R-squared:  0.1648, Adjusted R-squared:  0.1632 
## F-statistic: 103.4 on 1 and 524 DF,  p-value: < 2.2e-16

Confidence intervals

confint(wage_reg)
##                  2.5 %    97.5 %
## (Intercept) -2.2504719 0.4407687
## educ         0.4367534 0.6459651
confint(wage_reg, level=0.99)
##                  0.5 %    99.5 %
## (Intercept) -2.6756608 0.8659575
## educ         0.4037001 0.6790184

Multiple linear regression

new.wage=cbind.data.frame(wage1$wage,wage1$educ,wage1$exper)
names(new.wage)=c("wage", "educ", "exper")

library(scatterplot3d)
s3d = scatterplot3d(new.wage, angle=70, pch = 16)

wage.reg2 = lm(wage ~ educ + exper, data=new.wage)
summary(wage.reg2)
## 
## Call:
## lm(formula = wage ~ educ + exper, data = new.wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5532 -1.9801 -0.7071  1.2030 15.8370 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.39054    0.76657  -4.423 1.18e-05 ***
## educ         0.64427    0.05381  11.974  < 2e-16 ***
## exper        0.07010    0.01098   6.385 3.78e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.257 on 523 degrees of freedom
## Multiple R-squared:  0.2252, Adjusted R-squared:  0.2222 
## F-statistic: 75.99 on 2 and 523 DF,  p-value: < 2.2e-16
s3d$plane3d(wage.reg2)

n = 20; x1 = runif(n,-5,5); x2 = runif(n,-50,50)
y = 1 + x1+0.5*x2+rnorm(n,0,2) 
reg.bon = lm(y~x1+x2)
exemple = scatterplot3d(y,x1,x2, type="h", angle=55, pch = 16)
exemple$plane3d(reg.bon)

# this one looks good.
summary(reg.bon)
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3705 -0.7167  0.3439  1.2182  2.5571 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.43593    0.47180   0.924    0.368    
## x1           0.90953    0.16234   5.603 3.17e-05 ***
## x2           0.50467    0.01428  35.343  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.074 on 17 degrees of freedom
## Multiple R-squared:  0.9869, Adjusted R-squared:  0.9853 
## F-statistic: 638.4 on 2 and 17 DF,  p-value: < 2.2e-16

Multiple linear regression

summary(lm(wage ~ educ + exper + I(exper^2), data=wage1))
## 
## Call:
## lm(formula = wage ~ educ + exper + I(exper^2), data = wage1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.0692 -2.0837 -0.5417  1.2860 15.1363 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.964890   0.752153  -5.271 1.99e-07 ***
## educ         0.595343   0.053025  11.228  < 2e-16 ***
## exper        0.268287   0.036897   7.271 1.31e-12 ***
## I(exper^2)  -0.004612   0.000822  -5.611 3.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.166 on 522 degrees of freedom
## Multiple R-squared:  0.2692, Adjusted R-squared:  0.265 
## F-statistic: 64.11 on 3 and 522 DF,  p-value: < 2.2e-16

Multiple linear regression

\[\mbox{wage} = \beta_0 + \beta_1 \mbox{educ} + \beta_2\mbox{exper} + \beta_3(\mbox{exper})(\mbox{educ}) + \varepsilon\] such that \(\frac{\Delta \mbox{wage}}{ \Delta \mbox{exper}} = \beta_2 + \beta_3 \mbox{educ}.\)

summary(lm(wage ~ educ + exper + exper*educ, data=wage1))
## 
## Call:
## lm(formula = wage ~ educ + exper + exper * educ, data = wage1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6747 -1.9683 -0.6991  1.2803 15.8067 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.859916   1.181080  -2.421   0.0158 *  
## educ         0.601735   0.089900   6.693 5.64e-11 ***
## exper        0.045769   0.042614   1.074   0.2833    
## educ:exper   0.002062   0.003491   0.591   0.5549    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.259 on 522 degrees of freedom
## Multiple R-squared:  0.2257, Adjusted R-squared:  0.2212 
## F-statistic: 50.71 on 3 and 522 DF,  p-value: < 2.2e-16

Hypothesis tests on the parameters

library(car); wage.reg2=lm(wage ~ educ + exper + exper*educ, data=wage1)
linearHypothesis(wage.reg2, c("educ=0.6","exper=0"))
## Linear hypothesis test
## 
## Hypothesis:
## educ = 0.6
## exper = 0
## 
## Model 1: restricted model
## Model 2: wage ~ educ + exper + exper * educ
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    524 5580.8                           
## 2    522 5544.5  2    36.313 1.7094  0.182

Prediction

summary(wage1$wage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.530   3.330   4.650   5.896   6.880  24.980
summary(wage1$educ)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   12.00   12.00   12.56   14.00   18.00
summary(wage1$exper)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    5.00   13.50   17.02   26.00   51.00
reg.pour.pred = lm(wage ~ educ + exper + I(exper^2), data=wage1)
cvaleurs = data.frame(educ=rep(19,3), exper=c(2,8,15))
cvaleurs
##   educ exper
## 1   19     2
## 2   19     8
## 3   19    15
predict(reg.pour.pred, cvaleurs)
##         1         2         3 
##  7.864751  9.197736 10.333169
predict(reg.pour.pred, cvaleurs, interval="confidence", level=0.99)
##         fit      lwr       upr
## 1  7.864751 6.811282  8.918219
## 2  9.197736 8.288405 10.107068
## 3 10.333169 9.385244 11.281094

Miscellaneous

pairs(wage1[1:3])

# Nicer plots
# install.packages("ggplot2"); install.packages("GGally")
library(GGally)
library(ggplot2)

ggpairs(wage1[1:3])

# Nicer plots, but more complex

lowerFn = function(data, mapping, method = "lm", ...) {
  p = ggplot(data = data, mapping = mapping) +
    geom_point(colour = "blue") +
    geom_smooth(method = method, color = "red", ...)
  p}

ggpairs(
  wage1[1:3], lower = list(continuous = wrap(lowerFn, method = "lm")),
  diag = list(continuous = wrap("barDiag", colour = "blue")),
  upper = list(continuous = wrap("cor", size = 10)))

End of parts I and II

Questions?

Simple time series

library(wooldridge)
data("phillips")
head(phillips)
##   year unem  inf inf_1 unem_1 cinf      cunem
## 1 1948  3.8  8.1    NA     NA   NA         NA
## 2 1949  5.9 -1.2   8.1    3.8 -9.3  2.1000001
## 3 1950  5.3  1.3  -1.2    5.9  2.5 -0.5999999
## 4 1951  3.3  7.9   1.3    5.3  6.6 -2.0000002
## 5 1952  3.0  1.9   7.9    3.3 -6.0 -0.3000000
## 6 1953  2.9  0.8   1.9    3.0 -1.1 -0.0999999
plot(phillips$year, phillips$inf, type="l")

temp_donnees = ts(phillips, start=1, frequency=1)
head(temp_donnees)
##      year unem  inf inf_1 unem_1 cinf      cunem
## [1,] 1948  3.8  8.1    NA     NA   NA         NA
## [2,] 1949  5.9 -1.2   8.1    3.8 -9.3  2.1000001
## [3,] 1950  5.3  1.3  -1.2    5.9  2.5 -0.5999999
## [4,] 1951  3.3  7.9   1.3    5.3  6.6 -2.0000002
## [5,] 1952  3.0  1.9   7.9    3.3 -6.0 -0.3000000
## [6,] 1953  2.9  0.8   1.9    3.0 -1.1 -0.0999999
plot(temp_donnees[,3]) # or plot.ts(temp_donnees[,3])

Simple time series

Linear regression

head(phillips$unem)
## [1] 3.8 5.9 5.3 3.3 3.0 2.9
temps = time(phillips$unem)
head(temps)
## [1] 1 2 3 4 5 6
reg.lin = lm(phillips$unem~phillips$year) 
reg.lin2 = lm(phillips$unem~temps)
coef(reg.lin)
##   (Intercept) phillips$year 
##  -54.17794636    0.03027683
coef(reg.lin2)
## (Intercept)       temps 
##  4.77103896  0.03027683
summary(reg.lin)
## 
## Call:
## lm(formula = phillips$unem ~ phillips$year)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3757 -0.8768 -0.0371  0.8142  3.8693 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   -54.17795   23.50080  -2.305   0.0250 *
## phillips$year   0.03028    0.01190   2.545   0.0138 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.439 on 54 degrees of freedom
## Multiple R-squared:  0.1071, Adjusted R-squared:  0.09058 
## F-statistic: 6.478 on 1 and 54 DF,  p-value: 0.01381
resi.mco = residuals(reg.lin)
ychap = fitted(reg.lin)
plot.ts(phillips$unem)
abline(coef=coef(reg.lin2), col="blue")
s = c(12, 24, 36)
segments(temps[s],ychap[s], temps[s], phillips$unem[s], col="red", lty=3)

T = length(resi.mco)
plot(resi.mco[-T], resi.mco[-1], xlab = "residual at t-1", ylab="residual at t")

# or 
lag.plot(resi.mco,1)

Lag operator
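
The lag operator \(L\) maps a series to its past values: \(L^k y_t = y_{t-k}\).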

x = ts(1:5); xlag = lag(x, 2) # stats::lag shifts the time index; cbind() aligns and pads with NA
cbind(x,xlag)
## Time Series:
## Start = -1 
## End = 5 
## Frequency = 1 
##     x xlag
## -1 NA    1
##  0 NA    2
##  1  1    3
##  2  2    4
##  3  3    5
##  4  4   NA
##  5  5   NA
lag.plot(x,2)

# lag plot for a cycle
x=c(0:5,4:0, 1); y=exp(x)
cbind(x,y)
##       x          y
##  [1,] 0   1.000000
##  [2,] 1   2.718282
##  [3,] 2   7.389056
##  [4,] 3  20.085537
##  [5,] 4  54.598150
##  [6,] 5 148.413159
##  [7,] 4  54.598150
##  [8,] 3  20.085537
##  [9,] 2   7.389056
## [10,] 1   2.718282
## [11,] 0   1.000000
## [12,] 1   2.718282
plot.ts(x); lag.plot(x);

plot.ts(y); lag.plot(y); 

Simulation

Simulation makes it possible:

  1. to check that we understand the random phenomena under study;

  2. to check the efficiency of estimation methods (see the sketch after this list);

  3. to estimate complex functions of the variables without doing the theoretical calculations.
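
As a minimal illustration of point 2, a Monte Carlo sketch (the sample size, true parameters and number of replications here are arbitrary choices):

# Monte Carlo check: is the OLS slope estimator centred on the truth?
set.seed(1)
R = 1000; slopes = rep(0, R)
for (r in 1:R) {
  x = rnorm(50)                 # a fresh sample each replication
  y = 2 + 0.5*x + rnorm(50)     # true intercept 2, true slope 0.5
  slopes[r] = coef(lm(y ~ x))[2]
}
mean(slopes)                    # should be close to 0.5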

x = rep(0,20)
x
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
x[1] = 0; x[2] = 1
# for loop
for(t in 3:20){x[t] = x[t-1] + x[t-2]}
x
##  [1]    0    1    1    2    3    5    8   13   21   34   55   89  144  233
## [15]  377  610  987 1597 2584 4181
x = ts(x, start = c(2019,2), frequency = 12)
x
##       Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec
## 2019         0    1    1    2    3    5    8   13   21   34   55
## 2020   89  144  233  377  610  987 1597 2584 4181
plot(x)

More discussion on the for loop
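
A common point in that discussion: explicit loops should write into a preallocated vector, and vectorized code is usually preferable when it exists. A small sketch:

x = numeric(100)
for (i in 1:100) { x[i] = i^2 }   # explicit loop over a preallocated vector
y = (1:100)^2                     # vectorized equivalent
identical(x, y)
## [1] TRUE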

Pseudo-random numbers

set.seed(2019); e = rnorm(250); e2 = rnorm(250)
set.seed(2019); e3 = rnorm(250);
tail(cbind(e,e2,e3),5)
##                  e         e2          e3
## [246,] -0.49720183  1.5119067 -0.49720183
## [247,]  1.26335729 -1.2320650  1.26335729
## [248,] -0.08190324 -1.4650982 -0.08190324
## [249,]  0.38625611 -0.5812875  0.38625611
## [250,] -1.39066911  0.5641280 -1.39066911
plot(e); abline(h=0)

plot(density(e))

U = rep(0,250); U[1] = 2019 # the seed
# a linear congruential generator:
for(t in 2:250){  
    U[t] = (1664525 * U[t-1] + 1013904223) %% 2^32}
U = U/2^32 # rescale to [0,1)
ts.plot(U)

plot(density(U))

Simulation for estimators

t=1:100; x=ts(rnorm(100)); e=ts(rnorm(100))
y = 4*x + 10*t # note: the noise e is never added, so the fit below is exact
summary(lm(y~x+t))
## Warning in summary.lm(lm(y ~ x + t)): essentially perfect fit: summary may
## be unreliable
## 
## Call:
## lm(formula = y ~ x + t)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.342e-12 -2.449e-14 -1.710e-15  2.456e-14  1.190e-12 
## 
## Coefficients:
##              Estimate Std. Error   t value Pr(>|t|)    
## (Intercept) 2.913e-14  3.803e-14 7.660e-01    0.446    
## x           4.000e+00  1.880e-14 2.127e+14   <2e-16 ***
## t           1.000e+01  6.564e-16 1.524e+16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.887e-13 on 97 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.167e+32 on 2 and 97 DF,  p-value: < 2.2e-16
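
The warning appears because the noise e was generated but never added, so y is an exact linear function of x and t. Including it gives an ordinary noisy regression (a sketch; the estimates will vary with the draw):

y = 4*x + 10*t + e      # this time include the error term
summary(lm(y ~ x + t))  # estimates should now be close to 4 and 10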

Linear regression with two time series

# install.packages("directlabels")
library(directlabels); library(lattice)
p=xyplot(unem + inf~year, data=phillips, type="l")
direct.label(p)

reg_temp = lm(phillips$unem~phillips$inf)
summary(reg_temp)
## 
## Call:
## lm(formula = phillips$unem ~ phillips$inf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8308 -0.9254  0.0285  0.9530  4.0507 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.1534     0.3215  16.030   <2e-16 ***
## phillips$inf   0.1237     0.0654   1.892   0.0639 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.475 on 54 degrees of freedom
## Multiple R-squared:  0.06215,    Adjusted R-squared:  0.04479 
## F-statistic: 3.579 on 1 and 54 DF,  p-value: 0.06389
plot(phillips$inf, phillips$unem) # inf on the x-axis, to match the fitted line
abline(coef=coef(reg_temp))

Finite distributed lag model
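
The regression below is the finite distributed lag model \[\mbox{unem}_t = \alpha + \delta_0 \mbox{inf}_t + \delta_1 \mbox{inf}_{t-1} + \delta_2 \mbox{inf}_{t-2} + \varepsilon_t.\]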

lag.plot(phillips$unem,2, do.lines=FALSE, diag.col="red", col.main="blue")

inf_2=c(NA, NA, phillips$inf[1:54]); head(inf_2) # the second lag, built by hand
## [1]   NA   NA  8.1 -1.2  1.3  7.9
reg_temp2=lm(phillips$unem ~ phillips$inf + phillips$inf_1 + inf_2)
summary(reg_temp2)
## 
## Call:
## lm(formula = phillips$unem ~ phillips$inf + phillips$inf_1 + 
##     inf_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7363 -0.5458 -0.0392  0.6993  1.9567 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.37884    0.28572  15.325  < 2e-16 ***
## phillips$inf   -0.08904    0.08235  -1.081 0.284767    
## phillips$inf_1  0.16138    0.09846   1.639 0.107480    
## inf_2           0.25596    0.07052   3.630 0.000666 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.139 on 50 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.467,  Adjusted R-squared:  0.4351 
## F-statistic: 14.61 on 3 and 50 DF,  p-value: 5.876e-07
plot(inf_2, phillips$unem) # inf_2 on the x-axis, to match the fitted line
abline(coef=c(coef(reg_temp2)[1],coef(reg_temp2)[4]))

Time series and trend
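
The second regression below adds a linear time trend: \[\mbox{linvpc}_t = \beta_0 + \beta_1 \mbox{lprice}_t + \beta_2 t + \varepsilon_t.\]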

data("hseinv")
head(hseinv,2)
##   year   inv    pop  price     linv     lpop     lprice t     invpc
## 1 1947 54864 144126 0.8190 10.91261 11.87844 -0.1996712 1 0.3806669
## 2 1948 64717 146631 0.8649 11.07778 11.89567 -0.1451414 2 0.4413596
##       linvpc   lprice_1   linvpc_1    gprice    ginvpc
## 1 -0.9658305         NA         NA        NA        NA
## 2 -0.8178953 -0.1996712 -0.9658305 0.0545298 0.1479352
p=xyplot(linvpc + lprice ~year, data=hseinv, type="l")
direct.label(p)

summary(lm(linvpc~lprice, data=hseinv))
## 
## Call:
## lm(formula = linvpc ~ lprice, data = hseinv)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45991 -0.08694 -0.01264  0.08651  0.34672 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.55023    0.04303 -12.788 1.03e-15 ***
## lprice       1.24094    0.38242   3.245  0.00238 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1554 on 40 degrees of freedom
## Multiple R-squared:  0.2084, Adjusted R-squared:  0.1886 
## F-statistic: 10.53 on 1 and 40 DF,  p-value: 0.002376
inv=lm(linvpc~lprice+t, data=hseinv)
summary(inv)
## 
## Call:
## lm(formula = linvpc ~ lprice + t, data = hseinv)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45092 -0.08583 -0.01734  0.08517  0.26024 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.913060   0.135613  -6.733    5e-08 ***
## lprice      -0.380961   0.678835  -0.561  0.57787    
## t            0.009829   0.003512   2.798  0.00794 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1436 on 39 degrees of freedom
## Multiple R-squared:  0.3408, Adjusted R-squared:  0.307 
## F-statistic: 10.08 on 2 and 39 DF,  p-value: 0.000296
lag.plot(resid(inv),2, do.lines=FALSE)

End of part III

Autoregressive models
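
The four deterministic recursions below follow \(y_{t+1} = \rho y_t\) with \(\rho \in \{0.9, -0.9, 1.05, -1.05\}\): the sequence decays to zero when \(|\rho| < 1\) and explodes when \(|\rho| > 1\).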

y1=y2=y3=y4=rep(1, 26); time = 0:25
for (t in 1:25){ 
  y1[t+1] = 0.9 * y1[t];
  y2[t+1] = -0.9 * y2[t];
  y3[t+1] = 1.05 * y3[t];
  y4[t+1] = -1.05 * y4[t];
}

library(directlabels); library(lattice)
p=xyplot(y1 + y2 + y3 + y4 ~ time, type="l"); direct.label(p)

# or:  ts.plot(cbind(y1,y2,y3,y4), col=c("blue","red","black","grey"))
# For other colours: demo("colors")
est.d1 = lm(y1[2:25]~0+y1[1:24]); est.d2 = lm(y2[2:25]~0+y2[1:24]);
est.d3 = lm(y3[2:25]~0+y3[1:24]); est.d4 = lm(y4[2:25]~0+y4[1:24]);
c(est.d1$coefficients, est.d2$coefficients, est.d3$coefficients,
           est.d4$coefficients)
## y1[1:24] y2[1:24] y3[1:24] y4[1:24] 
##     0.90    -0.90     1.05    -1.05

Autocorrelation function
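
The lag-\(k\) autocorrelation is \[\rho_k = \frac{\mbox{Cov}(y_t, y_{t-k})}{\mbox{Var}(y_t)},\] estimated below by its sample analogue.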

cov(y1[2:25],y1[1:24])/var(y1[1:24])
## [1] 0.9
lag.plot(y1, lags=6, layout=c(2,3))

lag.plot(y3, lags=6, layout=c(2,3))

lag.plot(y4, lags=6, layout=c(2,3))

cov(y1[5:25],y1[1:21])
## [1] 0.0466918

White noise

set.seed(2019); e = rnorm(200); summary(lm(e[2:100]~0+e[1:99]))
## 
## Call:
## lm(formula = e[2:100] ~ 0 + e[1:99])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2772 -0.6591 -0.2258  0.5102  2.6378 
## 
## Coefficients:
##         Estimate Std. Error t value Pr(>|t|)
## e[1:99] -0.04274    0.10083  -0.424    0.673
## 
## Residual standard error: 0.9092 on 98 degrees of freedom
## Multiple R-squared:  0.00183,    Adjusted R-squared:  -0.008355 
## F-statistic: 0.1797 on 1 and 98 DF,  p-value: 0.6726
lag.plot(e, lag=3, layout=c(1,3), do.lines = FALSE)

cov(e[2:100],e[1:99])/var(e[1:99]) # rho_1
## [1] -0.04974472
cov(e[21:100],e[1:80])/var(e[1:80]) # rho_20
## [1] -0.05723198
acf(e)

acf(e, lag.max = 20, plot = FALSE)
## 
## Autocorrelations of series 'e', by lag
## 
##      0      1      2      3      4      5      6      7      8      9 
##  1.000  0.005 -0.152 -0.105  0.031  0.035 -0.003  0.085 -0.058  0.029 
##     10     11     12     13     14     15     16     17     18     19 
## -0.050  0.028 -0.039  0.117 -0.068 -0.086  0.112 -0.010  0.009 -0.134 
##     20 
## -0.045

Whiteness tests
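
Both portmanteau statistics aggregate the first \(h\) sample autocorrelations and are asymptotically \(\chi^2(h)\) under the white-noise null: \[Q_{BP} = T \sum_{k=1}^{h} \hat{\rho}_k^2, \qquad Q_{LB} = T(T+2) \sum_{k=1}^{h} \frac{\hat{\rho}_k^2}{T-k}.\]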

Box.test(e, lag=1)
## 
##  Box-Pierce test
## 
## data:  e
## X-squared = 0.0051073, df = 1, p-value = 0.943
Box.test(e, lag=1, type="Ljung")
## 
##  Box-Ljung test
## 
## data:  e
## X-squared = 0.0051843, df = 1, p-value = 0.9426

Weak stationarity
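
A process \(Y_t\) is weakly stationary when its first two moments do not depend on \(t\): \[\mbox{E}(Y_t) = \mu, \qquad \mbox{Var}(Y_t) = \sigma^2, \qquad \mbox{Cov}(Y_t, Y_{t-k}) = \gamma_k \quad \mbox{for all } t.\] The AR(1) processes simulated next are weakly stationary only when \(|\rho| < 1\).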

AR(1) processes

set.seed(2019)
Y1=Y2=Y3=Y4=rep(0, 50); 
for (t in 1:50){ 
  Y1[t+1] = 0.9 * Y1[t] + rnorm(1);
  Y2[t+1] = -0.9 * Y2[t] + rnorm(1);
  Y3[t+1] = 1.05 * Y3[t] + rnorm(1);
  Y4[t+1] = -1.03 * Y4[t] + rnorm(1);
}
p=xyplot(Y1 + Y2 + Y3 + Y4 ~ 0:50, type="l"); direct.label(p)

est.r1 = lm(Y1[2:50]~0+Y1[1:49]); est.r2 = lm(Y2[2:50]~0+Y2[1:49]);
est.r3 = lm(Y3[2:50]~0+Y3[1:49]); est.r4 = lm(Y4[2:50]~0+Y4[1:49]);
c(est.r1$coefficients, est.r2$coefficients, est.r3$coefficients,
           est.r4$coefficients)
##   Y1[1:49]   Y2[1:49]   Y3[1:49]   Y4[1:49] 
##  0.9617065 -0.8977096  1.0435869 -1.0348403
# or:  arima(Y3, order=c(1,0,0), method=c("CSS"), include.mean=FALSE)

Unfinished, but….