FORECASTING MODELS OF RECESION IN SPAIN APPLYING MACHINE LEARNING TECHNIQUES

ggplot(data=datosdes, aes(Año,Tc_pib_per_capita))+
  labs(y="Tasa de crecimiento del PIB per cápita real español")+
  geom_line(linetype=1,aes(x=Año,y=Tc_pib_per_capita),
            size=1.3,color="steelblue")+
  geom_point(size=2,data=datosdes
             ,aes(x=Año,y=Tc_pib_per_capita))+
  scale_x_continuous(breaks = seq(1984, 2016, by = 1)) +
  scale_y_continuous(breaks = seq(-0.06, 0.1, by = 0.01))+
  geom_hline(yintercept = 0,alpha=0.8,size=1)+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5), 
        text = element_text(size=12))

ggplot(data=datosdes)+labs(y="Tasa de variación FBCF (UMN) a precios actuales ",x="Año")+
  theme(panel.background = element_rect(fill = 'grey93', colour = 'black'))+
  annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = -25, ymax = 20,
           alpha = .5)+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = -25, ymax = 20,
           alpha = .5)+
  geom_hline(yintercept = 0,alpha=0.7)+
  geom_line(linetype=1, size=1.8,color="steelblue",
            aes(x=Año,y=fbcf[2:34,3]*100))+
  scale_x_continuous(breaks = seq(1984, 2016, by = 1)) +
  scale_y_continuous(breaks = seq(-25, 20, by = 5))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5),
        text = element_text(size=12))

train.c <- sample(1:nrow(modelo1), nrow(modelo1)*0.7)
datos.train <- modelo1[train.c, ]
datos.test <- modelo1[-train.c, ]
table(datos.train$pib)

## 
##  0  1 
## 18  5

table(datos.test$pib)

## 
## 0 1 
## 8 2

LOGIT

depstep <- glm(pib~tvahorro,data =datos.train, family = binomial(link = "logit"))
summary(depstep)

## 
## Call:
## glm(formula = pib ~ tvahorro, family = binomial(link = "logit"), 
##     data = datos.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79783  -0.29508  -0.10826  -0.00923   1.59755  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.4772     1.0064   0.474   0.6354  
## tvahorro    -60.9674    28.3570  -2.150   0.0316 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.0850  on 22  degrees of freedom
## Residual deviance:  9.4229  on 21  degrees of freedom
## AIC: 13.423
## 
## Number of Fisher Scoring iterations: 7

pro<-predict (depstep, datos.test,type = "response")
prdBln1 <- ifelse(pro > 0.5, 1, 0)
cnfmtrx1 <- table(prd=prdBln1, act=datos.test$pib)
confusionMatrix(cnfmtrx1)

## Confusion Matrix and Statistics
## 
##    act
## prd 0 1
##   0 8 1
##   1 0 1
##                                          
##                Accuracy : 0.9            
##                  95% CI : (0.555, 0.9975)
##     No Information Rate : 0.8            
##     P-Value [Acc > NIR] : 0.3758         
##                                          
##                   Kappa : 0.6154         
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.5000         
##          Pos Pred Value : 0.8889         
##          Neg Pred Value : 1.0000         
##              Prevalence : 0.8000         
##          Detection Rate : 0.8000         
##    Detection Prevalence : 0.9000         
##       Balanced Accuracy : 0.7500         
##                                          
##        'Positive' Class : 0              
##

x<- as.numeric(datos.test$pib)
pmea<-mean(abs(x-pro))*100
pmea

## [1] 12.02898

for(i in 1:100){
  train.cb <- sample(1:nrow(modelo1), nrow(modelo1)*0.7)
  datos.trainb <- modelo1[train.cb, ]
  datos.testb <- modelo1[-train.cb, ]
  br <- glm(pib~tvahorro,data =datos.trainb, family = binomial(link = "logit"))
  br.<-predict (br, datos.testb,type = "response")
  br1<-c(br1,br.)
  año<-c(año,as.numeric(row.names(datos.testb))+1983)
  pmeabuclelogit1<-mean(abs((as.numeric(datos.testb$pib))-br.))*100
  prdBlnbucle1 <- ifelse(br. > 0.5, 1, 0)
  scorebucle1<-mean(datos.testb$pib == prdBlnbucle1)
  vectorlogit1[i]<- pmeabuclelogit1
  newvectorlogit1[i]<-scorebucle1
  ea<-c(ea,  prdBlnbucle1)
  ea1<-c(ea1,datos.testb[,1])
}
año<-año[-1]
ea<-ea[-1]
ea1<-ea1[-1]
br1<-br1[-1]
confusionMatrix(table(prd=ea, act=ea1))

## Confusion Matrix and Statistics
## 
##    act
## prd   0   1
##   0 749  65
##   1  41 145
##                                           
##                Accuracy : 0.894           
##                  95% CI : (0.8732, 0.9124)
##     No Information Rate : 0.79            
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6665          
##  Mcnemar's Test P-Value : 0.02549         
##                                           
##             Sensitivity : 0.9481          
##             Specificity : 0.6905          
##          Pos Pred Value : 0.9201          
##          Neg Pred Value : 0.7796          
##              Prevalence : 0.7900          
##          Detection Rate : 0.7490          
##    Detection Prevalence : 0.8140          
##       Balanced Accuracy : 0.8193          
##                                           
##        'Positive' Class : 0               
##

mean(vectorlogit1)

## [1] 12.53793

g<-ggplot(data=datosgrafico1, aes(Año,Probabilidades))+scale_x_continuous(breaks = seq(1984, 2016, by = 1)) +annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                            alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))
 for(i in 1:100){
g<-g+geom_point(shape=1,stroke=2,colour = 'steelblue',size=3.5,aes_string(año[(i*10-9):(i*10)],br1[(i*10-9):(i*10)]*100))
}
g

for (i in 1:33){
  trainloo<-modelo1[-i,]
  testloo<- modelo1[i,]
  ppro <- glm(pib~tvahorro,data =trainloo, family = binomial(link = "probit"))
  ppro1<-predict (ppro, testloo,type = "response")
  probap<-c(probap,ppro1)
  testloo <- as.numeric(modelo1[i,1])
  prdloopro <- ifelse( ppro1> 0.5, 1, 0)
  scorevectorpro[i]<-mean(testloo ==  prdloopro)
  vectorloopro[i]<-mean(abs(testloo-ppro1))*100
  propro[i]<-ppro1
}

mean(scorevectorpro)

## [1] 0.8787879

mean(vectorloopro)

## [1] 13.67422

ggplot(data=datosgrafico, aes(Año,Probabilidades))+annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                            alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  geom_line(linetype=1, colour = 'steelblue',size=1.8,aes(datosgrafico$Año,probap*100))+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  geom_point(shape=21,fill="white",stroke=3,size=3.5,colour = 'black',aes(datosgrafico$Año,probap*100))+
  scale_x_continuous(breaks = round(seq(1984, 2016, by = 1))) +
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

medialo<-rep(mean(as.numeric(modelo1$pib)),33)
tfglcompleto <- glm(pib~tvahorro,data =modelo1, family = binomial(link = "logit"))
tfglcompleto1<-predict (tfglcompleto, modelo1,type = "response")
sslc<- sum((tfglcompleto1-medialo)^2)
sstlc<- sum(((as.numeric(modelo1$pib))-medialo)^2)
rlc<-(sslc/sstlc)
rlc

## [1] 0.6501186

PROBIT

probit <- glm(pib~tvahorro,data =datos.trainb,
                     family = binomial(link = "probit"))
summary(probit)

## 
## Call:
## glm(formula = pib ~ tvahorro, family = binomial(link = "probit"), 
##     data = datos.trainb)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.86766  -0.37703  -0.09617  -0.00007   1.46058  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.4166     0.5575   0.747   0.4549  
## tvahorro    -34.4754    14.6539  -2.353   0.0186 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.085  on 22  degrees of freedom
## Residual deviance: 11.649  on 21  degrees of freedom
## AIC: 15.649
## 
## Number of Fisher Scoring iterations: 8

l1probit<-predict (probit, datos.testb,type = "response")

prdBlnp <- ifelse(l1probit > 0.5, 1, 0)
cnfmtrxp <- table(prd=prdBlnp, act=datos.test$pib)
confusionMatrix(cnfmtrxp)

## Confusion Matrix and Statistics
## 
##    act
## prd 0 1
##   0 6 2
##   1 2 0
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.2624, 0.8784)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 0.9672          
##                                           
##                   Kappa : -0.25           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.750           
##             Specificity : 0.000           
##          Pos Pred Value : 0.750           
##          Neg Pred Value : 0.000           
##              Prevalence : 0.800           
##          Detection Rate : 0.600           
##    Detection Prevalence : 0.800           
##       Balanced Accuracy : 0.375           
##                                           
##        'Positive' Class : 0               
##

pmeap<-mean(abs(x-l1probit))*100
pmeap

## [1] 40.50727

for(i in 1:100){
  train.cb <- sample(1:nrow(modelo1), nrow(modelo1)*0.7)
  datos.trainb <- modelo1[train.cb, ]
  datos.testb <- modelo1[-train.cb, ]
  finalprobit <- glm(pib~tvahorro,data =datos.trainb,
                     family = binomial(link = "probit"))
  final1probit<-predict (finalprobit, datos.testb,type = "response")
  brpr<-c(brpr,final1probit)
  añoprobit<-c(añoprobit,as.numeric(row.names(datos.testb))+1983)
  prdBlfinal <- ifelse(final1probit > 0.5, 1, 0)
  scorefinalprobit<-mean(datos.testb$pib ==  prdBlfinal)
  pmeabucleprobit<-mean(abs((as.numeric(datos.testb$pib))-final1probit))*100
  newvectorprobit[i]<-scorefinalprobit
  vectorprobit[i]<- pmeabucleprobit
  elpr<-c(elpr, prdBlfinal)
  elpr1<-c(elpr1,as.numeric(datos.testb[,1]))
}

primeropr <- table(prd=elpr, act=elpr1)
confusionMatrix(primeropr)

## Confusion Matrix and Statistics
## 
##    act
## prd   0   1
##   0 748  66
##   1  49 137
##                                           
##                Accuracy : 0.885           
##                  95% CI : (0.8636, 0.9041)
##     No Information Rate : 0.797           
##     P-Value [Acc > NIR] : 1.07e-13        
##                                           
##                   Kappa : 0.6332          
##  Mcnemar's Test P-Value : 0.1357          
##                                           
##             Sensitivity : 0.9385          
##             Specificity : 0.6749          
##          Pos Pred Value : 0.9189          
##          Neg Pred Value : 0.7366          
##              Prevalence : 0.7970          
##          Detection Rate : 0.7480          
##    Detection Prevalence : 0.8140          
##       Balanced Accuracy : 0.8067          
##                                           
##        'Positive' Class : 0               
##

mean(vectorprobit)

## [1] 13.45202

p<-ggplot(data=datosgrafico1, aes(Año,Probabilidades))+scale_x_continuous(breaks = seq(1984, 2016, by = 1)) +annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                                                                                      alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))
for(i in 1:100){
  p<-p+geom_point(shape=1,stroke=2,colour = 'steelblue',size=3.5,aes_string(añoprobit[(i*10-9):(i*10)],brpr[(i*10-9):(i*10)]*100))
}
p

for (i in 1:33){
  trainloo<-modelo1[-i,]
  testloo<- modelo1[i,]
  ppro <- glm(pib~tvahorro,data =trainloo, family = binomial(link = "probit"))
  ppro1<-predict (ppro, testloo,type = "response")
  probap<-c(probap,ppro1)
  testloo <- as.numeric(modelo1[i,1])
  prdloopro <- ifelse( ppro1> 0.5, 1, 0)
  scorevectorpro[i]<-mean(testloo ==  prdloopro)
  vectorloopro[i]<-mean(abs(testloo-ppro1))*100
  propro[i]<-ppro1
}

mean(scorevectorpro)

## [1] 0.8787879

mean(vectorloopro)

## [1] 13.67422

ggplot(data=datosgrafico, aes(Año,Probabilidades))+annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                            alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  geom_line(linetype=1, colour = 'steelblue',size=1.8,aes(datosgrafico$Año,propro*100))+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  geom_point(shape=21,fill="white",stroke=3,size=4.5,colour = 'black',aes(datosgrafico$Año,propro*100))+
  scale_x_continuous(breaks = round(seq(1984, 2016, by = 1))) +
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

proc <- glm(pib~tvahorro,data =modelo1, family = binomial(link = "probit"))
summary(proc)

## 
## Call:
## glm(formula = pib ~ tvahorro, family = binomial(link = "probit"), 
##     data = modelo1)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.88838  -0.21035  -0.04140  -0.00001   1.54935  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   0.3860     0.5234   0.738  0.46077   
## tvahorro    -38.2484    14.3619  -2.663  0.00774 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 34.106  on 32  degrees of freedom
## Residual deviance: 12.594  on 31  degrees of freedom
## AIC: 16.594
## 
## Number of Fisher Scoring iterations: 9

proc1<-predict (proc, modelo1,type = "response")


sslp<- sum((proc1-medialo)^2)
sstlc<- sum(((as.numeric(modelo1$pib))-medialo)^2)
rlcp<-(sslp/sstlc)
rlcp

## [1] 0.6473693

DISCRIMINANT ANALYSIS

LDA <- lda(pib ~negocios, data =datos.train, method = "mve")
LDA1<- predict(LDA, datos.test)
meLDA<-mean(abs(x-(LDA1$posterior[,2])))*100
meLDA

## [1] 1.533611

for(i in 1:100){
  train.cb <- sample(1:nrow(modelo1), nrow(modelo1)*0.7)
  datos.trainb <- modelo1[train.cb, ]
  datos.testb <- modelo1[-train.cb, ]
  modelomve <- lda(pib ~negocios, data =datos.trainb, method = "mve")
  mve <- predict(modelomve, newdata = datos.testb)
  problda<-c(problda,mve$posterior[,2])
  mveif <- ifelse(mve$posterior[,2] > 0.5, 1, 0)
  prelda3[i]<-mean(datos.testb$pib ==  mveif)
  mepalda3[i]<-mean(abs((as.numeric(datos.testb$pib))-mve$posterior[,2]))*100
  añolda<-c(añolda,as.numeric(row.names(datos.testb))+1983)
  elprlda<-c(elprlda, mveif)
  elprlda1<-c(elprlda1,as.numeric(datos.testb[,1]))
}

primerolda <- table(prd=elprlda, act=elprlda1)
confusionMatrix(primerolda)

## Confusion Matrix and Statistics
## 
##    act
## prd   0   1
##   0 790  26
##   1   0 184
##                                           
##                Accuracy : 0.974           
##                  95% CI : (0.9621, 0.9829)
##     No Information Rate : 0.79            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9179          
##  Mcnemar's Test P-Value : 9.443e-07       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8762          
##          Pos Pred Value : 0.9681          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.7900          
##          Detection Rate : 0.7900          
##    Detection Prevalence : 0.8160          
##       Balanced Accuracy : 0.9381          
##                                           
##        'Positive' Class : 0               
##

mean(mepalda3)

## [1] 5.405337

l<-ggplot(data=datosgrafico1, aes(Año,Probabilidades))+scale_x_continuous(breaks = seq(1984, 2016, by = 1)) +annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                            alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

for(i in 1:100){
  l<-l+geom_point(shape=1,stroke=2,colour = 'steelblue',size=3.5,aes_string(añolda[(i*10-9):(i*10)],problda[(i*10-9):(i*10)]*100))
}
l

for (i in 1:33){
  trainloo<- modelo1[-i,]
  testloo<- modelo1[i,]
  modelomveloo <- lda(pib ~negocios, data =trainloo, method = "mve")
  mveloo <- predict(modelomveloo, newdata = testloo, type = "response")
  testloo <- as.numeric(modelo1[i,1])
  testloo <- testloo-1
  mveifloo <- ifelse(mveloo$posterior[,2] > 0.5, 1, 0)
  prelda3loo[i]<-mean(testloo ==  mveifloo)
  mepalda3loo[i]<-mean(abs(testloo-mveloo$posterior[,2]))*100
  probldaloo<-c(probldaloo,mveloo$posterior[,2])
}

mean(prelda3loo)

## [1] 0.03030303

mean(mepalda3loo)

## [1] 97.82513

ggplot(data=datosgrafico, aes(Año,Probabilidades))+annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                            alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  geom_line(linetype=1, colour = 'steelblue',size=1.8,aes(datosgrafico$Año,probldaloo*100))+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  geom_point(shape=21,fill="white",stroke=3,size=4.5,colour = 'black',aes(datosgrafico$Año,probldaloo*100))+
  scale_x_continuous(breaks = round(seq(1984, 2016, by = 1))) +
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

LDAc <- lda(pib ~negocios, data =modelo1, method = "mve")
LDAc1<- predict(LDAc, modelo1)
ss<- sum((LDAc1$posterior[,2]-medialo)^2)
sst<- sum(((as.numeric(modelo1$pib))-medialo)^2)
rlda<-(ss/sst)
rlda

## [1] 0.8573345

SUPPORT VECTOR MACHINE

svm <- svm(pib ~ negocios+euribor, data =datos.train, probability=TRUE)
svm1 <- predict(svm, datos.test, probability=TRUE)

for(i in 1:100){
  train.cb <- sample(1:nrow(modelo1), nrow(modelo1)*0.7)
  datos.trainb <- modelo1[train.cb, ]
  datos.testb <- modelo1[-train.cb, ]
  fitbucle <- svm(pib ~ negocios+euribor, data =datos.trainb, probability=TRUE)
  predbucle1 <- predict(fitbucle, datos.testb)
  svmif <- ifelse(predbucle1 > 0.5, 1, 0)
  pmeasvmbucle<-mean(abs(predbucle1-((as.numeric(datos.testb$pib)))))*100
  scorebucle<-mean(datos.testb$pib ==   svmif)
  vectorsvm[i]<- pmeasvmbucle
  newvectorsvm[i]<- scorebucle
  elSVM<-c(elSVM,  svmif)
  elSVM1<-c(elSVM1,as.numeric(datos.testb[,1]))
  probSVM<-c(probSVM,predbucle1)
  añoSVM<-c(añoSVM,as.numeric(row.names(datos.testb))+1983)
}

primeroSVM <- table(prd=elSVM, act=elSVM1)
confusionMatrix(primeroSVM)

## Confusion Matrix and Statistics
## 
##    act
## prd   0   1
##   0 765 117
##   1   0 118
##                                           
##                Accuracy : 0.883           
##                  95% CI : (0.8614, 0.9023)
##     No Information Rate : 0.765           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6068          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.5021          
##          Pos Pred Value : 0.8673          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.7650          
##          Detection Rate : 0.7650          
##    Detection Prevalence : 0.8820          
##       Balanced Accuracy : 0.7511          
##                                           
##        'Positive' Class : 0               
##

mean(vectorsvm)

## [1] 15.82514

for (i in 1:33){
  trainloo<- modelo1[-i,]
  testloo<- modelo1[i,]
  fitbucleLOO <- svm(pib ~ euribor+negocios, data =trainloo, probability=TRUE)
  predbucleLOO <- predict(fitbucleLOO, testloo, probability=TRUE)
  testloo <- as.numeric(modelo1[i,1])
  prosvm[i]<-predbucleLOO
   vectorLOOsvm[i]<-mean(abs(predbucleLOO-testloo))*100
   ifloosvm <- ifelse(predbucleLOO > 0.5, 1, 0)
  newvectorLOOsvm[i]<-mean(testloo ==  ifloosvm) 
}

mean( vectorLOOsvm)

## [1] 11.65092

mean(newvectorLOOsvm)

## [1] 0.9393939

ggplot(data=datosgrafico, aes(Año,Probabilidades))+annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                            alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  geom_line(linetype=1, colour = 'steelblue',size=1.8,aes(datosgrafico$Año,prosvm*100))+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  geom_point(shape=21,fill="white",stroke=3,size=4.5,colour = 'black',aes(datosgrafico$Año,prosvm*100))+
  scale_x_continuous(breaks = round(seq(1984, 2016, by = 1))) +
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

DECISION TREES

ac <- rpart(pib ~ TVFBC+tvgastobruto, data =datos.train)
ac1<- predict(ac,datos.test)
pemaac<-mean(abs((as.numeric(datos.test$pib))-ac1)*100)
pemaac

## [1] 20

for(i in 1:100){
  train.cb <- sample(1:nrow(modelo1), nrow(modelo1)*0.7)
  datos.trainb <- modelo1[train.cb, ]
  datos.testb <- modelo1[-train.cb, ]
  acc <- rpart(pib ~ TVFBC+tvgastobruto, data =datos.trainb)
  acc1<- predict(acc,datos.testb)
  probaac<-c(probaac,acc1)
  prd <- ifelse(acc1 > 0.5, 1, 0)
  accscore[i]<-mean(datos.testb$pib == prd)
  acmepa[i]<-mean(abs(x- acc1))*100
  añoac<-c(añoac,as.numeric(row.names(datos.testb))+1983)
  
  probac<-c(probac, prd)
  probac1<-c(probac1,as.numeric(datos.testb[,1]))
}

probac<-probac[-1]
añoac<-añoac[-1]
probac1<-probac1[-1]
probaac<-probaac[-1]

primeroac <- table(prd=probac, act=probac1)
confusionMatrix(primeroac)

## Confusion Matrix and Statistics
## 
##    act
## prd   0   1
##   0 708  58
##   1  83 151
##                                         
##                Accuracy : 0.859         
##                  95% CI : (0.8359, 0.88)
##     No Information Rate : 0.791         
##     P-Value [Acc > NIR] : 2.019e-08     
##                                         
##                   Kappa : 0.5915        
##  Mcnemar's Test P-Value : 0.04326       
##                                         
##             Sensitivity : 0.8951        
##             Specificity : 0.7225        
##          Pos Pred Value : 0.9243        
##          Neg Pred Value : 0.6453        
##              Prevalence : 0.7910        
##          Detection Rate : 0.7080        
##    Detection Prevalence : 0.7660        
##       Balanced Accuracy : 0.8088        
##                                         
##        'Positive' Class : 0             
##

mean(acmepa)

## [1] 29.55714

ac<-ggplot(data=datosgrafico1, aes(Año,Probabilidades))+scale_x_continuous(breaks = seq(1984, 2016, by = 1)) +annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                                                                                      alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

for(i in 1:100){
  ac<-ac+geom_point(shape=1,stroke=2,colour = 'steelblue',size=3.5,
                    aes_string(añoac[(i*10-9):(i*10)],
                     probaac[(i*10-9):(i*10)]*100))
}
ac

for (i in 1:33){
  trainloo<- modelo1[-i,]
  testloo<- modelo1[i,]
  accc <- rpart(pib ~ TVFBC+tvgastobruto, data =trainloo)
  accc1<- predict(accc,testloo)
  testloo <- as.numeric(modelo1[i,1])
  prdlcc <- ifelse(accc1 > 0.5, 1, 0)
  aclooscore[i]<-mean(testloo == prdlcc)
  acloomepa[i]<-mean(abs(testloo-accc1))*100
  
  probac[i]<-accc1
}
mean(aclooscore)

## [1] 0.969697

mean(acloomepa)

## [1] 6.060606

ggplot(data=datosgrafico, aes(Año,Probabilidades))+annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                            alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  geom_line(linetype=1, colour = 'steelblue',size=1.8,aes(datosgrafico$Año,probac*100))+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  geom_point(shape=21,fill="white",stroke=3,size=4.5,colour = 'black',aes(datosgrafico$Año,probac*100))+
  scale_x_continuous(breaks = round(seq(1984, 2016, by = 1))) +
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

ssac<- sum((probac-medialo)^2)
sstac<- sum(((as.numeric(modelo1$pib))-medialo)^2)
rac<-(ssac/sstac)
rac

## [1] 0.844584

RANDOM FOREST

set.seed(71)
RFcompleto<-randomForest(as.factor(pib) ~ TVFBC+tvgastobruto,importance=TRUE,
                        proximity=TRUE, data =datos.train)
RFcompleto1 <- predict(RFcompleto, newdata = datos.test, type = 'prob')[,2]

for(i in 1:100){
  train.cb <- sample(1:nrow(modelo1), nrow(modelo1)*0.7)
  datos.trainb <- modelo1[train.cb, ]
  datos.testb <- modelo1[-train.cb, ]
  RFbucle<-randomForest(as.factor(pib) ~ TVFBC+tvgastobruto, data =datos.trainb)
  fitForestbucle <- predict(RFbucle, newdata = datos.testb, type = 'prob')[, 2]
  fitForestprueba <- predict(RFbucle, newdata = datos.testb)
  probrfg<-c(probrfg,fitForestbucle)
  pmeabuclet<-mean(abs((as.numeric(datos.testb[,1]))-fitForestbucle ))*100
  score<-mean(datos.testb$pib == fitForestprueba)
  vector[i]<- pmeabuclet
  newvector[i]<- score
  
  añorf<-c(añorf,as.numeric(row.names(datos.testb))+1983)
  
  probrf<-c(probrf,  as.numeric(fitForestprueba)-1)
  probrf1<-c(probrf1,as.numeric(datos.testb[,1])-1)
}

mean(vector)

## [1] 4.795

rf<-ggplot(data=datosgrafico1, aes(Año,Probabilidades))+scale_x_continuous(breaks = seq(1984, 2016, by = 1)) +annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                                                                                       alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

for(i in 1:100){
  rf<-rf+geom_point(shape=1,stroke=2,colour = 'steelblue',size=3.5,
                    aes_string(añorf[(i*10-9):(i*10)],
                               probrfg[(i*10-9):(i*10)]*100))
}
rf

for (i in 1:33){
  trainloo<- modelo1[-i,]
  testloo<- modelo1[i,]
  RFbucle<-randomForest(as.factor(pib) ~ TVFBC+tvgastobruto, data =trainloo)
  fitForestloo <- predict(RFbucle, newdata = testloo, type = 'prob')[, 2]
  pr <- ifelse(fitForestloo > 0.5, 1, 0)
  testloo <- as.numeric(modelo1[i,1])
  vectorrfscore[i]<-mean(testloo == pr)
  vectorrf[i]<-mean(abs(testloo-fitForestloo))*100
  probabilidadesrf[i]<-fitForestloo
  
}

mean(vectorrf)

## [1] 4.424242

mean(vectorrfscore)

## [1] 1

ggplot(data=datosgrafico, aes(Año,Probabilidades))+annotate("rect", xmin = 1992.5, xmax = 1993.5, ymin = 0, ymax = 100,
                                                            alpha = .5)+labs(y="Probabilidades (%)")+
  annotate("rect", xmin = 2007.5, xmax = 2013.5, ymin = 0, ymax = 100,
           alpha = .5)+
  geom_line(linetype=1, colour = 'steelblue',size=1.8,aes(datosgrafico$Año,probabilidadesrf*100))+
  theme(panel.background = element_rect(fill = 'grey93'),
        panel.grid.minor = element_line(colour="white",size=1),
        panel.grid.major = element_line(colour="white",size=1))+
  geom_point(shape=21,fill="white",stroke=3,size=4.5,colour = 'black',aes(datosgrafico$Año,probabilidadesrf*100))+
  scale_x_continuous(breaks = round(seq(1984, 2016, by = 1))) +
  scale_y_continuous(breaks = round(seq(0, 100, by = 10)))+
  theme(text = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

RFcompleto<-randomForest(as.factor(pib) ~ TVFBC+tvgastobruto, data =modelo1)
RFcompleto1 <- predict(RFcompleto, newdata = modelo1, type = 'prob')[, 2]
ssrf<- sum((RFcompleto1-medialo)^2)
sstrf<- sum(((as.numeric(modelo1$pib))-medialo)^2)
rsvm<-(ssrf/sstrf)
rsvm

## [1] 0.9216059

ARIMA

atvahorro<-auto.arima(modelo1$tvahorro) 
plot(forecast(atvahorro,h=4))

aconstruccion<-auto.arima(modelo1$Construccion)
plot(forecast(aconstruccion))

anegocios<-auto.arima(modelo1$negocios)
plot(forecast(anegocios))

aeuribor<-auto.arima(modelo1$euribor) 
plot(forecast(aeuribor))

atvfbc<-auto.arima(modelo1$tvfbc)
plot(forecast(atvfbc))

atvgastobruto<-auto.arima(modelo1$tvgastobruto)
plot(forecast(atvgastobruto))

ptvahrro<-forecast(atvahorro,h=4)
pconstruccion<-forecast(aconstruccion,h=4)
pnegocios<-forecast(anegocios,h=4)
peuribor<-forecast(aeuribor,h=4)
ptvfbc<-forecast(atvfbc,h=4)
ptvgastobruto<-forecast(atvgastobruto,h=4)

up<-ptvahrro$mean + 0.8416*sqrt(atvahorro$sigma2)
lo<-ptvahrro$mean - 0.8416*sqrt(atvahorro$sigma2)

upc<-pconstruccion$mean + 0.8416*sqrt(aconstruccion$sigma2)
loc<-pconstruccion$mean - 0.8416*sqrt(aconstruccion$sigma2)

upn<-pnegocios$mean + 0.8416*sqrt(anegocios$sigma2)
lon<-pnegocios$mean - 0.8416*sqrt(anegocios$sigma2)

upe<-peuribor$mean + 0.8416*sqrt(aeuribor$sigma2)
loe<-peuribor$mean - 0.8416*sqrt(aeuribor$sigma2)

upp<-ptvfbc$mean + 0.8416*sqrt(atvfbc$sigma2)
lop<-ptvfbc$mean - 0.8416*sqrt(atvfbc$sigma2)

upg<-ptvgastobruto$mean + 0.8416*sqrt(atvgastobruto$sigma2)
log<-ptvgastobruto$mean - 0.8416*sqrt(atvgastobruto$sigma2)

datalogit<-data.frame(modelo1$pib,modelo1$tvahorro,
                      modelo1$Construccion,modelo1$negocios,modelo1$euribor,
                      modelo1$tvfbc,modelo1$tvgastobruto)
colnames(datalogit)<-c("pib","tvahorro","construccion","negocios","euribor","tvfbc","tvgastobruto")
prelogit<-glm(pib~tvahorro+construccion,family = binomial(link = "logit"),datalogit)

datalogitp<-data.frame(ptvahrro$mean,pconstruccion$mean,pnegocios$mean,
                       peuribor$mean,ptvfbc$mean,ptvgastobruto$mean)
colnames(datalogitp)<-c("tvahorro","construccion","negocios","euribor","tvfbc","tvgastobruto")
datalogitpe<-data.frame(lo,loc,lon,
                        loe,lop,log)
colnames(datalogitpe)<-c("tvahorro","construccion","negocios","euribor","tvfbc","tvgastobruto")
datalogitop<-data.frame(up,upc,upn,upe,upp,upg)
colnames(datalogitop)<-c("tvahorro","construccion","negocios","euribor","tvfbc","tvgastobruto")
prediclogitprobable<-predict(prelogit,newdata = datalogitp,type="response"); prediclogitprobable*100

##        1        2        3        4 
## 3.589468 4.122017 4.184879 4.459766

prediclogitpesimista<-predict(prelogit,newdata = datalogitpe,type="response");prediclogitpesimista*100

##        1        2        3        4 
## 39.71775 43.20820 43.59609 45.23750

prediclogitoptimista<-predict(prelogit,newdata = datalogitop,type="response");prediclogitoptimista*100

##         1         2         3         4 
## 0.2099439 0.2423526 0.2462004 0.2630826

######probit#######

preprobit <- glm(pib~tvahorro,data =datalogit, family = binomial(link = "probit"))

predicprobitprobable<-predict(preprobit,newdata = datalogitp,type="response");predicprobitprobable*100

##         1         2         3         4 
## 0.4153453 0.4841371 0.4516590 0.4661914

predicprobitpesimista<-predict(preprobit,newdata = datalogitpe,type="response");predicprobitpesimista*100

##        1        2        3        4 
## 14.05275 15.25457 14.69938 14.95036

predicprobitoptimista<-predict(preprobit,newdata = datalogitop,type="response");predicprobitoptimista*100

##           1           2           3           4 
## 0.001330360 0.001674475 0.001508510 0.001582008

#####lda####

LDAp <- lda(pib ~negocios, data =datalogit, method = "mve")
ldaprobable<-predict(LDAp, datalogitp); ldaprobable$posterior[,2]*100

##            1            2            3            4 
## 0.0003083086 0.1197622942 0.2922833337 0.3340571737

ldappesimista<-predict(LDAp, datalogitpe);ldappesimista$posterior[,2]*100

##           1           2           3           4 
##  0.06138178 19.28120883 36.86769556 40.03782460

ldapoptimista<-predict(LDAp, datalogitop);ldapoptimista$posterior[,2]*100

##            1            2            3            4 
## 1.547633e-06 6.018924e-04 1.471465e-03 1.682471e-03

####svm####
svmpre <- svm(pib ~ negocios+euribor, data =datalogit, probability=TRUE)
svmp <- predict(svmpre, datalogitp, probability=TRUE);svmp*100

##        1        2        3        4 
## 7.294759 4.087522 6.697190 7.407709

svmpe <- predict(svmpre, datalogitpe, probability=TRUE);svmpe*100

##         1         2         3         4 
##  7.029029 34.745248 41.746702 42.852690

svmop <- predict(svmpre, datalogitop, probability=TRUE);svmop*100

##          1          2          3          4 
## 18.1721526  2.7104780  0.9663035  0.9078932

####arboles#####
acpre <- rpart(pib ~ tvfbc+tvgastobruto, data =datalogit)
acpro<-predict(acpre, datalogitp);acpro*100

## 1 2 3 4 
## 0 0 0 0

acpesimista<-predict(acpre, datalogitpe);acpesimista*100

## 1 2 3 4 
## 0 0 0 0

acoptimista<-predict(acpre, datalogitop);acoptimista*100

## 1 2 3 4 
## 0 0 0 0

####randomforest#####
RFpre<-randomForest(as.factor(pib) ~ tvfbc+tvgastobruto, data =datalogit)
RFpepro <- predict(RFpre, newdata = datalogitp, type = 'prob')[, 2];RFpepro*100

## 1 2 3 4 
## 0 0 0 0

RFpesimista <- predict(RFpre, newdata = datalogitpe,
                       type = 'prob')[, 2];RFpesimista*100

##    1    2    3    4 
## 36.2 66.0 66.2 66.4

RFoptimista <- predict(RFpre, newdata = datalogitop,
                       type = 'prob')[, 2];RFoptimista*100

## 1 2 3 4 
## 0 0 0 0

FORECASTING MODELS OF RECESION IN SPAIN APPLYING MACHINE LEARNING TECHNIQUES

José Buitrago Sánchez-Molero

3 de mayo de 2018

LOGIT

PROBIT

DISCRIMINANT ANALYSIS

SUPPORT VECTOR MACHINE

DECISION TREES

RANDOM FOREST

ARIMA