Class 27 - Correlation and regression
1. Types of regression
library("ggplot2")
source("multiplot.R")
set.seed(10)
x=sample(1:100,100,replace = T)
# Linear regression
m=2
c=60
y=m*x+c
yprima=y+y*runif(100,-0.4,0.4)
datos=data.frame(x,y,yprima)
g1=ggplot()+geom_point(aes(x,yprima),datos,color="gray33") +geom_line(aes(x,y),datos,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión lineal simple") + theme(plot.title = element_text(hjust = 0.5))
# Exponential regression
A=2
b=0.02
y=A*exp(-b*x)
yprima=y+y*runif(100,-0.4,0.4)
datos=data.frame(x,y,yprima)
g2=ggplot()+geom_point(aes(x,yprima),datos,color="gray33") +geom_line(aes(x,y),datos,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión exponencial") + theme(plot.title = element_text(hjust = 0.5))
# Logarithmic regression
a=2
y=a*log(x)
yprima=y+y*runif(100,-0.4,0.4)
datos=data.frame(x,y,yprima)
g3=ggplot()+geom_point(aes(x,yprima),datos,color="gray33") +geom_line(aes(x,y),datos,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión logarítmica") + theme(plot.title = element_text(hjust = 0.5))
# Polynomial regression (cubic)
a=3
b=3
c=3
d=3
y=a+b*x+c*x^2+d*x^3
yprima=y+y*runif(100,-0.4,0.4)
datos=data.frame(x,y,yprima)
g4=ggplot()+geom_point(aes(x,yprima),datos,color="gray33") +geom_line(aes(x,y),datos,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión cúbica") + theme(plot.title = element_text(hjust = 0.5))
multiplot(g1,g2,g3,g4,cols = 2)
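multiplot.R is an external helper script sourced above; if it is not available, gridExtra provides an equivalent layout (a sketch, assuming gridExtra is installed):
library(gridExtra)                      # assumption: gridExtra is available
grid.arrange(g1, g2, g3, g4, ncol = 2)  # same 2 x 2 arrangement as multiplot()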
2. Simple linear regression
library("knitr")
datos=read.table("BD.csv",header = T,sep=",")
names(datos)=c("PAIS","GASTO_US","EXPVIDA_ANOS")
kable(head(datos))
PAIS | GASTO_US | EXPVIDA_ANOS |
---|---|---|
Australia | 4492.554 | 82.5 |
Austria | 5100.015 | 81.3 |
Belgium | 4778.452 | 81.1 |
Chile | 1877.359 | 79.1 |
Czech Republic | 2466.027 | 78.7 |
Denmark | 5057.861 | 80.8 |
g1=ggplot(datos,aes(GASTO_US, EXPVIDA_ANOS, label = PAIS))+geom_point(aes(GASTO_US,EXPVIDA_ANOS),datos,color="green") +
theme_bw() + ylab("Expectativa de vida [años]") + xlab("Gasto en salud PP [$US]") + ggtitle("Gasto vs calidad de salud") +
theme(plot.title = element_text(hjust = 0.5)) + geom_text(check_overlap = TRUE) # drops labels that overlap others
plot(g1)
regresion = lm(EXPVIDA_ANOS ~ GASTO_US, data = datos)  # fit the simple linear regression
print(regresion)
##
## Call:
## lm(formula = EXPVIDA_ANOS ~ GASTO_US, data = datos)
##
## Coefficients:
## (Intercept) GASTO_US
## 7.798e+01 6.637e-04
datos$EXPVIDA_EST=regresion$coefficients[2]*datos$GASTO_US+regresion$coefficients[1] # fitted values: slope*x + intercept
g1=ggplot(datos,aes(GASTO_US, EXPVIDA_ANOS, label = PAIS))+geom_point(aes(GASTO_US,EXPVIDA_ANOS),datos,color="green") +
theme_bw() + ylab("Expectativa de vida [años]") + xlab("Gasto en salud PP [$US]") + ggtitle("Gasto vs calidad de salud") +
theme(plot.title = element_text(hjust = 0.5)) + geom_text(check_overlap = TRUE) + # drops labels that overlap others
geom_line(aes(GASTO_US,EXPVIDA_EST),datos,color="blue",cex=0.1)
plot(g1)
datos$promedio=mean(datos$EXPVIDA_ANOS) # constant column at the overall mean (red reference line)
g1=ggplot(datos,aes(GASTO_US, EXPVIDA_ANOS, label = PAIS))+geom_point(aes(GASTO_US,EXPVIDA_ANOS),datos,color="green") +
theme_bw() + ylab("Expectativa de vida [años]") + xlab("Gasto en salud PP [$US]") + ggtitle("Gasto vs calidad de salud") +
theme(plot.title = element_text(hjust = 0.5)) + geom_text(check_overlap = TRUE) + # drops labels that overlap others
geom_line(aes(GASTO_US,EXPVIDA_EST),datos,color="blue",cex=0.1) + geom_line(aes(GASTO_US,promedio),datos,color="red",cex=0.1)
plot(g1)
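The red line marks the overall mean, the baseline for the total sum of squares; the R-squared reported in the summary below can be recomputed from the columns created above (a quick sketch):
SST = sum((datos$EXPVIDA_ANOS - datos$promedio)^2)     # total variation around the mean (red line)
SSE = sum((datos$EXPVIDA_ANOS - datos$EXPVIDA_EST)^2)  # residual variation around the fit (blue line)
1 - SSE/SST                                            # should be close to the Multiple R-squared (~0.28)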
summary(regresion)
##
## Call:
## lm(formula = EXPVIDA_ANOS ~ GASTO_US, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4864 -0.7812 0.0651 1.4460 2.9796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.798e+01 8.012e-01 97.323 < 2e-16 ***
## GASTO_US 6.637e-04 1.870e-04 3.549 0.00122 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.097 on 32 degrees of freedom
## Multiple R-squared: 0.2824, Adjusted R-squared: 0.26
## F-statistic: 12.6 on 1 and 32 DF, p-value: 0.001219
confint(regresion)
##                    2.5 %       97.5 %
## (Intercept) 76.34447058 79.608509537
## GASTO_US 0.00028278 0.001044611
g1=ggplot(datos,aes(x=GASTO_US, y=EXPVIDA_ANOS, label = PAIS)) +
geom_point(aes(GASTO_US,EXPVIDA_ANOS),datos,color="green")+
theme_bw() + ylab("Expectativa de vida [años]")+
xlab("Gasto en salud PP [$US]") + ggtitle("Gasto vs calidad de salud") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(check_overlap = TRUE) + # drops labels that overlap others
geom_line(aes(GASTO_US,EXPVIDA_EST),datos,color="blue",cex=0.1) +
geom_smooth(method ="lm", formula = y ~ x,level=0.95)
plot(g1)
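The band drawn by geom_smooth is the 95% confidence interval of the fitted line; the same interval can be obtained numerically with predict() (a sketch; the spending value 5000 is only an illustrative input):
nuevo = data.frame(GASTO_US = 5000)  # hypothetical spending level
predict(regresion, nuevo, interval = "confidence", level = 0.95)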
3. Exponential regression
\[ y_i = a e^{b x_i} = 4 e^{-0.01 x_i} \]
## EXPONENTIAL
n=500; x=1:n; set.seed(10);y = 4*exp(-x*0.01); yprima=y+y*runif(100,-0.4,0.4); datos2=data.frame(x,y,yprima);
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +
theme_bw() + ylab("y") + ggtitle("Regresión exponencial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
## EXPONENTIAL
n=500; x=1:n; set.seed(10);y = 4*exp(-x*0.01); yprima=y+y*runif(100,-0.4,0.4); datos2=data.frame(x,y,yprima);
fit = nls(yprima ~ a*exp(-b*x),start = list(a = 1,b=0.01)); summary(fit)
##
## Formula: yprima ~ a * exp(-b * x)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 3.8364052 0.0510798 75.11 <2e-16 ***
## b 0.0100053 0.0001877 53.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2525 on 498 degrees of freedom
##
## Number of iterations to convergence: 3
## Achieved convergence tolerance: 1.242e-07
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +geom_line(aes(x,y),datos2,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión exponencial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
\[ y_i = a e^{b x_i} = 4 e^{-0.01 x_i} \]
\[ y'_i = a e^{b x_i} \approx 3.84 \, e^{-0.01 x_i} \]
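The black line in the plot above is the true curve used to simulate the data; the curve actually estimated by nls can be overlaid instead via predict() (a sketch):
datos2$yfit = predict(fit)  # fitted values a*exp(-b*x) at the observed x
g2 = ggplot() + geom_point(aes(x, yprima), datos2, color = "gray33") +
  geom_line(aes(x, yfit), datos2, color = "blue") +
  theme_bw() + ylab("y") + ggtitle("Exponential fit estimated by nls")
plot(g2)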
4. Power regression
\[ y_i = c\, x_i^{a} = 2\, x_i^{1.5} \]
## Power
n=500; x=1:n; set.seed(10);y = 2*x^1.5; yprima=y+y*runif(100,-0.8,0.8); datos2=data.frame(x,y,yprima);
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +
theme_bw() + ylab("y") + ggtitle("Regresión potencial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
## Power
datos2=data.frame(x,y,yprima);
fit = nls(yprima ~ a*x^b,start = list(a = 1,b=1)); summary(fit)
##
## Formula: yprima ~ a * x^b
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 1.99819 0.96997 2.06 0.0399 *
## b 1.48415 0.08133 18.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4666 on 498 degrees of freedom
##
## Number of iterations to convergence: 9
## Achieved convergence tolerance: 4.881e-08
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +geom_line(aes(x,y),datos2,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión potencial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
\[ y_i = c\, x_i^{a} = 2\, x_i^{1.5} \]
\[ y'_i = c\, x_i^{a} \approx 1.99\, x_i^{1.48} \]
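Since x and yprima are strictly positive here, the power model can also be estimated with a log-log linear regression instead of nls (a sketch; the estimates differ slightly because the noise is treated on the log scale):
fit_ll = lm(log(yprima) ~ log(x), data = datos2)  # log(y) = log(c) + a*log(x)
exp(coef(fit_ll)[1])  # estimate of c, roughly 2
coef(fit_ll)[2]       # estimate of a, roughly 1.5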
5. Logarithmic regression
\[ y_i = c + b \log(x_i) = 10 + 20 \log(x_i) \]
n=500; x=1:n; set.seed(10);y = 10 +20*log(x); yprima=y+y*runif(100,-0.1,0.1); datos2=data.frame(x,y,yprima);
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +
theme_bw() + ylab("y") + ggtitle("Regresión logarítmica") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
## Logarithmic
datos2=data.frame(x,y,yprima);
fit = nls(yprima ~ a+b*log(x),start = list(a = 1,b=1)); summary(fit)
##
## Formula: yprima ~ a + b * log(x)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 10.0901 1.4681 6.873 1.89e-11 ***
## b 19.7425 0.2763 71.445 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.024 on 498 degrees of freedom
##
## Number of iterations to convergence: 1
## Achieved convergence tolerance: 3.948e-09
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +geom_line(aes(x,y),datos2,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión logarítmica") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
\[ y_i = c + b \log(x_i) = 10 + 20 \log(x_i) \]
\[ y'_i = c + b \log(x_i) \approx 10.09 + 19.74 \log(x_i) \]
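This model is linear in its parameters, so lm() solves the same least-squares problem as the nls call without needing starting values (a sketch):
fit_lm = lm(yprima ~ log(x), data = datos2)  # intercept plays the role of a, the slope of b
summary(fit_lm)$coefficients                 # should reproduce the nls estimates above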
6. Polynomial regression
\[ y_i = a_0 + a_1 x_i + a_2 x_i^2 = 3 + 4 x_i + 5 x_i^2 \]
## Polynomial
n=500; x=1:n; set.seed(10);y = 3+4*x +5*x^2; yprima=y+y*runif(100,-0.1,0.1); datos2=data.frame(x,y,yprima);
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +
theme_bw() + ylab("y") + ggtitle("Regresión polinomial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
## Polynomial
datos2=data.frame(x,y,yprima);
fit = nls(yprima ~ a+b*x +c*x^2,start = list(a = 1,b=1,c=1)); summary(fit)
##
## Formula: yprima ~ a + b * x + c * x^2
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a -432.68666 3947.22654 -0.110 0.913
## b 10.86011 36.38562 0.298 0.765
## c 4.92818 0.07033 70.074 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29300 on 497 degrees of freedom
##
## Number of iterations to convergence: 1
## Achieved convergence tolerance: 1.631e-06
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +geom_line(aes(x,y),datos2,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión polinomial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
\[ y_i = a_0 + a_1 x_i + a_2 x_i^2 = 3 + 4 x_i + 5 x_i^2 \]
\[ y'_i = a_0 + a_1 x_i + a_2 x_i^2 \approx -432.7 + 10.86\, x_i + 4.93\, x_i^2 \approx 5 x_i^2 \]
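The quadratic model is also linear in its parameters, so the usual alternative to nls is lm() with explicit polynomial terms (a sketch):
fit_lm = lm(yprima ~ x + I(x^2), data = datos2)  # same least-squares problem as the nls call
summary(fit_lm)$coefficients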
7. Multiple linear regression
Nhospital | RRHH | BSERV | Egresos | DCO | DE |
---|---|---|---|---|---|
H1 | 19779705 | 10982608 | 15196 | 91433 | 93952 |
H2 | 27881106 | 15394204 | 15960 | 114528 | 136033 |
H3 | 29969438 | 15065061 | 17785 | 139584 | 141724 |
H4 | 11358890 | 3408115 | 7782 | 32531 | 32629 |
library(ggplot2)
g=ggplot(datos)+geom_point(aes(x=Egresos,y=RRHH)) +
annotate("text",x = max(datos$Egresos)*0.7, y = max(datos$RRHH)*0.9, label = paste("Corr:",round(cor(datos$RRHH,datos$Egresos),3)))
g1=ggplot(datos)+geom_point(aes(x=DCO,y=RRHH)) +
annotate("text", x = max(datos$DCO)*0.7, y = max(datos$RRHH)*0.9, label = paste("Corr:",round(cor(datos$RRHH,datos$DCO),3)))
g2=ggplot(datos)+geom_point(aes(x=DE,y=RRHH)) +
annotate("text", x = max(datos$DE)*0.7, y = max(datos$RRHH)*0.9, label = paste("Corr:",round(cor(datos$RRHH,datos$DE),3)))
source("multiplot.R")
multiplot(g,g1,g2,cols=3)
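The pairwise correlations shown in the panels can also be obtained in a single matrix (a sketch, using the column names from the table above):
round(cor(datos[, c("RRHH", "BSERV", "Egresos", "DCO", "DE")]), 3)  # correlation matrix of the numeric columns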
fit_RRHH <- lm(datos$RRHH ~ Egresos + DCO + DE, data = datos)  # full model with all three predictors
summary(fit_RRHH)
##
## Call:
## lm(formula = datos$RRHH ~ Egresos + DCO + DE, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8556957 -1018261 -174886 483225 16580483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 348070.83 253002.28 1.376 0.171
## Egresos 773.11 81.53 9.483 <2e-16 ***
## DCO 69.96 60.22 1.162 0.247
## DE 23.05 61.23 0.377 0.707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2738000 on 177 degrees of freedom
## Multiple R-squared: 0.9436, Adjusted R-squared: 0.9427
## F-statistic: 987.5 on 3 and 177 DF, p-value: < 2.2e-16
fit_1 <- lm(datos$RRHH~Egresos,data=datos)
fit_2 <- lm(datos$RRHH~DCO,data=datos)
fit_3 <- lm(datos$RRHH~DE,data=datos)
print(anova(fit_RRHH,fit_1,fit_2,fit_3))
## Analysis of Variance Table
##
## Model 1: datos$RRHH ~ Egresos + DCO + DE
## Model 2: datos$RRHH ~ Egresos
## Model 3: datos$RRHH ~ DCO
## Model 4: datos$RRHH ~ DE
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 177 1.3273e+15
## 2 179 1.8085e+15 -2 -4.8119e+14 32.084 1.289e-12 ***
## 3 179 2.0087e+15 0 -2.0021e+14
## 4 179 2.0419e+15 0 -3.3169e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
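fit_1, fit_2 and fit_3 are not nested in one another, so the sequential F-tests between them (the rows with Df = 0) are not meaningful; an information criterion is a safer way to rank the four models (a sketch):
AIC(fit_RRHH, fit_1, fit_2, fit_3)  # lower AIC indicates a better fit/complexity trade-off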
8. Multivariate regression
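The output in this section is consistent with a single multivariate lm() fit of both spending variables on the same predictors; a minimal sketch (the object name fit_mv is assumed):
fit_mv <- lm(cbind(RRHH, BSERV) ~ Egresos + DCO + DE, data = datos)  # two responses fitted at once
print(fit_mv)    # coefficient matrix, one column per response
summary(fit_mv)  # one lm-style summary per response, as shown below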
##
## Call:
## lm(formula = cbind(RRHH, BSERV) ~ Egresos + DCO + DE, data = datos)
##
## Coefficients:
## RRHH BSERV
## (Intercept) 348070.83 -567950.81
## Egresos 773.11 459.51
## DCO 69.96 94.25
## DE 23.05 -10.87
## Response RRHH :
##
## Call:
## lm(formula = RRHH ~ Egresos + DCO + DE, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8556957 -1018261 -174886 483225 16580483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 348070.83 253002.28 1.376 0.171
## Egresos 773.11 81.53 9.483 <2e-16 ***
## DCO 69.96 60.22 1.162 0.247
## DE 23.05 61.23 0.377 0.707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2738000 on 177 degrees of freedom
## Multiple R-squared: 0.9436, Adjusted R-squared: 0.9427
## F-statistic: 987.5 on 3 and 177 DF, p-value: < 2.2e-16
##
##
## Response BSERV :
##
## Call:
## lm(formula = BSERV ~ Egresos + DCO + DE, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8400687 -1132474 2251 465098 21709324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -567950.81 297613.19 -1.908 0.058 .
## Egresos 459.51 95.90 4.792 3.49e-06 ***
## DCO 94.25 70.83 1.331 0.185
## DE -10.87 72.03 -0.151 0.880
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3221000 on 177 degrees of freedom
## Multiple R-squared: 0.8677, Adjusted R-squared: 0.8654
## F-statistic: 386.9 on 3 and 177 DF, p-value: < 2.2e-16
9. Logistic regression
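The summary printed below appears to come from a glm() of Survived on Age with the default Gaussian family (no family argument, so this is a linear probability model rather than a true logistic regression); a sketch, assuming the titanic package supplies titanic_train:
library("titanic")                           # assumption: source of the titanic_train data frame
datos = titanic_train
modelo = glm(Survived ~ Age, data = datos)   # no family argument, so family = gaussian by default
summary(modelo)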
##
## Call:
## glm(formula = Survived ~ Age, data = datos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4811 -0.4158 -0.3662 0.5789 0.7252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.483753 0.041788 11.576 <2e-16 ***
## Age -0.002613 0.001264 -2.067 0.0391 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2404297)
##
## Null deviance: 172.21 on 713 degrees of freedom
## Residual deviance: 171.19 on 712 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 1012.6
##
## Number of Fisher Scoring iterations: 2
datos=titanic_train
modelo=glm(Survived ~ Age, data = datos)
plot(x = datos$Age, y = datos$Survived , col = "darkblue",
main = "probabilidad de sobrevida en función de la edad", xlab = "edad",
ylab = "Probabilidad de sobrevida")
par(new=TRUE)
num=exp(modelo$coefficients[1]+modelo$coefficients[2]*(datos$Age)) # intercept + slope*Age, passed through exp()
plot(datos$Age,num/(1+num),col="darkgreen",ylab="",yaxt='n',xaxt='n')
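For a genuine logistic regression the call needs family = binomial; a sketch of that alternative (its coefficients live on the log-odds scale and will differ from the Gaussian fit above):
modelo_logit = glm(Survived ~ Age, data = datos, family = binomial)
summary(modelo_logit)                            # coefficients on the log-odds scale
head(predict(modelo_logit, type = "response"))   # fitted survival probabilities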
library("dummies")
sexo=dummy(titanic_train$Sex) # Sex encoded as dummy variables
datos=titanic_train
datos_proc=data.frame(datos$Survived,datos$Age,sexo[,2])
names(datos_proc)=c("Sobrevida","Edad","Femenino")
modelo=glm(Sobrevida~., data = datos_proc) # Fit the model with all predictors
# Probability that a 40-year-old woman survives
print(predict(modelo, data.frame(Edad=40, Femenino=1), type='response'))
## 1
## 0.1967612
##
## Call:
## glm(formula = Sobrevida ~ ., data = datos_proc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7786 -0.2115 -0.1931 0.2471 0.8401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7804879 0.0394345 19.792 <2e-16 ***
## Edad -0.0009206 0.0010730 -0.858 0.391
## Femenino -0.5469036 0.0323428 -16.910 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1717122)
##
## Null deviance: 172.21 on 713 degrees of freedom
## Residual deviance: 122.09 on 711 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 773.22
##
## Number of Fisher Scoring iterations: 2