Class 27 - Correlation and regression
1. Types of regression
library("ggplot2")
source("multiplot.R")
set.seed(10)
x=sample(1:100,100,replace = T)
# Linear regression
m=2
c=60
y=m*x+c
yprima=y+y*runif(100,-0.4,0.4)
datos=data.frame(x,y,yprima)
g1=ggplot()+geom_point(aes(x,yprima),datos,color="gray33") +geom_line(aes(x,y),datos,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión lineal simple") + theme(plot.title = element_text(hjust = 0.5))
# Exponential regression
A=2
b=0.02
y=A*exp(-b*x)
yprima=y+y*runif(100,-0.4,0.4)
datos=data.frame(x,y,yprima)
g2=ggplot()+geom_point(aes(x,yprima),datos,color="gray33") +geom_line(aes(x,y),datos,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión exponencial") + theme(plot.title = element_text(hjust = 0.5))
# Logarithmic regression
a=2
y=a*log(x)
yprima=y+y*runif(100,-0.4,0.4)
datos=data.frame(x,y,yprima)
g3=ggplot()+geom_point(aes(x,yprima),datos,color="gray33") +geom_line(aes(x,y),datos,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión logarítmica") + theme(plot.title = element_text(hjust = 0.5))
# Polynomial regression (cubic)
a=3
b=3
c=3
d=3
y=a+b*x+c*x^2+d*x^3
yprima=y+y*runif(100,-0.4,0.4)
datos=data.frame(x,y,yprima)
g4=ggplot()+geom_point(aes(x,yprima),datos,color="gray33") +geom_line(aes(x,y),datos,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión cúbica") + theme(plot.title = element_text(hjust = 0.5))
multiplot(g1,g2,g3,g4,cols = 2)
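multiplot.R is an external helper script sourced above; if it is not available, gridExtra provides an equivalent layout (a sketch, assuming gridExtra is installed):
library(gridExtra)                      # assumption: gridExtra is available
grid.arrange(g1, g2, g3, g4, ncol = 2)  # same 2 x 2 arrangement as multiplot()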
2. Simple linear regression
library("knitr")
datos=read.table("BD.csv",header = T,sep=",")
names(datos)=c("PAIS","GASTO_US","EXPVIDA_ANOS")
kable(head(datos))
PAIS | GASTO_US | EXPVIDA_ANOS |
---|---|---|
Australia | 4492.554 | 82.5 |
Austria | 5100.015 | 81.3 |
Belgium | 4778.452 | 81.1 |
Chile | 1877.359 | 79.1 |
Czech Republic | 2466.027 | 78.7 |
Denmark | 5057.861 | 80.8 |
g1=ggplot(datos,aes(GASTO_US, EXPVIDA_ANOS, label = PAIS))+geom_point(aes(GASTO_US,EXPVIDA_ANOS),datos,color="green") +
theme_bw() + ylab("Expectativa de vida [años]") + xlab("Gasto en salud PP [$US]") + ggtitle("Gasto vs calidad de salud") +
theme(plot.title = element_text(hjust = 0.5)) + geom_text(check_overlap = TRUE) # drops labels that overlap others
plot(g1)
regresion = lm(EXPVIDA_ANOS ~ GASTO_US, data = datos)  # fit the simple linear regression
print(regresion)
##
## Call:
## lm(formula = EXPVIDA_ANOS ~ GASTO_US, data = datos)
##
## Coefficients:
## (Intercept) GASTO_US
## 7.798e+01 6.637e-04
datos$EXPVIDA_EST=regresion$coefficients[2]*datos$GASTO_US+regresion$coefficients[1] # fitted values: slope*x + intercept
g1=ggplot(datos,aes(GASTO_US, EXPVIDA_ANOS, label = PAIS))+geom_point(aes(GASTO_US,EXPVIDA_ANOS),datos,color="green") +
theme_bw() + ylab("Expectativa de vida [años]") + xlab("Gasto en salud PP [$US]") + ggtitle("Gasto vs calidad de salud") +
theme(plot.title = element_text(hjust = 0.5)) + geom_text(check_overlap = TRUE) + # drops labels that overlap others
geom_line(aes(GASTO_US,EXPVIDA_EST),datos,color="blue",cex=0.1)
plot(g1)
datos$promedio=mean(datos$EXPVIDA_ANOS) # constant column at the overall mean (red reference line)
g1=ggplot(datos,aes(GASTO_US, EXPVIDA_ANOS, label = PAIS))+geom_point(aes(GASTO_US,EXPVIDA_ANOS),datos,color="green") +
theme_bw() + ylab("Expectativa de vida [años]") + xlab("Gasto en salud PP [$US]") + ggtitle("Gasto vs calidad de salud") +
theme(plot.title = element_text(hjust = 0.5)) + geom_text(check_overlap = TRUE) + # drops labels that overlap others
geom_line(aes(GASTO_US,EXPVIDA_EST),datos,color="blue",cex=0.1) + geom_line(aes(GASTO_US,promedio),datos,color="red",cex=0.1)
plot(g1)
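The red line marks the overall mean, the baseline for the total sum of squares; the R-squared reported in the summary below can be recomputed from the columns created above (a quick sketch):
SST = sum((datos$EXPVIDA_ANOS - datos$promedio)^2)     # total variation around the mean (red line)
SSE = sum((datos$EXPVIDA_ANOS - datos$EXPVIDA_EST)^2)  # residual variation around the fit (blue line)
1 - SSE/SST                                            # should be close to the Multiple R-squared (~0.28)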
summary(regresion)
##
## Call:
## lm(formula = EXPVIDA_ANOS ~ GASTO_US, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4864 -0.7812 0.0651 1.4460 2.9796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.798e+01 8.012e-01 97.323 < 2e-16 ***
## GASTO_US 6.637e-04 1.870e-04 3.549 0.00122 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.097 on 32 degrees of freedom
## Multiple R-squared: 0.2824, Adjusted R-squared: 0.26
## F-statistic: 12.6 on 1 and 32 DF, p-value: 0.001219
confint(regresion)
##                    2.5 %       97.5 %
## (Intercept) 76.34447058 79.608509537
## GASTO_US 0.00028278 0.001044611
g1=ggplot(datos,aes(x=GASTO_US, y=EXPVIDA_ANOS, label = PAIS)) +
geom_point(aes(GASTO_US,EXPVIDA_ANOS),datos,color="green")+
theme_bw() + ylab("Expectativa de vida [años]")+
xlab("Gasto en salud PP [$US]") + ggtitle("Gasto vs calidad de salud") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(check_overlap = TRUE) + # drops labels that overlap others
geom_line(aes(GASTO_US,EXPVIDA_EST),datos,color="blue",cex=0.1) +
geom_smooth(method ="lm", formula = y ~ x,level=0.95)
plot(g1)
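The band drawn by geom_smooth is the 95% confidence interval of the fitted line; the same interval can be obtained numerically with predict() (a sketch; the spending value 5000 is only an illustrative input):
nuevo = data.frame(GASTO_US = 5000)  # hypothetical spending level
predict(regresion, nuevo, interval = "confidence", level = 0.95)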
3. Exponential regression
\[ y_i = a e^{b x_i} = 4 e^{-0.01 x_i} \]
## EXPONENTIAL
n=500; x=1:n; set.seed(10);y = 4*exp(-x*0.01); yprima=y+y*runif(100,-0.4,0.4); datos2=data.frame(x,y,yprima);
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +
theme_bw() + ylab("y") + ggtitle("Regresión exponencial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
## EXPONENTIAL
n=500; x=1:n; set.seed(10);y = 4*exp(-x*0.01); yprima=y+y*runif(100,-0.4,0.4); datos2=data.frame(x,y,yprima);
fit = nls(yprima ~ a*exp(-b*x),start = list(a = 1,b=0.01)); summary(fit)
##
## Formula: yprima ~ a * exp(-b * x)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 3.8364052 0.0510798 75.11 <2e-16 ***
## b 0.0100053 0.0001877 53.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2525 on 498 degrees of freedom
##
## Number of iterations to convergence: 3
## Achieved convergence tolerance: 1.242e-07
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +geom_line(aes(x,y),datos2,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión exponencial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
\[ y_i = a e^{b x_i} = 4 e^{-0.01 x_i} \]
\[ y'_i = a e^{b x_i} \approx 3.84 \, e^{-0.01 x_i} \]
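The black line in the plot above is the true curve used to simulate the data; the curve actually estimated by nls can be overlaid instead via predict() (a sketch):
datos2$yfit = predict(fit)  # fitted values a*exp(-b*x) at the observed x
g2 = ggplot() + geom_point(aes(x, yprima), datos2, color = "gray33") +
  geom_line(aes(x, yfit), datos2, color = "blue") +
  theme_bw() + ylab("y") + ggtitle("Exponential fit estimated by nls")
plot(g2)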
4. Power regression
\[ y_i = c\, x_i^{a} = 2\, x_i^{1.5} \]
## Power
n=500; x=1:n; set.seed(10);y = 2*x^1.5; yprima=y+y*runif(100,-0.8,0.8); datos2=data.frame(x,y,yprima);
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +
theme_bw() + ylab("y") + ggtitle("Regresión potencial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
## Power
datos2=data.frame(x,y,yprima);
fit = nls(yprima ~ a*x^b,start = list(a = 1,b=1)); summary(fit)
##
## Formula: yprima ~ a * x^b
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 1.99819 0.96997 2.06 0.0399 *
## b 1.48415 0.08133 18.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4666 on 498 degrees of freedom
##
## Number of iterations to convergence: 9
## Achieved convergence tolerance: 4.881e-08
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +geom_line(aes(x,y),datos2,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión potencial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
\[ y_i = c\, x_i^{a} = 2\, x_i^{1.5} \]
\[ y'_i = c\, x_i^{a} \approx 1.99\, x_i^{1.48} \]
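Since x and yprima are strictly positive here, the power model can also be estimated with a log-log linear regression instead of nls (a sketch; the estimates differ slightly because the noise is treated on the log scale):
fit_ll = lm(log(yprima) ~ log(x), data = datos2)  # log(y) = log(c) + a*log(x)
exp(coef(fit_ll)[1])  # estimate of c, roughly 2
coef(fit_ll)[2]       # estimate of a, roughly 1.5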
5. Logarithmic regression
\[ y_i = c + b \log(x_i) = 10 + 20 \log(x_i) \]
n=500; x=1:n; set.seed(10);y = 10 +20*log(x); yprima=y+y*runif(100,-0.1,0.1); datos2=data.frame(x,y,yprima);
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +
theme_bw() + ylab("y") + ggtitle("Regresión logarítmica") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
## Logarithmic
datos2=data.frame(x,y,yprima);
fit = nls(yprima ~ a+b*log(x),start = list(a = 1,b=1)); summary(fit)
##
## Formula: yprima ~ a + b * log(x)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 10.0901 1.4681 6.873 1.89e-11 ***
## b 19.7425 0.2763 71.445 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.024 on 498 degrees of freedom
##
## Number of iterations to convergence: 1
## Achieved convergence tolerance: 3.948e-09
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +geom_line(aes(x,y),datos2,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión logarítmica") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
\[ y_i = c + b \log(x_i) = 10 + 20 \log(x_i) \]
\[ y'_i = c + b \log(x_i) \approx 10.09 + 19.74 \log(x_i) \]
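This model is linear in its parameters, so lm() solves the same least-squares problem as the nls call without needing starting values (a sketch):
fit_lm = lm(yprima ~ log(x), data = datos2)  # intercept plays the role of a, the slope of b
summary(fit_lm)$coefficients                 # should reproduce the nls estimates above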
6. Polynomial regression
\[ y_i = a_0 + a_1 x_i + a_2 x_i^2 = 3 + 4 x_i + 5 x_i^2 \]
## Polynomial
n=500; x=1:n; set.seed(10);y = 3+4*x +5*x^2; yprima=y+y*runif(100,-0.1,0.1); datos2=data.frame(x,y,yprima);
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +
theme_bw() + ylab("y") + ggtitle("Regresión polinomial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
## Polynomial
datos2=data.frame(x,y,yprima);
fit = nls(yprima ~ a+b*x +c*x^2,start = list(a = 1,b=1,c=1)); summary(fit)
##
## Formula: yprima ~ a + b * x + c * x^2
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a -432.68666 3947.22654 -0.110 0.913
## b 10.86011 36.38562 0.298 0.765
## c 4.92818 0.07033 70.074 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29300 on 497 degrees of freedom
##
## Number of iterations to convergence: 1
## Achieved convergence tolerance: 1.631e-06
g1=ggplot()+geom_point(aes(x,yprima),datos2,color="gray33") +geom_line(aes(x,y),datos2,color="black",cex=1.3) +
theme_bw() + ylab("y") + ggtitle("Regresión polinomial") + theme(plot.title = element_text(hjust = 0.5))
plot(g1)
\[ y_i = a_0 + a_1 x_i + a_2 x_i^2 = 3 + 4 x_i + 5 x_i^2 \]
\[ y'_i = a_0 + a_1 x_i + a_2 x_i^2 \approx -432.7 + 10.86\, x_i + 4.93\, x_i^2 \approx 5 x_i^2 \]
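The quadratic model is also linear in its parameters, so the usual alternative to nls is lm() with explicit polynomial terms (a sketch):
fit_lm = lm(yprima ~ x + I(x^2), data = datos2)  # same least-squares problem as the nls call
summary(fit_lm)$coefficients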
7. Multiple linear regression
Nhospital | RRHH | BSERV | Egresos | DCO | DE |
---|---|---|---|---|---|
H1 | 19779705 | 10982608 | 15196 | 91433 | 93952 |
H2 | 27881106 | 15394204 | 15960 | 114528 | 136033 |
H3 | 29969438 | 15065061 | 17785 | 139584 | 141724 |
H4 | 11358890 | 3408115 | 7782 | 32531 | 32629 |
library(ggplot2)
g=ggplot(datos)+geom_point(aes(x=Egresos,y=RRHH)) +
annotate("text",x = max(datos$Egresos)*0.7, y = max(datos$RRHH)*0.9, label = paste("Corr:",round(cor(datos$RRHH,datos$Egresos),3)))
g1=ggplot(datos)+geom_point(aes(x=DCO,y=RRHH)) +
annotate("text", x = max(datos$DCO)*0.7, y = max(datos$RRHH)*0.9, label = paste("Corr:",round(cor(datos$RRHH,datos$DCO),3)))
g2=ggplot(datos)+geom_point(aes(x=DE,y=RRHH)) +
annotate("text", x = max(datos$DE)*0.7, y = max(datos$RRHH)*0.9, label = paste("Corr:",round(cor(datos$RRHH,datos$DE),3)))
source("multiplot.R")
multiplot(g,g1,g2,cols=3)
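The pairwise correlations shown in the panels can also be obtained in a single matrix (a sketch, using the column names from the table above):
round(cor(datos[, c("RRHH", "BSERV", "Egresos", "DCO", "DE")]), 3)  # correlation matrix of the numeric columns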
fit_RRHH <- lm(datos$RRHH ~ Egresos + DCO + DE, data = datos)  # full model with all three predictors
summary(fit_RRHH)
##
## Call:
## lm(formula = datos$RRHH ~ Egresos + DCO + DE, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8556957 -1018261 -174886 483225 16580483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 348070.83 253002.28 1.376 0.171
## Egresos 773.11 81.53 9.483 <2e-16 ***
## DCO 69.96 60.22 1.162 0.247
## DE 23.05 61.23 0.377 0.707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2738000 on 177 degrees of freedom
## Multiple R-squared: 0.9436, Adjusted R-squared: 0.9427
## F-statistic: 987.5 on 3 and 177 DF, p-value: < 2.2e-16
fit_1 <- lm(datos$RRHH~Egresos,data=datos)
fit_2 <- lm(datos$RRHH~DCO,data=datos)
fit_3 <- lm(datos$RRHH~DE,data=datos)
print(anova(fit_RRHH,fit_1,fit_2,fit_3))
## Analysis of Variance Table
##
## Model 1: datos$RRHH ~ Egresos + DCO + DE
## Model 2: datos$RRHH ~ Egresos
## Model 3: datos$RRHH ~ DCO
## Model 4: datos$RRHH ~ DE
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 177 1.3273e+15
## 2 179 1.8085e+15 -2 -4.8119e+14 32.084 1.289e-12 ***
## 3 179 2.0087e+15 0 -2.0021e+14
## 4 179 2.0419e+15 0 -3.3169e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
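fit_1, fit_2 and fit_3 are not nested in one another, so the sequential F-tests between them (the rows with Df = 0) are not meaningful; an information criterion is a safer way to rank the four models (a sketch):
AIC(fit_RRHH, fit_1, fit_2, fit_3)  # lower AIC indicates a better fit/complexity trade-off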
8. Multivariate regression
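The output in this section is consistent with a single multivariate lm() fit of both spending variables on the same predictors; a minimal sketch (the object name fit_mv is assumed):
fit_mv <- lm(cbind(RRHH, BSERV) ~ Egresos + DCO + DE, data = datos)  # two responses fitted at once
print(fit_mv)    # coefficient matrix, one column per response
summary(fit_mv)  # one lm-style summary per response, as shown below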
##
## Call:
## lm(formula = cbind(RRHH, BSERV) ~ Egresos + DCO + DE, data = datos)
##
## Coefficients:
## RRHH BSERV
## (Intercept) 348070.83 -567950.81
## Egresos 773.11 459.51
## DCO 69.96 94.25
## DE 23.05 -10.87
## Response RRHH :
##
## Call:
## lm(formula = RRHH ~ Egresos + DCO + DE, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8556957 -1018261 -174886 483225 16580483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 348070.83 253002.28 1.376 0.171
## Egresos 773.11 81.53 9.483 <2e-16 ***
## DCO 69.96 60.22 1.162 0.247
## DE 23.05 61.23 0.377 0.707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2738000 on 177 degrees of freedom
## Multiple R-squared: 0.9436, Adjusted R-squared: 0.9427
## F-statistic: 987.5 on 3 and 177 DF, p-value: < 2.2e-16
##
##
## Response BSERV :
##
## Call:
## lm(formula = BSERV ~ Egresos + DCO + DE, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8400687 -1132474 2251 465098 21709324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -567950.81 297613.19 -1.908 0.058 .
## Egresos 459.51 95.90 4.792 3.49e-06 ***
## DCO 94.25 70.83 1.331 0.185
## DE -10.87 72.03 -0.151 0.880
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3221000 on 177 degrees of freedom
## Multiple R-squared: 0.8677, Adjusted R-squared: 0.8654
## F-statistic: 386.9 on 3 and 177 DF, p-value: < 2.2e-16
9. Logistic regression
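The summary printed below appears to come from a glm() of Survived on Age with the default Gaussian family (no family argument, so this is a linear probability model rather than a true logistic regression); a sketch, assuming the titanic package supplies titanic_train:
library("titanic")                           # assumption: source of the titanic_train data frame
datos = titanic_train
modelo = glm(Survived ~ Age, data = datos)   # no family argument, so family = gaussian by default
summary(modelo)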
##
## Call:
## glm(formula = Survived ~ Age, data = datos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4811 -0.4158 -0.3662 0.5789 0.7252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.483753 0.041788 11.576 <2e-16 ***
## Age -0.002613 0.001264 -2.067 0.0391 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2404297)
##
## Null deviance: 172.21 on 713 degrees of freedom
## Residual deviance: 171.19 on 712 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 1012.6
##
## Number of Fisher Scoring iterations: 2
datos=titanic_train
modelo=glm(Survived ~ Age, data = datos)
plot(x = datos$Age, y = datos$Survived , col = "darkblue",
main = "probabilidad de sobrevida en función de la edad", xlab = "edad",
ylab = "Probabilidad de sobrevida")
par(new=TRUE)
num=exp(modelo$coefficients[1]+modelo$coefficients[2]*(datos$Age)) # intercept + slope*Age, passed through exp()
plot(datos$Age,num/(1+num),col="darkgreen",ylab="",yaxt='n',xaxt='n')
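For a genuine logistic regression the call needs family = binomial; a sketch of that alternative (its coefficients live on the log-odds scale and will differ from the Gaussian fit above):
modelo_logit = glm(Survived ~ Age, data = datos, family = binomial)
summary(modelo_logit)                            # coefficients on the log-odds scale
head(predict(modelo_logit, type = "response"))   # fitted survival probabilities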
library("dummies")
sexo=dummy(titanic_train$Sex) # Sex encoded as dummy variables
datos=titanic_train
datos_proc=data.frame(datos$Survived,datos$Age,sexo[,2])
names(datos_proc)=c("Sobrevida","Edad","Femenino")
modelo=glm(Sobrevida~., data = datos_proc) # Fit the model with all predictors
# Probability that a 40-year-old woman survives
print(predict(modelo, data.frame(Edad=40, Femenino=1), type='response'))
## 1
## 0.1967612
##
## Call:
## glm(formula = Sobrevida ~ ., data = datos_proc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7786 -0.2115 -0.1931 0.2471 0.8401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7804879 0.0394345 19.792 <2e-16 ***
## Edad -0.0009206 0.0010730 -0.858 0.391
## Femenino -0.5469036 0.0323428 -16.910 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1717122)
##
## Null deviance: 172.21 on 713 degrees of freedom
## Residual deviance: 122.09 on 711 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 773.22
##
## Number of Fisher Scoring iterations: 2