The linear regression model assumes that the response variable \(Y\) is quantitative. In many situations, however, the response is qualitative; for example, whether or not a customer defaults on their credit card debt (Yes/No), as in the Default dataset used below.
This type of prediction problem is what we call classification.
There are several methods for carrying out classification; in these notes we work with logistic regression and, at the end, the Bayes-theorem approach behind linear discriminant analysis. A key quantity throughout is the odds of an event that occurs with probability \(p\):
\[\text{odds}=\frac{p}{1-p}\]
\[\begin{cases} 0<\text{Odds}<1 \text{ (failure more likely)} & \text{ if } p<1-p \\ \text{Odds}=1 \text{ (success and failure equally likely)} & \text{ if } p=1-p \\ \text{Odds}>1 \text{ (success more likely)} & \text{ if } p>1-p \end{cases}\]
Note that \(\text{Odds} \in (0,\infty)\), so we will work on the log scale, where \(\ln(\text{Odds}) \in (-\infty,\infty)\).
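As a quick illustration (a minimal sketch with an arbitrary probability, not taken from the data):
p <- 0.75              # an arbitrary probability of success
odds <- p / (1 - p)    # odds = 3: success is three times as likely as failure
log(odds)              # log odds, approximately 1.099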
As a running example we will use the Default dataset.
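The code below assumes that Default, the pipe operator and the plotting functions come from the ISLR, dplyr and ggplot2 packages; a minimal setup sketch:
library(ISLR)      # contains the Default dataset
library(dplyr)     # provides the %>% pipe
library(ggplot2)   # provides ggplot()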
names(Default)
## [1] "default" "student" "balance" "income"
summary(Default)
## default    student       balance           income
## No :9667   No :7056   Min.   :   0.0   Min.   :  772
## Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340
##                       Median : 823.6   Median :34553
##                       Mean   : 835.4   Mean   :33517
##                       3rd Qu.:1166.3   3rd Qu.:43808
##                       Max.   :2654.3   Max.   :73554
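Note that the response is heavily unbalanced: only 333 of the 10,000 customers default. A quick way to see the proportions (reusing the counts shown above):
prop.table(table(Default$default))   # roughly 0.967 for "No" and 0.033 for "Yes"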
A first attempt is to code the response as 0/1 and fit a linear regression of this coded response on balance:
# code default as 1 (Yes) and 0 (No)
Default$coded_default <- ifelse(Default$default == "Yes", 1, 0)
# scatter plot of the coded response against balance, with the least-squares line
Default %>% ggplot(aes(x = balance, y = coded_default)) +
  geom_point(aes(col = default)) +
  geom_smooth(method = "lm")
# linear regression (linear probability model) of the coded response on balance
reg_fit <- lm(coded_default ~ balance, data = Default)
summary(reg_fit)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0751919588 3.354360e-03 -22.41618 1.262551e-108
## balance 0.0001298722 3.474933e-06 37.37401 2.774969e-286
coef(reg_fit)
## (Intercept) balance
## -0.0751919588 0.0001298722
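One problem with this fit is that it is not constrained to lie in \([0,1]\); with the coefficients printed above, the fitted value at balance = 0 is already negative (a quick check on the fitted model):
predict(reg_fit, newdata = data.frame(balance = 0))   # equals the intercept, about -0.075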
We want to model the probability
\[P(\text{default}=\text{Yes} \mid \text{balance})\]
The linear model
\[P(\text{default}=\text{Yes} \mid \text{balance})=\beta_0+\beta_1\,\text{balance},\]
or in general \(P(X)=\beta_0+\beta_1 X\), can produce values below 0 or above 1, so it is not a proper probability model. In logistic regression we instead assume that the log odds are linear in \(X\) and solve for \(p\):
\[\begin{align*} \ln{\frac{p}{1-p}}&=\beta_0+\beta_1X \\ \frac{p}{1-p} &=e^{\beta_0+\beta_1X} \\ p &=e^{\beta_0+\beta_1X}(1-p) \\ p &=e^{\beta_0+\beta_1X}-pe^{\beta_0+\beta_1X} \\ p+pe^{\beta_0+\beta_1X} &=e^{\beta_0+\beta_1X} \\ p(1+e^{\beta_0+\beta_1X}) &=e^{\beta_0+\beta_1X} \\ p &=\frac{e^{\beta_0+\beta_1X}}{1+e^{\beta_0+\beta_1X}} \end{align*}\]
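This is the logistic (sigmoid) function, which maps \((-\infty,\infty)\) into \((0,1)\). In R it is available as plogis(); a quick way to visualize its shape:
curve(plogis(x), from = -6, to = 6,
      xlab = expression(beta[0] + beta[1] * X), ylab = "p")   # S-shaped curve between 0 and 1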
# logistic regression of default on balance
logit_reg <- glm(default ~ balance, data = Default, family = "binomial")
summary(logit_reg)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.651330614 0.3611573721 -29.49221 3.623124e-191
## balance 0.005498917 0.0002203702 24.95309 1.976602e-137
coef(logit_reg)
## (Intercept) balance
## -10.651330614 0.005498917
We can interpret this as follows: \[\ln{\text{Odds}}=\beta_0+\beta_1 \text{ balance}\] where \(\beta_0=\) -10.6513306 and \(\beta_1=\) 0.0054989.
\[ \begin{align*} \ln{\text{Odds}}&=-10.6513+0.0055 \text{ balance} \\ \text{Odds}&=e^{-10.6513+0.0055 \text{ balance}} \\ p &= \frac{e^{-10.6513+0.0055 \text{ balance}}}{1+e^{-10.6513+0.0055 \text{ balance}}} \end{align*} \]
# extract the estimated coefficients
beta0 <- coef(logit_reg)[1]
beta1 <- coef(logit_reg)[2]
# estimated probability of default for every observed balance
p <- exp(beta0 + beta1 * Default$balance) / (1 + exp(beta0 + beta1 * Default$balance))
plot(Default$balance, p)
# the same formula wrapped in a function
prob_default <- function(balance, beta0, beta1) {
  p <- exp(beta0 + beta1 * balance) / (1 + exp(beta0 + beta1 * balance))
  return(as.numeric(p))
}
prob_default(1500,beta0,beta1)
## [1] 0.08307362
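The same probability can be obtained directly from the fitted model with predict(); the result should agree with the value above:
predict(logit_reg, newdata = data.frame(balance = 1500), type = "response")   # about 0.083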
# estimated probability of default over a grid of balances
new_balance <- seq(500, 3000, by = 150)
data.frame(balance = new_balance,
           prob = round(sapply(new_balance, prob_default,
                               beta0 = beta0, beta1 = beta1),
                        4))
##    balance   prob
## 1      500 0.0004
## 2      650 0.0008
## 3      800 0.0019
## 4      950 0.0044
## 5     1100 0.0099
## 6     1250 0.0224
## 7     1400 0.0497
## 8     1550 0.1066
## 9     1700 0.2139
## 10    1850 0.3831
## 11    2000 0.5863
## 12    2150 0.7638
## 13    2300 0.8807
## 14    2450 0.9439
## 15    2600 0.9746
## 16    2750 0.9887
## 17    2900 0.9950
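From the table, the estimated probability crosses 0.5 somewhere between a balance of 1850 and 2000. The exact crossing point is where the log odds are zero, i.e. balance \(= -\beta_0/\beta_1\) (a quick check with the coefficients estimated above):
-beta0 / beta1   # about 1937: beyond this balance, default is estimated as more likely than not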
Exponentiating the coefficients converts them into odds ratios: \(e^{\beta_1}\) is the multiplicative change in the odds of default for a one-unit increase in balance.
round(exp(cbind(coef = coef(logit_reg), confint(logit_reg))), 6)   # odds ratios with 95% confidence intervals
## Waiting for profiling to be done...
## coef 2.5 % 97.5 %
## (Intercept) 0.000024 0.000011 0.000047
## balance 1.005514 1.005092 1.005961
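The per-unit odds ratio of about 1.0055 is hard to read directly, so it can help to look at a larger increment (the increment of 100 below is chosen only for illustration):
exp(coef(logit_reg)["balance"] * 100)   # about 1.73: every extra 100 of balance multiplies the odds by ~1.73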
If we standardize the predictor instead, the exponentiated coefficient is the multiplicative change in the odds for a one-standard-deviation increase in balance:
# same model with balance standardized (mean 0, standard deviation 1)
logit_reg2 <- glm(default ~ scale(balance), data = Default, family = "binomial")
round(exp(cbind(coef=coef(logit_reg2), confint(logit_reg2) )),6)
## Waiting for profiling to be done...
## coef 2.5 % 97.5 %
## (Intercept) 0.00234 0.001611 0.003311
## scale(balance) 14.29498 11.666866 17.723556
A different approach to classification is to model the distribution of the predictor within each class and apply Bayes' theorem. With prior probabilities \(\pi_k = P(Y=k)\) and class-conditional densities \(f_k(x)\),
\[ \begin{align*} P(Y=k \mid X=x)&=\frac{P(X=x \mid Y=k)\,P(Y=k)}{P(X=x)} \\ &= \frac{\pi_k f_k(x)}{\sum_{l=1}^{K}\pi_l f_l(x)} \end{align*}\]
In linear discriminant analysis we assume that, within class \(k\), the predictor is normally distributed, \(X \mid Y=k \sim N(\mu_k,\sigma_k^2)\), so that
\[ f_k(x) = \frac{1}{\sqrt{2\pi}\,\sigma_k}\, e^{-\frac{1}{2}\left(\frac{x-\mu_k}{\sigma_k}\right)^2} \]
We will further assume that \(\sigma_k=\sigma\), i.e. the predictor has the same standard deviation in every class.
(Worked out on the board.)
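As a complement to the derivation, a minimal sketch of fitting this model in R, assuming the MASS package is available (its lda() function implements this common-variance normal model):
library(MASS)                                        # lda() lives here
lda_fit <- lda(default ~ balance, data = Default)    # estimates the priors, class means and pooled variance
lda_fit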