##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
El modelo de regresión lineal supone que la variable \(Y\) es cuantitativa. Pero en muchas situaciones la variable es cualitativa.
por ejemplo,
Este tipo de proceso de predicción es el que llamaremos Clasificación.
Hay varios tipos de metodos para llevar acabo la clasificación,
\[\text{odds}=\frac{p}{1-p}\]
\[\begin{cases} 0<\text{Odds}<1 \text{ (Fracazo mas probable)} & \text{ si } p<1-p \\ \text{Odds}=1 \text{ (Probabilidad de Éxito y Fracazo iguales)}& \text{ si } p=1-p \\ \text{Odds}>1 \text{ (Éxito mas probable)}& \text{ si } p>1-p \end{cases}\]
Notar que \(Odds \in (0,\infty)\) lo usaremos asi \(\ln(Odds) \in (-\infty,\infty)\)
Utilizaremos como ejemplo el dataset Default
names(Default)
## [1] "default" "student" "balance" "income"
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
Default$coded_default <- ifelse(Default$default=="Yes",1,0)
Default %>% ggplot(aes(x=balance,y=coded_default))+
geom_point(aes(col=Default$default))+
geom_smooth(method = "lm")
reg_fit <- lm(coded_default~balance,data = Default)
summary(reg_fit)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0751919588 3.354360e-03 -22.41618 1.262551e-108
## balance 0.0001298722 3.474933e-06 37.37401 2.774969e-286
coef(reg_fit)
## (Intercept) balance
## -0.0751919588 0.0001298722
\[P(\text{default}=Yes | \text{ balance})\]
\[P(\text{default}=Yes | \text{ balance})=\beta_0+\beta_1\text{ balance}\]
\[P(X)=\beta_0+\beta_1\text{ X }\]
\[\begin{align*} P(X)&=\beta_0+\beta_1X \ \\ \ln{\frac{p}{1-p}}&=\beta_0+\beta_1X \\ \frac{p}{1-p} &=e^{\beta_0+\beta_1X} \\ p &=e^{\beta_0+\beta_1X}(1-p) \\ p &=e^{\beta_0+\beta_1X}-pe^{\beta_0+\beta_1X} \\ p+pe^{\beta_0+\beta_1X} &=e^{\beta_0+\beta_1X} \\ p(1+e^{\beta_0+\beta_1X} )&=e^{\beta_0+\beta_1X} \\ p&=\frac{ e^{\beta_0+\beta_1X} }{1+e^{\beta_0+\beta_1X} } \end{align*}\]
logit_reg <- glm(default~balance,data = Default,family = "binomial")
summary(logit_reg)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.651330614 0.3611573721 -29.49221 3.623124e-191
## balance 0.005498917 0.0002203702 24.95309 1.976602e-137
coef(logit_reg)
## (Intercept) balance
## -10.651330614 0.005498917
entonces esto lo podemos interpretar de la siguiente forma, \[\ln{\text{Odds}}=\beta_0+\beta_1 \text{ balance}\] donde \(\beta_0=\) -10.6513306 y \(\beta_1=\) 0.0054989.
\[ \begin{align*} \ln{\text{Odds}}&=-10.6513+0.0055 \text{ balance} \\ \text{Odds}&=e^{-10.6513+0.0055 \text{ balance}} \\ p &= \frac{e^{-10.6513+0.0055 \text{ balance}}}{1+e^{-10.6513+0.0055 \text{ balance}}} \end{align*} \]
p<-exp(beta0+beta1*Default$balance)/(1+exp(beta0+beta1*Default$balance) )
plot(Default$balance,p)
prob_default <- function(balance,beta0,beta1){
p<-exp(beta0+beta1*balance)/(1+exp(beta0+beta1*balance) )
return(as.numeric(p))
}
prob_default(1900,beta0,beta1)
## [1] 0.4498443
new_balance <- seq(500,3000,by=150)
data.frame(default=new_balance,
prob=round(sapply(new_balance,
"prob_default",
beta0=beta0,
beta1=beta1),
4)
)
## default prob
## 1 500 0.0004
## 2 650 0.0008
## 3 800 0.0019
## 4 950 0.0044
## 5 1100 0.0099
## 6 1250 0.0224
## 7 1400 0.0497
## 8 1550 0.1066
## 9 1700 0.2139
## 10 1850 0.3831
## 11 2000 0.5863
## 12 2150 0.7638
## 13 2300 0.8807
## 14 2450 0.9439
## 15 2600 0.9746
## 16 2750 0.9887
## 17 2900 0.9950
summary(logit_reg)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.651330614 0.3611573721 -29.49221 3.623124e-191
## balance 0.005498917 0.0002203702 24.95309 1.976602e-137
round(exp(cbind(coef=coef(logit_reg), confint(logit_reg) )),6)
## Waiting for profiling to be done...
## coef 2.5 % 97.5 %
## (Intercept) 0.000024 0.000011 0.000047
## balance 1.005514 1.005092 1.005961
logit_reg2 <- glm(default~scale(balance),data = Default,family = "binomial")
round(exp(cbind(coef=coef(logit_reg2), confint(logit_reg2) )),6)
## Waiting for profiling to be done...
## coef 2.5 % 97.5 %
## (Intercept) 0.00234 0.001611 0.003311
## scale(balance) 14.29498 11.666866 17.723556
summary(logit_reg2)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.057674 0.1836302 -32.98845 1.189589e-238
## scale(balance) 2.659909 0.1065964 24.95309 1.976602e-137
logit_reg3 <- glm(default~ scale(balance) + student, data = Default , family = "binomial")
summary(logit_reg3)$coefficients %>% round(10)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.9560278 0.1861942 -31.988252 0.0000e+00
## scale(balance) 2.7756070 0.1121479 24.749526 0.0000e+00
## studentYes -0.7148776 0.1475190 -4.846003 1.2597e-06
\[ \begin{align*} P(Y=k | X=x)&=\frac{P(X=x | Y=k ) P(Y=k)}{P(X=x)} \\ &= \frac{f_k(x) \pi_k}{\sum_{l=1}^{K}\pi_lf_l(x)}\\ \end{align*}\]
\[ \begin{align*} f_k(x) &\sim N(\mu_k,\sigma_k) \\ &=\frac{1}{\sqrt{2\pi}\sigma_k}e^{-\frac{1}{2} \left (\frac{x-\mu_k}{\sigma_k} \right )^2} \\ \end{align*}\]
Asumiremos que \(\sigma_k=\sigma\) es decir tenemos la misma desviacion para el predictor en todas las clases.
(Pizaron)
\[\delta_k(x)=\log(\pi_k)+\frac{x \mu_k}{\sigma^2}-\frac{\mu_k^2}{2\sigma^2}\]
\(x\) se asigna a la clase que tenga el \(\delta_k(x)\) mas grande.
\[ \begin{align*} \hat \mu_k &= \frac{1}{n_k}\sum_{i:y_i=k}x_i \\ \hat \sigma^2 &= \frac{1}{n-K}\sum_{k=1}^K\sum_{i:y_i=k}(x_i-\hat \mu_k)^2 \\ \hat \pi_k&=\frac{n_k}{n} \end{align*}\]
Default %>% filter(balance != 0) %>% group_by(default) %>% summarise(mean(balance))
## # A tibble: 2 × 2
## default `mean(balance)`
## <fctr> <dbl>
## 1 No 847.7012
## 2 Yes 1747.8217
Default %>% filter(balance != 0)%>% ggplot(aes(balance,fill= default,y=..density..))+
geom_histogram(binwidth = 20, alpha = .5, position = "identity")+
geom_vline( aes(xintercept= (848+1748)/2) )+
geom_density(alpha=0.5)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
lda_fit<-lda(default~balance,data=Default)
lda_fit
## Call:
## lda(default ~ balance, data = Default)
##
## Prior probabilities of groups:
## No Yes
## 0.9667 0.0333
##
## Group means:
## balance
## No 803.9438
## Yes 1747.8217
##
## Coefficients of linear discriminants:
## LD1
## balance 0.002206916
predict(lda_fit,data.frame(balance=seq(500,3000,by=150)))
## $class
## [1] No No No No No No No No No No No Yes Yes Yes Yes Yes Yes
## Levels: No Yes
##
## $posterior
## No Yes
## 1 0.99902799 0.0009720114
## 2 0.99806479 0.0019352148
## 3 0.99615078 0.0038492158
## 4 0.99235826 0.0076417424
## 5 0.98488575 0.0151142462
## 6 0.97032474 0.0296752559
## 7 0.94255393 0.0574460677
## 8 0.89169530 0.1083047002
## 9 0.80511967 0.1948803257
## 10 0.67459339 0.3254066094
## 11 0.50986471 0.4901352893
## 12 0.34296550 0.6570345049
## 13 0.20756312 0.7924368762
## 14 0.11616611 0.8838338914
## 15 0.06187200 0.9381279995
## 16 0.03203430 0.9679656958
## 17 0.01633525 0.9836647529
##
## $x
## LD1
## 1 -0.74014426
## 2 -0.40910683
## 3 -0.07806941
## 4 0.25296802
## 5 0.58400545
## 6 0.91504287
## 7 1.24608030
## 8 1.57711773
## 9 1.90815515
## 10 2.23919258
## 11 2.57023001
## 12 2.90126743
## 13 3.23230486
## 14 3.56334229
## 15 3.89437971
## 16 4.22541714
## 17 4.55645457
lda_predict<-predict(lda_fit,data.frame(balance=seq(500,3000,by=150)))
data.frame(balance=seq(500,3000,by=150),lda_predict$class,prior=lda_predict$posterior)
## balance lda_predict.class prior.No prior.Yes
## 1 500 No 0.99902799 0.0009720114
## 2 650 No 0.99806479 0.0019352148
## 3 800 No 0.99615078 0.0038492158
## 4 950 No 0.99235826 0.0076417424
## 5 1100 No 0.98488575 0.0151142462
## 6 1250 No 0.97032474 0.0296752559
## 7 1400 No 0.94255393 0.0574460677
## 8 1550 No 0.89169530 0.1083047002
## 9 1700 No 0.80511967 0.1948803257
## 10 1850 No 0.67459339 0.3254066094
## 11 2000 No 0.50986471 0.4901352893
## 12 2150 Yes 0.34296550 0.6570345049
## 13 2300 Yes 0.20756312 0.7924368762
## 14 2450 Yes 0.11616611 0.8838338914
## 15 2600 Yes 0.06187200 0.9381279995
## 16 2750 Yes 0.03203430 0.9679656958
## 17 2900 Yes 0.01633525 0.9836647529
x<-1