A continuación veremos cómo realizar una regresión logística en R desde ceros.
install.packages('ISLR')
## Installing package into '/home/kaliw/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library('ISLR')
Un conjunto de datos simulados que contiene información sobre diez mil clientes. El objetivo aquí es predecir qué clientes no pagarán su deuda de tarjeta de crédito.
credit_data <- Credit
summary(credit_data)
## ID Income Limit Rating
## Min. : 1.0 Min. : 10.35 Min. : 855 Min. : 93.0
## 1st Qu.:100.8 1st Qu.: 21.01 1st Qu.: 3088 1st Qu.:247.2
## Median :200.5 Median : 33.12 Median : 4622 Median :344.0
## Mean :200.5 Mean : 45.22 Mean : 4736 Mean :354.9
## 3rd Qu.:300.2 3rd Qu.: 57.47 3rd Qu.: 5873 3rd Qu.:437.2
## Max. :400.0 Max. :186.63 Max. :13913 Max. :982.0
## Cards Age Education Gender Student
## Min. :1.000 Min. :23.00 Min. : 5.00 Male :193 No :360
## 1st Qu.:2.000 1st Qu.:41.75 1st Qu.:11.00 Female:207 Yes: 40
## Median :3.000 Median :56.00 Median :14.00
## Mean :2.958 Mean :55.67 Mean :13.45
## 3rd Qu.:4.000 3rd Qu.:70.00 3rd Qu.:16.00
## Max. :9.000 Max. :98.00 Max. :20.00
## Married Ethnicity Balance
## No :155 African American: 99 Min. : 0.00
## Yes:245 Asian :102 1st Qu.: 68.75
## Caucasian :199 Median : 459.50
## Mean : 520.01
## 3rd Qu.: 863.00
## Max. :1999.00
credit_data$marriedTarget <- ifelse(credit_data$Married == "Yes", 1, 0)
set.seed(111)
split = sort(sample(nrow(credit_data), nrow(credit_data)*0.7))
training = credit_data[split,]
testing = credit_data[-split,]
model <- glm(marriedTarget ~ Income + Limit + Rating + Cards + Age + Education + Gender + Student + Balance, family=binomial, data=training)
summary((model))
##
## Call:
## glm(formula = marriedTarget ~ Income + Limit + Rating + Cards +
## Age + Education + Gender + Student + Balance, family = binomial,
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8955 -1.2795 0.8018 0.9792 1.3932
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1955657 1.0796625 -1.107 0.2681
## Income -0.0156693 0.0118931 -1.318 0.1877
## Limit -0.0005632 0.0009157 -0.615 0.5385
## Rating 0.0195659 0.0132642 1.475 0.1402
## Cards -0.0860103 0.1095398 -0.785 0.4323
## Age -0.0120293 0.0077979 -1.543 0.1229
## Education 0.0181959 0.0410206 0.444 0.6573
## GenderFemale 0.3801467 0.2542046 1.495 0.1348
## StudentYes 0.6018666 0.7091414 0.849 0.3960
## Balance -0.0027436 0.0013465 -2.038 0.0416 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 372.46 on 279 degrees of freedom
## Residual deviance: 358.66 on 270 degrees of freedom
## AIC: 378.66
##
## Number of Fisher Scoring iterations: 4
probabilities <- predict(model, newddata= testing, type = 'response')
marriagePredictions <- ifelse(probabilities > 0.5, "Yes", "No")
#table(marriagePredictions, testing$Married)
install.packages('caret')
## Installing package into '/home/kaliw/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library('caret')
## Loading required package: ggplot2
## Loading required package: lattice
df <- read.csv('500.csv')
str(df)
## 'data.frame': 500 obs. of 4 variables:
## $ Gender: chr "Male" "Male" "Female" "Female" ...
## $ Height: int 174 189 185 195 149 189 147 154 174 169 ...
## $ Weight: int 96 87 110 104 61 104 92 111 90 103 ...
## $ Index : int 4 2 4 3 3 3 5 5 3 4 ...
df$genderTarget <- ifelse(df$Gender == "Male", 1, 0)
set.seed(111)
split = sort(sample(nrow(df), nrow(df)*0.7))
training = df[split,]
testing = df[-split,]
model <- glm(genderTarget ~ Height + Weight+ Index, family=binomial, data=training)
summary((model))
##
## Call:
## glm(formula = genderTarget ~ Height + Weight + Index, family = binomial,
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.22351 -1.18730 0.03111 1.16644 1.27306
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6104665 1.6193386 -0.377 0.706
## Height 0.0023617 0.0092944 0.254 0.799
## Weight -0.0007666 0.0074527 -0.103 0.918
## Index 0.0777182 0.1909418 0.407 0.684
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 485.20 on 349 degrees of freedom
## Residual deviance: 484.69 on 346 degrees of freedom
## AIC: 492.69
##
## Number of Fisher Scoring iterations: 3
probas <- predict(model, newddata= testing, type = 'response')
genderPredictions <- ifelse(probas > 0.5, "Male", "Female")
#table(genderPredictions, testing$Gender)
confint(model)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -3.79878592 2.56609131
## Height -0.01588979 0.02065326
## Weight -0.01542254 0.01387248
## Index -0.29659138 0.45407882
confint.default(model)
## 2.5 % 97.5 %
## (Intercept) -3.78431181 2.56337877
## Height -0.01585504 0.02057845
## Weight -0.01537361 0.01384045
## Index -0.29652091 0.45195728
exp(coef(model))
## (Intercept) Height Weight Index
## 0.5430974 1.0023645 0.9992337 1.0808180
exp(cbind(OR = coef(model), confint(model)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.5430974 0.02239795 13.014854
## Height 1.0023645 0.98423578 1.020868
## Weight 0.9992337 0.98469577 1.013969
## Index 1.0808180 0.74334769 1.574722