Regresion Logistica

La regresión logística es un tipo de modelo estadístico utilizado principalmente para la clasificación binaria, es decir, para predecir una de dos posibles categorías. A diferencia de la regresión lineal que predice un valor continuo, la regresión logística estima la probabilidad de que una instancia pertenezca a una clase específica.

#Importamos los datos
library(readxl)

## Warning: package 'readxl' was built under R version 4.3.3

train_1 <- read_excel("C:/Users/Valeria/Downloads/train_1.xlsx")
View(train_1)
library(readxl)
test_2 <- read_excel("C:/Users/Valeria/Downloads/test_2.xlsx")
View(test_2)

#Verificar estructura de Datos
str(train_1)

## tibble [1,460 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Id          : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
##  $ LotArea     : num [1:1460] 8450 9600 11250 9550 14260 ...
##  $ Utilities   : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ BldgType    : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ YearBuilt   : num [1:1460] 2003 1976 2001 1915 2000 ...
##  $ FullBath    : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
##  $ BedroomAbvGr: num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenQual : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
##  $ GarageCars  : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
##  $ Fence       : chr [1:1460] "NA" "NA" "NA" "NA" ...
##  $ SalePrice   : num [1:1460] 208500 181500 223500 140000 250000 ...

str(test_2)

## tibble [1,459 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Id          : num [1:1459] 1461 1462 1463 1464 1465 ...
##  $ LotArea     : num [1:1459] 11622 14267 13830 9978 5005 ...
##  $ Utilities   : chr [1:1459] "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ BldgType    : chr [1:1459] "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ YearBuilt   : num [1:1459] 1961 1958 1997 1998 1992 ...
##  $ FullBath    : num [1:1459] 1 1 2 2 2 2 2 2 1 1 ...
##  $ BedroomAbvGr: num [1:1459] 2 3 3 3 2 3 3 3 2 2 ...
##  $ KitchenQual : chr [1:1459] "TA" "Gd" "TA" "Gd" ...
##  $ GarageCars  : num [1:1459] 1 1 2 2 2 2 2 2 2 2 ...
##  $ Fence       : chr [1:1459] "MnPrv" "NA" "MnPrv" "NA" ...
##  $ SalePrice   : num [1:1459] 169277 187758 183584 179317 150730 ...

#Analizamos una variable que es categorica
train_1$FullBath<- ifelse(train_1$FullBath == 1, "2", 0)
table(train_1$FullBath)

## 
##   0   2 
## 810 650

train_1$FullBathBinary <- ifelse(train_1$FullBath > 1, 1, 0)
table(train_1$FullBathBinary)

## 
##   0   1 
## 810 650

# Creamos modelo de regresión logística
modelo_log <- glm(FullBathBinary ~ LotArea + YearBuilt + 
                   GarageCars + KitchenQual + BldgType + 
                   BedroomAbvGr, data = train_1, family = binomial)
# Resumen del modelo
summary(modelo_log)

## 
## Call:
## glm(formula = FullBathBinary ~ LotArea + YearBuilt + GarageCars + 
##     KitchenQual + BldgType + BedroomAbvGr, family = binomial, 
##     data = train_1)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     7.161e+01  6.715e+00  10.664  < 2e-16 ***
## LotArea        -3.422e-05  1.467e-05  -2.332 0.019688 *  
## YearBuilt      -3.351e-02  3.366e-03  -9.955  < 2e-16 ***
## GarageCars     -1.035e+00  1.408e-01  -7.353 1.94e-13 ***
## KitchenQualFa   4.269e-01  6.459e-01   0.661 0.508688    
## KitchenQualGd  -1.020e-01  3.677e-01  -0.277 0.781514    
## KitchenQualTA   1.335e+00  3.706e-01   3.602 0.000316 ***
## BldgType2fmCon -1.910e+00  4.904e-01  -3.896 9.80e-05 ***
## BldgTypeDuplex -3.243e+00  5.414e-01  -5.991 2.09e-09 ***
## BldgTypeTwnhs  -1.343e+00  4.540e-01  -2.958 0.003093 ** 
## BldgTypeTwnhsE -8.860e-01  2.904e-01  -3.051 0.002277 ** 
## BedroomAbvGr   -1.445e+00  1.240e-01 -11.655  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2006.4  on 1459  degrees of freedom
## Residual deviance: 1137.8  on 1448  degrees of freedom
## AIC: 1161.8
## 
## Number of Fisher Scoring iterations: 6

# Summarize the model results
modelo_log$coefficients

##    (Intercept)        LotArea      YearBuilt     GarageCars  KitchenQualFa 
##   7.160534e+01  -3.421576e-05  -3.350612e-02  -1.035281e+00   4.268887e-01 
##  KitchenQualGd  KitchenQualTA BldgType2fmCon BldgTypeDuplex  BldgTypeTwnhs 
##  -1.019841e-01   1.335082e+00  -1.910414e+00  -3.243399e+00  -1.343158e+00 
## BldgTypeTwnhsE   BedroomAbvGr 
##  -8.860031e-01  -1.445421e+00

#Validando las predicciones de los datos en conjuntos de entrenamiento y prueba
library(caTools)

## Warning: package 'caTools' was built under R version 4.3.3

set.seed(123)
split = sample.split(train_1$FullBathBinary, SplitRatio = 0.7)
trainset_log = subset(train_1, split == TRUE)
testset_log = subset(train_1, split == FALSE)

# Ajustar Regresion al conjunto de Prueba
modelo_log <- glm(train_1$FullBathBinary ~ LotArea + YearBuilt + 
                   GarageCars  + KitchenQual + BldgType + 
                   BedroomAbvGr, data = train_1, family = binomial)
coef(modelo_log)

##    (Intercept)        LotArea      YearBuilt     GarageCars  KitchenQualFa 
##   7.160534e+01  -3.421576e-05  -3.350612e-02  -1.035281e+00   4.268887e-01 
##  KitchenQualGd  KitchenQualTA BldgType2fmCon BldgTypeDuplex  BldgTypeTwnhs 
##  -1.019841e-01   1.335082e+00  -1.910414e+00  -3.243399e+00  -1.343158e+00 
## BldgTypeTwnhsE   BedroomAbvGr 
##  -8.860031e-01  -1.445421e+00

# Analizando Resultados con los datos de Prueba
ypred_log = predict(modelo_log, newdata = testset_log, type = "response")
head(ypred_log)

##           1           2           3           4           5           6 
## 0.499255943 0.007507347 0.896949323 0.083572061 0.793495758 0.007615266

# Predicción de los valores usando el conjunto de entrenamiento
trainset_log$ypred_log <- predict(modelo_log, newdata = trainset_log, type = "response")

#Graficamos los resultados de entrenamiento
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

ggplot(trainset_log, aes(x = FullBath, y = YearBuilt)) +
  geom_point(color = 'red') +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE, color = 'blue') +
  ggtitle('Cant. Banos Completos Vs Anos de Construccion (Entrenamiento)') +
  xlab('Cant. Banos Completos') +
  ylab('Anos de Construccion')

## `geom_smooth()` using formula = 'y ~ x'

# Predicción de los valores usando el conjunto de prueba
testset_log$ypred_log <- predict(modelo_log, newdata = testset_log, type = "response")

# Graficar los resultados de prueba
ggplot(testset_log, aes(x = FullBath, y = SalePrice)) +
  geom_point(color = 'red') +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE, color = 'blue') +
  ggtitle('Cant. Banos Completos Vs Precio de Venta (Prueba)') +
  xlab('Cant. Banos Completos') +
  ylab('Precio de Venta')

## `geom_smooth()` using formula = 'y ~ x'

#StepModel
step_model <- step(glm(train_1$FullBathBinary ~ LotArea + YearBuilt + 
                   GarageCars + KitchenQual + BldgType + 
                   BedroomAbvGr, data = train_1, family = binomial), 
                   direction = "both")

## Start:  AIC=1161.78
## train_1$FullBathBinary ~ LotArea + YearBuilt + GarageCars + KitchenQual + 
##     BldgType + BedroomAbvGr
## 
##                Df Deviance    AIC
## <none>              1137.8 1161.8
## - LotArea       1   1146.6 1168.6
## - GarageCars    1   1197.0 1219.0
## - BldgType      4   1213.7 1229.7
## - KitchenQual   3   1214.4 1232.4
## - YearBuilt     1   1245.6 1267.6
## - BedroomAbvGr  1   1315.2 1337.2

summary(step_model)

## 
## Call:
## glm(formula = train_1$FullBathBinary ~ LotArea + YearBuilt + 
##     GarageCars + KitchenQual + BldgType + BedroomAbvGr, family = binomial, 
##     data = train_1)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     7.161e+01  6.715e+00  10.664  < 2e-16 ***
## LotArea        -3.422e-05  1.467e-05  -2.332 0.019688 *  
## YearBuilt      -3.351e-02  3.366e-03  -9.955  < 2e-16 ***
## GarageCars     -1.035e+00  1.408e-01  -7.353 1.94e-13 ***
## KitchenQualFa   4.269e-01  6.459e-01   0.661 0.508688    
## KitchenQualGd  -1.020e-01  3.677e-01  -0.277 0.781514    
## KitchenQualTA   1.335e+00  3.706e-01   3.602 0.000316 ***
## BldgType2fmCon -1.910e+00  4.904e-01  -3.896 9.80e-05 ***
## BldgTypeDuplex -3.243e+00  5.414e-01  -5.991 2.09e-09 ***
## BldgTypeTwnhs  -1.343e+00  4.540e-01  -2.958 0.003093 ** 
## BldgTypeTwnhsE -8.860e-01  2.904e-01  -3.051 0.002277 ** 
## BedroomAbvGr   -1.445e+00  1.240e-01 -11.655  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2006.4  on 1459  degrees of freedom
## Residual deviance: 1137.8  on 1448  degrees of freedom
## AIC: 1161.8
## 
## Number of Fisher Scoring iterations: 6

ypred_test_step <- predict(step_model, newdata = testset_log, type = "response")
head(ypred_test_step)

##           1           2           3           4           5           6 
## 0.499255943 0.007507347 0.896949323 0.083572061 0.793495758 0.007615266

#Comportamiento en las predicciones:
#Variable Binaria (0 = Espacio para 1 vehiculo, 1 =Espacio para mas de 1 vehiculo)
table(train_1$FullBath)

## 
##   0   2 
## 810 650

train_1$FullBath_prob <- 
  predict(modelo_log, type = "response")

head(train_1$FullBath_prob)

##           1           2           3           4           5           6 
## 0.090680061 0.499255943 0.088333778 0.394188620 0.007507347 0.896949323

#Instalamos pRoc
library(pROC)

## Warning: package 'pROC' was built under R version 4.3.3

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Creando la ROC curve
ROC1 <- roc(train_1$FullBath, train_1$FullBath_prob) # Añadido un paréntesis que falta

## Setting levels: control = 0, case = 2

## Setting direction: controls < cases

#Indicadores de precisión y exactitud
plot(ROC1, col = "red")

#Area bajo la curva
auc(ROC1) # Llamando a la función auc() con el objeto ROC1

## Area under the curve: 0.9083

Regresion Logistica - Metodo Supervisado

Valeria Gamarra - María Camila Mejía Blanco - Camilo Andres Porras Madiedo

2024-06-02

REGRESION LOGISTICA