La regresión logística es un tipo de modelo estadístico utilizado principalmente para la clasificación binaria, es decir, para predecir una de dos posibles categorías. A diferencia de la regresión lineal que predice un valor continuo, la regresión logística estima la probabilidad de que una instancia pertenezca a una clase específica.
#Importamos los datos
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
train_1 <- read_excel("C:/Users/Valeria/Downloads/train_1.xlsx")
View(train_1)
library(readxl)
test_2 <- read_excel("C:/Users/Valeria/Downloads/test_2.xlsx")
View(test_2)
#Verificar estructura de Datos
str(train_1)
## tibble [1,460 × 11] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
## $ LotArea : num [1:1460] 8450 9600 11250 9550 14260 ...
## $ Utilities : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ BldgType : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ YearBuilt : num [1:1460] 2003 1976 2001 1915 2000 ...
## $ FullBath : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
## $ BedroomAbvGr: num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenQual : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
## $ GarageCars : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
## $ Fence : chr [1:1460] "NA" "NA" "NA" "NA" ...
## $ SalePrice : num [1:1460] 208500 181500 223500 140000 250000 ...
str(test_2)
## tibble [1,459 × 11] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:1459] 1461 1462 1463 1464 1465 ...
## $ LotArea : num [1:1459] 11622 14267 13830 9978 5005 ...
## $ Utilities : chr [1:1459] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ BldgType : chr [1:1459] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ YearBuilt : num [1:1459] 1961 1958 1997 1998 1992 ...
## $ FullBath : num [1:1459] 1 1 2 2 2 2 2 2 1 1 ...
## $ BedroomAbvGr: num [1:1459] 2 3 3 3 2 3 3 3 2 2 ...
## $ KitchenQual : chr [1:1459] "TA" "Gd" "TA" "Gd" ...
## $ GarageCars : num [1:1459] 1 1 2 2 2 2 2 2 2 2 ...
## $ Fence : chr [1:1459] "MnPrv" "NA" "MnPrv" "NA" ...
## $ SalePrice : num [1:1459] 169277 187758 183584 179317 150730 ...
#Analizamos una variable que es categorica
train_1$FullBath<- ifelse(train_1$FullBath == 1, "2", 0)
table(train_1$FullBath)
##
## 0 2
## 810 650
train_1$FullBathBinary <- ifelse(train_1$FullBath > 1, 1, 0)
table(train_1$FullBathBinary)
##
## 0 1
## 810 650
# Creamos modelo de regresión logística
modelo_log <- glm(FullBathBinary ~ LotArea + YearBuilt +
GarageCars + KitchenQual + BldgType +
BedroomAbvGr, data = train_1, family = binomial)
# Resumen del modelo
summary(modelo_log)
##
## Call:
## glm(formula = FullBathBinary ~ LotArea + YearBuilt + GarageCars +
## KitchenQual + BldgType + BedroomAbvGr, family = binomial,
## data = train_1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.161e+01 6.715e+00 10.664 < 2e-16 ***
## LotArea -3.422e-05 1.467e-05 -2.332 0.019688 *
## YearBuilt -3.351e-02 3.366e-03 -9.955 < 2e-16 ***
## GarageCars -1.035e+00 1.408e-01 -7.353 1.94e-13 ***
## KitchenQualFa 4.269e-01 6.459e-01 0.661 0.508688
## KitchenQualGd -1.020e-01 3.677e-01 -0.277 0.781514
## KitchenQualTA 1.335e+00 3.706e-01 3.602 0.000316 ***
## BldgType2fmCon -1.910e+00 4.904e-01 -3.896 9.80e-05 ***
## BldgTypeDuplex -3.243e+00 5.414e-01 -5.991 2.09e-09 ***
## BldgTypeTwnhs -1.343e+00 4.540e-01 -2.958 0.003093 **
## BldgTypeTwnhsE -8.860e-01 2.904e-01 -3.051 0.002277 **
## BedroomAbvGr -1.445e+00 1.240e-01 -11.655 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2006.4 on 1459 degrees of freedom
## Residual deviance: 1137.8 on 1448 degrees of freedom
## AIC: 1161.8
##
## Number of Fisher Scoring iterations: 6
# Summarize the model results
modelo_log$coefficients
## (Intercept) LotArea YearBuilt GarageCars KitchenQualFa
## 7.160534e+01 -3.421576e-05 -3.350612e-02 -1.035281e+00 4.268887e-01
## KitchenQualGd KitchenQualTA BldgType2fmCon BldgTypeDuplex BldgTypeTwnhs
## -1.019841e-01 1.335082e+00 -1.910414e+00 -3.243399e+00 -1.343158e+00
## BldgTypeTwnhsE BedroomAbvGr
## -8.860031e-01 -1.445421e+00
#Validando las predicciones de los datos en conjuntos de entrenamiento y prueba
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.3
set.seed(123)
split = sample.split(train_1$FullBathBinary, SplitRatio = 0.7)
trainset_log = subset(train_1, split == TRUE)
testset_log = subset(train_1, split == FALSE)
# Ajustar Regresion al conjunto de Prueba
modelo_log <- glm(train_1$FullBathBinary ~ LotArea + YearBuilt +
GarageCars + KitchenQual + BldgType +
BedroomAbvGr, data = train_1, family = binomial)
coef(modelo_log)
## (Intercept) LotArea YearBuilt GarageCars KitchenQualFa
## 7.160534e+01 -3.421576e-05 -3.350612e-02 -1.035281e+00 4.268887e-01
## KitchenQualGd KitchenQualTA BldgType2fmCon BldgTypeDuplex BldgTypeTwnhs
## -1.019841e-01 1.335082e+00 -1.910414e+00 -3.243399e+00 -1.343158e+00
## BldgTypeTwnhsE BedroomAbvGr
## -8.860031e-01 -1.445421e+00
# Analizando Resultados con los datos de Prueba
ypred_log = predict(modelo_log, newdata = testset_log, type = "response")
head(ypred_log)
## 1 2 3 4 5 6
## 0.499255943 0.007507347 0.896949323 0.083572061 0.793495758 0.007615266
# Predicción de los valores usando el conjunto de entrenamiento
trainset_log$ypred_log <- predict(modelo_log, newdata = trainset_log, type = "response")
#Graficamos los resultados de entrenamiento
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
ggplot(trainset_log, aes(x = FullBath, y = YearBuilt)) +
geom_point(color = 'red') +
geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE, color = 'blue') +
ggtitle('Cant. Banos Completos Vs Anos de Construccion (Entrenamiento)') +
xlab('Cant. Banos Completos') +
ylab('Anos de Construccion')
## `geom_smooth()` using formula = 'y ~ x'
# Predicción de los valores usando el conjunto de prueba
testset_log$ypred_log <- predict(modelo_log, newdata = testset_log, type = "response")
# Graficar los resultados de prueba
ggplot(testset_log, aes(x = FullBath, y = SalePrice)) +
geom_point(color = 'red') +
geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE, color = 'blue') +
ggtitle('Cant. Banos Completos Vs Precio de Venta (Prueba)') +
xlab('Cant. Banos Completos') +
ylab('Precio de Venta')
## `geom_smooth()` using formula = 'y ~ x'
#StepModel
step_model <- step(glm(train_1$FullBathBinary ~ LotArea + YearBuilt +
GarageCars + KitchenQual + BldgType +
BedroomAbvGr, data = train_1, family = binomial),
direction = "both")
## Start: AIC=1161.78
## train_1$FullBathBinary ~ LotArea + YearBuilt + GarageCars + KitchenQual +
## BldgType + BedroomAbvGr
##
## Df Deviance AIC
## <none> 1137.8 1161.8
## - LotArea 1 1146.6 1168.6
## - GarageCars 1 1197.0 1219.0
## - BldgType 4 1213.7 1229.7
## - KitchenQual 3 1214.4 1232.4
## - YearBuilt 1 1245.6 1267.6
## - BedroomAbvGr 1 1315.2 1337.2
summary(step_model)
##
## Call:
## glm(formula = train_1$FullBathBinary ~ LotArea + YearBuilt +
## GarageCars + KitchenQual + BldgType + BedroomAbvGr, family = binomial,
## data = train_1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.161e+01 6.715e+00 10.664 < 2e-16 ***
## LotArea -3.422e-05 1.467e-05 -2.332 0.019688 *
## YearBuilt -3.351e-02 3.366e-03 -9.955 < 2e-16 ***
## GarageCars -1.035e+00 1.408e-01 -7.353 1.94e-13 ***
## KitchenQualFa 4.269e-01 6.459e-01 0.661 0.508688
## KitchenQualGd -1.020e-01 3.677e-01 -0.277 0.781514
## KitchenQualTA 1.335e+00 3.706e-01 3.602 0.000316 ***
## BldgType2fmCon -1.910e+00 4.904e-01 -3.896 9.80e-05 ***
## BldgTypeDuplex -3.243e+00 5.414e-01 -5.991 2.09e-09 ***
## BldgTypeTwnhs -1.343e+00 4.540e-01 -2.958 0.003093 **
## BldgTypeTwnhsE -8.860e-01 2.904e-01 -3.051 0.002277 **
## BedroomAbvGr -1.445e+00 1.240e-01 -11.655 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2006.4 on 1459 degrees of freedom
## Residual deviance: 1137.8 on 1448 degrees of freedom
## AIC: 1161.8
##
## Number of Fisher Scoring iterations: 6
ypred_test_step <- predict(step_model, newdata = testset_log, type = "response")
head(ypred_test_step)
## 1 2 3 4 5 6
## 0.499255943 0.007507347 0.896949323 0.083572061 0.793495758 0.007615266
#Comportamiento en las predicciones:
#Variable Binaria (0 = Espacio para 1 vehiculo, 1 =Espacio para mas de 1 vehiculo)
table(train_1$FullBath)
##
## 0 2
## 810 650
train_1$FullBath_prob <-
predict(modelo_log, type = "response")
head(train_1$FullBath_prob)
## 1 2 3 4 5 6
## 0.090680061 0.499255943 0.088333778 0.394188620 0.007507347 0.896949323
#Instalamos pRoc
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Creando la ROC curve
ROC1 <- roc(train_1$FullBath, train_1$FullBath_prob) # Añadido un paréntesis que falta
## Setting levels: control = 0, case = 2
## Setting direction: controls < cases
#Indicadores de precisión y exactitud
plot(ROC1, col = "red")
#Area bajo la curva
auc(ROC1) # Llamando a la función auc() con el objeto ROC1
## Area under the curve: 0.9083