objects(); ls()
## character(0)
## character(0)
wd="D:/Maestria/Primer Semestre/Métodos/Proyecto"
setwd(wd)
library("easypackages")
lib_req<-c("car","lmtest","visdat","corrplot","MASS","olsrr","pROC")
easypackages::packages(lib_req)
#### Cargando Los Datos
stroke <- read.table("healthcare-dataset-stroke-data.csv",sep=",",header=T,dec=".",na.strings="N/A")
visdat::vis_miss(stroke) # Visualización de Datos Faltantes
summary(stroke)
## id gender age hypertension
## Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000
## 1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000
## Median :36932 Mode :character Median :45.00 Median :0.00000
## Mean :36518 Mean :43.23 Mean :0.09746
## 3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000
## Max. :72940 Max. :82.00 Max. :1.00000
##
## heart_disease ever_married work_type Residence_type
## Min. :0.00000 Length:5110 Length:5110 Length:5110
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.05401
## 3rd Qu.:0.00000
## Max. :1.00000
##
## avg_glucose_level bmi smoking_status stroke
## Min. : 55.12 Min. :10.30 Length:5110 Min. :0.00000
## 1st Qu.: 77.25 1st Qu.:23.50 Class :character 1st Qu.:0.00000
## Median : 91.89 Median :28.10 Mode :character Median :0.00000
## Mean :106.15 Mean :28.89 Mean :0.04873
## 3rd Qu.:114.09 3rd Qu.:33.10 3rd Qu.:0.00000
## Max. :271.74 Max. :97.60 Max. :1.00000
## NA's :201
##Eliminar columna Id
stroke$id=NULL
## Revisión Columna Gender
unique(stroke$gender)
## [1] "Male" "Female" "Other"
stroke=transform(stroke, gender=factor(gender,levels=c("Male","Female","Other"),labels=c("Male","Female","Other")))
tableGender = table(stroke$gender)
legend = c(paste("Male: ",tableGender[1]), paste("Female: ",tableGender[2]),paste("Other: ",tableGender[3]))
barplot(tableGender ,col=c("blue","red","black") , legen=legend)
#eliminando el registro con genero other
stroke = stroke[!(stroke$gender=="Other"),]
stroke=transform(stroke, gender=factor(gender,levels=c("Male","Female"),labels=c("Male","Female")))
tableGender = table(stroke$gender)
legend = c(paste("Male: ",tableGender[1]), paste("Female: ",tableGender[2]))
barplot(tableGender ,col=c("blue","red") , legen=legend)
## Revisión Columna Age
par(mfrow=c(1,2))
hist(x = stroke$age)
boxplot(stroke$age)
## Revision Columna hypertension
unique(stroke$hypertension)
## [1] 0 1
stroke=transform(stroke, hypertension=factor(hypertension,levels=0:1,labels=c("No Tiene Hipertension","Si Tiene Hipertension")))
tableHypertension = table(stroke$hypertension)
legend = c(paste("No Tiene Hipertension: ",tableHypertension[1]), paste("Si Tiene Hipertencion: ",tableHypertension[2]))
barplot(tableHypertension ,col=c("Blue","red") , legen=legend)
## Revision Columna heart_disease
unique(stroke$heart_disease)
## [1] 1 0
stroke=transform(stroke, heart_disease=factor(heart_disease,levels=0:1,labels=c("No Tiene Enfermedades del Corazon","Si Tiene Enfermedades del Corazon")))
tableHeart_disease = table(stroke$heart_disease)
legend = c(paste("No Tiene Enfermedades del Corazon: ",tableHeart_disease[1]), paste("Si Tiene Enfermedades del Corazon: ",tableHeart_disease[2]))
barplot(tableHeart_disease ,col=c("Blue","red") , legen=legend)
## Revision Columna ever_married
unique(stroke$ever_married)
## [1] "Yes" "No"
stroke=transform(stroke, ever_married=factor(ever_married,levels=c("Yes","No"),labels=c("Si","No")))
tableEver_married = table(stroke$ever_married)
legend = c(paste("SÃ ha estado Casado: ",tableEver_married[1]), paste("No ha estado casado: ",tableEver_married[2]))
barplot(tableEver_married ,col=c("Blue","red") , legen=legend)
## Revision Columna work_type
unique(stroke$work_type)
## [1] "Private" "Self-employed" "Govt_job" "children"
## [5] "Never_worked"
stroke=transform(stroke, work_type=factor(work_type,levels=c("Private","Self-employed","Govt_job","children","Never_worked"),labels=c("Private","Self-employed","Govt_job","children","Never_worked")))
tableWork_type = table(stroke$work_type)
legend = c(tableWork_type[1],tableWork_type[2],tableWork_type[3],tableWork_type[4],tableWork_type[5])
barplot(tableWork_type ,col=c("Blue","red","black","pink","yellow") , legen=legend)
## Revision Columna Residence_type
unique(stroke$Residence_type)
## [1] "Urban" "Rural"
stroke=transform(stroke, Residence_type=factor(Residence_type,levels=c("Urban","Rural"),labels=c("Urbano","Rural")))
tableResidence_type = table(stroke$Residence_type)
legend = c(paste("Urbano: ",tableResidence_type[1]), paste("Rural: ",tableResidence_type[2]))
barplot(tableResidence_type ,col=c("Blue","red") , legen=legend)
## Revisión Columna avg_glucose_level
par(mfrow=c(1,2))
hist(x = stroke$avg_glucose_level)
boxplot(stroke$avg_glucose_level)
## Revisión Columna bmi
valoresNulosIMC = is.na(stroke$bmi)
promedioIMC = sum(stroke$bmi[!valoresNulosIMC])/length(stroke$bmi[!valoresNulosIMC])
stroke$bmi[valoresNulosIMC]=promedioIMC
par(mfrow=c(1,2))
hist(x = stroke$bmi)
boxplot(stroke$bmi)
## Revision Columna smoking_status
unique(stroke$smoking_status)
## [1] "formerly smoked" "never smoked" "smokes" "Unknown"
stroke=transform(stroke, smoking_status=factor(smoking_status,levels=c("formerly smoked","never smoked","smokes","Unknown"),labels=c("formerly smoked","never smoked","smokes","Unknown")))
tableSmoking_status = table(stroke$smoking_status)
legend = c(tableSmoking_status[1],tableSmoking_status[2],tableSmoking_status[3],tableSmoking_status[4])
barplot(tableSmoking_status ,col=c("Blue","red","black","pink") , legen=legend)
## Revision Columna stroke
unique(stroke$stroke)
## [1] 1 0
stroke=transform(stroke, stroke=factor(stroke,levels=0:1,labels=c("No","Si")))
tableStroke = table(stroke$stroke)
legend = c(paste("No Sufrió Accidente Cardiovascular: ",tableStroke[1]), paste("Si Sufrió Accidente Cardiovascular: ",tableStroke[2]))
barplot(tableStroke ,col=c("Blue","red") , legen=legend)
summary(stroke)
## gender age hypertension
## Male :2115 Min. : 0.08 No Tiene Hipertension:4611
## Female:2994 1st Qu.:25.00 Si Tiene Hipertension: 498
## Median :45.00
## Mean :43.23
## 3rd Qu.:61.00
## Max. :82.00
## heart_disease ever_married work_type
## No Tiene Enfermedades del Corazon:4833 Si:3353 Private :2924
## Si Tiene Enfermedades del Corazon: 276 No:1756 Self-employed: 819
## Govt_job : 657
## children : 687
## Never_worked : 22
##
## Residence_type avg_glucose_level bmi smoking_status
## Urbano:2596 Min. : 55.12 Min. :10.30 formerly smoked: 884
## Rural :2513 1st Qu.: 77.24 1st Qu.:23.80 never smoked :1892
## Median : 91.88 Median :28.40 smokes : 789
## Mean :106.14 Mean :28.89 Unknown :1544
## 3rd Qu.:114.09 3rd Qu.:32.80
## Max. :271.74 Max. :97.60
## stroke
## No:4860
## Si: 249
##
##
##
##
#### Graficando las relaciones entre las variables predictorias y la respuesta
par(mfrow=c(4,3))
tabla<-prop.table(table(stroke$stroke))
coord<-barplot(tabla, col=c("blue","red"),ylim=c(0,1), main="Stroke")
text(coord,tabla,labels=round(tabla,2), pos=3)
#Graficando las varaibla cuantitativas
lapply(names(stroke[,-c(1,3,4,5,6,7,10,11)]),function(y){
boxplot(stroke[,y]~stroke[,"stroke"],ylab= y, xlab="stroke",boxwex = 0.5,col=NULL)
stripchart(stroke[,y] ~ stroke[,"stroke"], vertical = T,
method = "jitter", pch = 19,
col = c("blue","red"), add = T)
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
#Graficando las variables cualitativas
lapply(names(stroke[,-c(2,8,9,11)]),function(y){
tabla<-prop.table(table(stroke$stroke,stroke[,y]))
coord<-barplot(tabla, col=c("blue","red"),ylim=c(0,1), main=paste("Stroke Vs ",y) )
text(coord,tabla,labels=round(tabla,2), pos=3)
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
#### Ajustando un modelo Saturado.
Modelo_saturado = glm(formula= stroke~., data=stroke,family = "binomial") # Ajusta un Modelo saturado --> todas las variables
summary(Modelo_saturado)
##
## Call:
## glm(formula = stroke ~ ., family = "binomial", data = stroke)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1266 -0.3199 -0.1639 -0.0868 3.5608
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -7.754204 0.579771 -13.375
## genderFemale -0.013535 0.141867 -0.095
## age 0.074977 0.005842 12.834
## hypertensionSi Tiene Hipertension 0.400596 0.164959 2.428
## heart_diseaseSi Tiene Enfermedades del Corazon 0.281023 0.191098 1.471
## ever_marriedNo 0.184643 0.225391 0.819
## work_typeSelf-employed -0.376874 0.166589 -2.262
## work_typeGovt_job -0.142818 0.206502 -0.692
## work_typechildren 0.836207 0.820461 1.019
## work_typeNever_worked -9.504686 309.249371 -0.031
## Residence_typeRural -0.083365 0.138323 -0.603
## avg_glucose_level 0.003976 0.001199 3.317
## bmi 0.003539 0.011288 0.313
## smoking_statusnever smoked -0.206053 0.175911 -1.171
## smoking_statussmokes 0.113882 0.215347 0.529
## smoking_statusUnknown -0.072098 0.208368 -0.346
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## genderFemale 0.923991
## age < 2e-16 ***
## hypertensionSi Tiene Hipertension 0.015163 *
## heart_diseaseSi Tiene Enfermedades del Corazon 0.141408
## ever_marriedNo 0.412664
## work_typeSelf-employed 0.023679 *
## work_typeGovt_job 0.489185
## work_typechildren 0.308112
## work_typeNever_worked 0.975481
## Residence_typeRural 0.546719
## avg_glucose_level 0.000911 ***
## bmi 0.753911
## smoking_statusnever smoked 0.241459
## smoking_statussmokes 0.596923
## smoking_statusUnknown 0.729333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1990.3 on 5108 degrees of freedom
## Residual deviance: 1581.1 on 5093 degrees of freedom
## AIC: 1613.1
##
## Number of Fisher Scoring iterations: 14
# Predicciones del modelo saturado
Prob_modelo_saturado = predict(Modelo_saturado,type = "response")
# Buscando el punto de corte para el modelo saturado
roc <- pROC::roc(stroke$stroke , Prob_modelo_saturado , auc = TRUE, ci = TRUE)
print(auc(roc))
## Area under the curve: 0.8469
pROC::plot.roc(roc, legacy.axes = TRUE, print.thres = "best", print.auc = TRUE,
auc.polygon = FALSE, max.auc.polygon = FALSE, auc.polygon.col = "gainsboro",
col = 2, grid = TRUE)
# Generar clasificaciones - Convertir probabilidades predichas en clasificaciones
pc=0.040 # Seleccionamos un punto de corte dicho por la curva ROC
Class_Modelo_saturado = as.factor(ifelse(Prob_modelo_saturado>pc,"Si","No"))
# Evaluando la bondad de clasificacion
table(stroke$stroke,Class_Modelo_saturado,dnn=c("observado","predicho")) # Matriz de confusión
## predicho
## observado No Si
## No 3379 1481
## Si 32 217
caret::confusionMatrix(Class_Modelo_saturado,stroke$stroke,positive = "Si")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Si
## No 3379 32
## Si 1481 217
##
## Accuracy : 0.7039
## 95% CI : (0.6911, 0.7164)
## No Information Rate : 0.9513
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1507
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.87149
## Specificity : 0.69527
## Pos Pred Value : 0.12780
## Neg Pred Value : 0.99062
## Prevalence : 0.04874
## Detection Rate : 0.04247
## Detection Prevalence : 0.33235
## Balanced Accuracy : 0.78338
##
## 'Positive' Class : Si
##
ICC_Class_Modelo_saturado=caret::confusionMatrix(Class_Modelo_saturado,stroke$stroke,positive = "Si")
ICC_Class_Modelo_saturado$byClass
## Sensitivity Specificity Pos Pred Value
## 0.87148594 0.69526749 0.12779741
## Neg Pred Value Precision Recall
## 0.99061859 0.12779741 0.87148594
## F1 Prevalence Detection Rate
## 0.22290704 0.04873752 0.04247407
## Detection Prevalence Balanced Accuracy
## 0.33235467 0.78337672
Ind_class=list()
Ind_class$Modelo_Saturado=ICC_Class_Modelo_saturado$byClass
data.frame(Ind_class)
## Modelo_Saturado
## Sensitivity 0.87148594
## Specificity 0.69526749
## Pos Pred Value 0.12779741
## Neg Pred Value 0.99061859
## Precision 0.12779741
## Recall 0.87148594
## F1 0.22290704
## Prevalence 0.04873752
## Detection Rate 0.04247407
## Detection Prevalence 0.33235467
## Balanced Accuracy 0.78337672
utilizar un algoritmo backward para eliminar variables que no aportan al modelo y así tener simplicidad y generar una mejor predicción
#### Realizar un backward para generar un modelo con menos variables.
Modelo_RLM1=step(Modelo_saturado,direction = "backward")
## Start: AIC=1613.06
## stroke ~ gender + age + hypertension + heart_disease + ever_married +
## work_type + Residence_type + avg_glucose_level + bmi + smoking_status
##
## Df Deviance AIC
## - smoking_status 3 1583.9 1609.9
## - gender 1 1581.1 1611.1
## - bmi 1 1581.2 1611.2
## - work_type 4 1587.3 1611.3
## - Residence_type 1 1581.4 1611.4
## - ever_married 1 1581.7 1611.7
## <none> 1581.1 1613.1
## - heart_disease 1 1583.2 1613.2
## - hypertension 1 1586.7 1616.7
## - avg_glucose_level 1 1591.8 1621.8
## - age 1 1785.5 1815.5
##
## Step: AIC=1609.88
## stroke ~ gender + age + hypertension + heart_disease + ever_married +
## work_type + Residence_type + avg_glucose_level + bmi
##
## Df Deviance AIC
## - gender 1 1584.0 1608.0
## - bmi 1 1584.0 1608.0
## - work_type 4 1590.1 1608.1
## - Residence_type 1 1584.4 1608.4
## - ever_married 1 1584.4 1608.4
## <none> 1583.9 1609.9
## - heart_disease 1 1586.4 1610.4
## - hypertension 1 1589.3 1613.3
## - avg_glucose_level 1 1594.5 1618.5
## - age 1 1790.2 1814.2
##
## Step: AIC=1607.97
## stroke ~ age + hypertension + heart_disease + ever_married +
## work_type + Residence_type + avg_glucose_level + bmi
##
## Df Deviance AIC
## - bmi 1 1584.1 1606.1
## - work_type 4 1590.2 1606.2
## - Residence_type 1 1584.5 1606.5
## - ever_married 1 1584.5 1606.5
## <none> 1584.0 1608.0
## - heart_disease 1 1586.6 1608.6
## - hypertension 1 1589.4 1611.4
## - avg_glucose_level 1 1594.8 1616.8
## - age 1 1790.2 1812.2
##
## Step: AIC=1606.06
## stroke ~ age + hypertension + heart_disease + ever_married +
## work_type + Residence_type + avg_glucose_level
##
## Df Deviance AIC
## - work_type 4 1590.2 1604.2
## - Residence_type 1 1584.6 1604.6
## - ever_married 1 1584.6 1604.6
## <none> 1584.1 1606.1
## - heart_disease 1 1586.7 1606.7
## - hypertension 1 1589.6 1609.6
## - avg_glucose_level 1 1595.6 1615.6
## - age 1 1793.3 1813.3
##
## Step: AIC=1604.23
## stroke ~ age + hypertension + heart_disease + ever_married +
## Residence_type + avg_glucose_level
##
## Df Deviance AIC
## - Residence_type 1 1590.8 1602.8
## - ever_married 1 1590.9 1602.9
## <none> 1590.2 1604.2
## - heart_disease 1 1593.1 1605.1
## - hypertension 1 1595.6 1607.6
## - avg_glucose_level 1 1602.6 1614.6
## - age 1 1810.2 1822.2
##
## Step: AIC=1602.76
## stroke ~ age + hypertension + heart_disease + ever_married +
## avg_glucose_level
##
## Df Deviance AIC
## - ever_married 1 1591.5 1601.5
## <none> 1590.8 1602.8
## - heart_disease 1 1593.6 1603.6
## - hypertension 1 1596.0 1606.0
## - avg_glucose_level 1 1603.2 1613.2
## - age 1 1811.4 1821.4
##
## Step: AIC=1601.45
## stroke ~ age + hypertension + heart_disease + avg_glucose_level
##
## Df Deviance AIC
## <none> 1591.5 1601.5
## - heart_disease 1 1594.4 1602.4
## - hypertension 1 1596.7 1604.7
## - avg_glucose_level 1 1603.6 1611.6
## - age 1 1847.3 1855.3
summary(Modelo_RLM1)
##
## Call:
## glm(formula = stroke ~ age + hypertension + heart_disease + avg_glucose_level,
## family = "binomial", data = stroke)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0587 -0.3215 -0.1732 -0.0828 3.7706
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -7.488996 0.357890 -20.925
## age 0.068920 0.005140 13.408
## hypertensionSi Tiene Hipertension 0.381396 0.162599 2.346
## heart_diseaseSi Tiene Enfermedades del Corazon 0.329972 0.187724 1.758
## avg_glucose_level 0.004121 0.001162 3.547
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## age < 2e-16 ***
## hypertensionSi Tiene Hipertension 0.01899 *
## heart_diseaseSi Tiene Enfermedades del Corazon 0.07879 .
## avg_glucose_level 0.00039 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1990.3 on 5108 degrees of freedom
## Residual deviance: 1591.4 on 5104 degrees of freedom
## AIC: 1601.4
##
## Number of Fisher Scoring iterations: 7
# Predicciones del modelo saturado
Prob_modelo_backward = predict(Modelo_RLM1,type = "response")
# Buscando el punto de corte para el modelo saturado
roc <- pROC::roc(stroke$stroke , Prob_modelo_backward , auc = TRUE, ci = TRUE)
print(auc(roc))
## Area under the curve: 0.8442
pROC::plot.roc(roc, legacy.axes = TRUE, print.thres = "best", print.auc = TRUE,
auc.polygon = FALSE, max.auc.polygon = FALSE, auc.polygon.col = "gainsboro",
col = 2, grid = TRUE)
# Generar clasificaciones - Convertir probabilidades predichas en clasificaciones
pc=0.032 # Seleccionamos un punto de corte dicho por la curva ROC
Class_Modelo_backward = as.factor(ifelse(Prob_modelo_backward>pc,"Si","No"))
# Evaluando la bondad de clasificacion
table(stroke$stroke,Class_Modelo_backward,dnn=c("observado","predicho")) # Matriz de confusión
## predicho
## observado No Si
## No 3109 1751
## Si 24 225
caret::confusionMatrix(Class_Modelo_backward,stroke$stroke,positive = "Si")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Si
## No 3109 24
## Si 1751 225
##
## Accuracy : 0.6526
## 95% CI : (0.6393, 0.6656)
## No Information Rate : 0.9513
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1266
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.90361
## Specificity : 0.63971
## Pos Pred Value : 0.11387
## Neg Pred Value : 0.99234
## Prevalence : 0.04874
## Detection Rate : 0.04404
## Detection Prevalence : 0.38677
## Balanced Accuracy : 0.77166
##
## 'Positive' Class : Si
##
ICC_Class_Modelo_backward=caret::confusionMatrix(Class_Modelo_backward,stroke$stroke,positive = "Si")
ICC_Class_Modelo_backward$byClass
## Sensitivity Specificity Pos Pred Value
## 0.90361446 0.63971193 0.11386640
## Neg Pred Value Precision Recall
## 0.99233961 0.11386640 0.90361446
## F1 Prevalence Detection Rate
## 0.20224719 0.04873752 0.04403993
## Detection Prevalence Balanced Accuracy
## 0.38676845 0.77166320
Ind_class$ModeloBackward=ICC_Class_Modelo_backward$byClass
data.frame(Ind_class)
## Modelo_Saturado ModeloBackward
## Sensitivity 0.87148594 0.90361446
## Specificity 0.69526749 0.63971193
## Pos Pred Value 0.12779741 0.11386640
## Neg Pred Value 0.99061859 0.99233961
## Precision 0.12779741 0.11386640
## Recall 0.87148594 0.90361446
## F1 0.22290704 0.20224719
## Prevalence 0.04873752 0.04873752
## Detection Rate 0.04247407 0.04403993
## Detection Prevalence 0.33235467 0.38676845
## Balanced Accuracy 0.78337672 0.77166320