This exercise carries out an exploratory analysis of a dataset on the personnel of a company, with 14,999 instances and 10 attributes.
The goal is to arrive at a suitable model with acceptable performance, interpreting each step of the reasoning needed to get there.
The packages required by the analysis are loaded first (the package startup and masking messages emitted by dplyr, corrplot, caret, data.table and randomForest are omitted here):
library(dplyr)         # data manipulation (sample_frac, setdiff, select, filter)
library(ggplot2)       # scatter and box plots
library(corrplot)      # correlation matrix plots
library(caret)         # train() and confusionMatrix()
library(rpart)         # decision trees
library(rpart.plot)    # tree visualisation
library(randomForest)  # random forests
library(mctest)        # imcdiag() multicollinearity diagnostics
library(fastDummies)   # dummy_cols()
library(leaps)         # regsubsets()
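The analysis assumes a data frame `raw_data` is already in memory. A minimal loading sketch (the file name is an assumption, not taken from the original report):
# Hypothetical load step; "HR_comma_sep.csv" is an assumed file name.
# stringsAsFactors = TRUE reproduces the factor columns seen in str() below.
raw_data <- read.csv("HR_comma_sep.csv", stringsAsFactors = TRUE)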
raw_data
summary(raw_data)
left salary satisfaction_level last_evaluation number_project average_montly_hours
Min. :0.0000 high :1237 Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
1st Qu.:0.0000 low :7316 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
Median :0.0000 medium:6446 Median :0.6400 Median :0.7200 Median :4.000 Median :200.0
Mean :0.2381 Mean :0.6128 Mean :0.7161 Mean :3.803 Mean :201.1
3rd Qu.:0.0000 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
time_spend_company Work_accident promotion_last_5years sales
Min. : 2.000 Min. :0.0000 Min. :0.00000 sales :4140
1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.00000 technical :2720
Median : 3.000 Median :0.0000 Median :0.00000 support :2229
Mean : 3.498 Mean :0.1446 Mean :0.02127 IT :1227
3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.00000 product_mng: 902
Max. :10.000 Max. :1.0000 Max. :1.00000 marketing : 858
(Other) :2923
str(raw_data)
'data.frame': 14999 obs. of 10 variables:
$ left : int 1 1 1 1 1 1 1 1 1 1 ...
$ salary : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...
$ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
$ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
$ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
$ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
$ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
$ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sales : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...
CantidadNulos <- sapply(raw_data, function(x) sum(is.na(x)))  # NA count per column
data.frame(CantidadNulos)
par(mfrow = c(1,2))
boxplot(raw_data$satisfaction_level, main = "Satisfaction Level")
hist(raw_data$satisfaction_level, main = "Distribution of Satisfaction Level", freq = F)
lines(density(raw_data$satisfaction_level), col = "red", lwd=2)
par(mfrow = c(1,2))
boxplot(raw_data$last_evaluation, main = "Last Evaluation")
hist(raw_data$last_evaluation, main = "Distribution of Last Evaluation", freq = F)
lines(density(raw_data$last_evaluation), col = "red", lwd=2)
par(mfrow = c(1,2))
boxplot(raw_data$number_project, main = "Number of Projects")
hist(raw_data$number_project, main = "Distribution of Number of Projects")
par(mfrow = c(1,2))
boxplot(raw_data$average_montly_hours, main = "Average Monthly Hours")
hist(raw_data$average_montly_hours, main = "Distribution of Average Monthly Hours", freq = F)
lines(density(raw_data$average_montly_hours), col = "red", lwd=2)
par(mfrow = c(1,2))
boxplot(raw_data$time_spend_company, main = "Time Spent at Company")
hist(raw_data$time_spend_company, main = "Distribution of Time Spent at Company")
colorSalary = rainbow(nlevels((as.factor(raw_data$salary))))
colorYN = rainbow(nlevels((as.factor(raw_data$Work_accident))))
colorSales = rainbow(nlevels((as.factor(raw_data$sales))))
par(mfrow = c(1,2))
barplot(summary(raw_data$salary), main = "Distribution of 'Salary'",
        col = colorSalary)
# Factor levels are ordered "0", "1", so the labels must be No first, then Yes
pie(summary(as.factor(raw_data$promotion_last_5years)), labels = c("No", "Yes"), main = "Distribution of 'Promotion'", col = colorYN)
par(mfrow = c(1,2))
pie(summary(as.factor(raw_data$Work_accident)), labels = c("No", "Yes"), main = "Distribution of 'Work Accident'", col = colorYN)
pie(summary(as.factor(raw_data$left)), labels = c("No", "Yes"), main = "Distribution of 'Left'", col = colorYN)
barplot(summary(raw_data$sales), main = "Distribution of 'Sales'", col = colorSales)
legend("topleft", legend = levels(raw_data$sales), cex = 0.8, fill = colorSales)
corrplot(cor(select(raw_data, -c("salary", "sales"))), type="upper", method="pie")
cor(select(raw_data, -c("salary", "sales")))
left satisfaction_level last_evaluation number_project average_montly_hours
left 1.00000000 -0.38837498 0.006567120 0.023787185 0.071287179
satisfaction_level -0.38837498 1.00000000 0.105021214 -0.142969586 -0.020048113
last_evaluation 0.00656712 0.10502121 1.000000000 0.349332589 0.339741800
number_project 0.02378719 -0.14296959 0.349332589 1.000000000 0.417210634
average_montly_hours 0.07128718 -0.02004811 0.339741800 0.417210634 1.000000000
time_spend_company 0.14482217 -0.10086607 0.131590722 0.196785891 0.127754910
Work_accident -0.15462163 0.05869724 -0.007104289 -0.004740548 -0.010142888
promotion_last_5years -0.06178811 0.02560519 -0.008683768 -0.006063958 -0.003544414
time_spend_company Work_accident promotion_last_5years
left 0.144822175 -0.154621634 -0.061788107
satisfaction_level -0.100866073 0.058697241 0.025605186
last_evaluation 0.131590722 -0.007104289 -0.008683768
number_project 0.196785891 -0.004740548 -0.006063958
average_montly_hours 0.127754910 -0.010142888 -0.003544414
time_spend_company 1.000000000 0.002120418 0.067432925
Work_accident 0.002120418 1.000000000 0.039245435
promotion_last_5years 0.067432925 0.039245435 1.000000000
imcdiag(select(raw_data, -c("left","salary", "sales")), raw_data$left)
Call:
imcdiag(x = select(raw_data, -c("left", "salary", "sales")),
y = raw_data$left)
All Individual Multicollinearity Diagnostics Result
VIF TOL Wi Fi Leamer CVIF Klein
satisfaction_level 1.0634 0.9404 158.4444 190.1460 0.9697 1.0801 0
last_evaluation 1.2405 0.8061 600.9249 721.1580 0.8978 1.2600 1
number_project 1.3524 0.7394 880.6199 1056.8143 0.8599 1.3737 1
average_montly_hours 1.2790 0.7819 697.0355 836.4983 0.8842 1.2990 1
time_spend_company 1.0605 0.9429 151.2186 181.4744 0.9710 1.0772 0
Work_accident 1.0053 0.9948 13.1418 15.7712 0.9974 1.0210 0
promotion_last_5years 1.0076 0.9925 18.9554 22.7480 0.9962 1.0234 0
1 --> COLLINEARITY is detected by the test
0 --> COLLINEARITY is not detected by the test
* all coefficients have significant t-ratios
R-square of y on all x: 0.1927
* use method argument to check which regressors may be the reason of collinearity
===================================
ggplot(raw_data,
aes(x = satisfaction_level, y = last_evaluation, color = as.factor(left))) + geom_point()
ggplot(raw_data,
       aes(x = as.factor(number_project), y = satisfaction_level, color = as.factor(left))) + geom_boxplot()
Instances with more than 4 projects and satisfaction levels between 0.10 and 0.70 account for most of the departures from the company.
Instances with fewer than 4 projects mostly show satisfaction levels between 0.50 and 0.80.
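These visual observations can also be checked numerically. A quick sketch with dplyr (the summary column names are ours):
# Share of leavers and mean satisfaction for each project count
raw_data %>%
  group_by(number_project) %>%
  summarise(leave_rate = mean(left),
            mean_satisfaction = mean(satisfaction_level))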
ggplot(raw_data,
aes(x = average_montly_hours, y = satisfaction_level, color = as.factor(left))) + geom_boxplot()
ggplot(raw_data,
       aes(x = as.factor(time_spend_company), y = satisfaction_level, color = as.factor(left))) + geom_boxplot()
data_train_1 <- sample_frac(raw_data, 0.7)
prop.table(table(data_train_1$left))
0 1
0.7590247 0.2409753
# Note: dplyr::setdiff drops duplicate rows, so the test set loses repeated
# observations; this is why its class proportions below differ from the training set's.
data_test_1 <- setdiff(raw_data, data_train_1)
prop.table(table(data_test_1$left))
0 1
0.8801029 0.1198971
data_train_1$left <- factor(data_train_1$left)
data_test_1$left <- factor(data_test_1$left)
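Given the duplicate-dropping behaviour of setdiff noted above, an index-based split would preserve the original class balance. A minimal sketch (variable names are ours):
# 70/30 split by row index, keeping duplicate rows
idx <- sample(nrow(raw_data), size = floor(0.7 * nrow(raw_data)))
train_alt <- raw_data[idx, ]
test_alt  <- raw_data[-idx, ]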
tree_1 <- rpart(formula = left ~ ., data = data_train_1)
rpart.plot(tree_1)
prediccion <- predict(tree_1, newdata = data_test_1, type = "class")
confusionMatrix(prediccion, data_test_1[["left"]])
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 2708 29
1 30 344
Accuracy : 0.981
95% CI : (0.9756, 0.9855)
No Information Rate : 0.8801
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9102
Mcnemar's Test P-Value : 1
Sensitivity : 0.9890
Specificity : 0.9223
Pos Pred Value : 0.9894
Neg Pred Value : 0.9198
Prevalence : 0.8801
Detection Rate : 0.8705
Detection Prevalence : 0.8798
Balanced Accuracy : 0.9556
'Positive' Class : 0
glm.model <- glm(formula = left ~ ., data = data_train_1, family = binomial(logit))
summary(glm.model)
Call:
glm(formula = left ~ ., family = binomial(logit), data = data_train_1)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3066 -0.6690 -0.4022 -0.1078 3.0220
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.3546828 0.2280553 -5.940 2.85e-09 ***
salarylow 1.8941971 0.1511511 12.532 < 2e-16 ***
salarymedium 1.3888745 0.1520547 9.134 < 2e-16 ***
satisfaction_level -4.1580605 0.1168443 -35.586 < 2e-16 ***
last_evaluation 0.6592715 0.1781232 3.701 0.000215 ***
number_project -0.3017375 0.0254097 -11.875 < 2e-16 ***
average_montly_hours 0.0043428 0.0006187 7.020 2.22e-12 ***
time_spend_company 0.2880462 0.0187221 15.385 < 2e-16 ***
Work_accident -1.5825269 0.1083561 -14.605 < 2e-16 ***
promotion_last_5years -1.6161117 0.3120835 -5.178 2.24e-07 ***
saleshr 0.1587844 0.1548761 1.025 0.305252
salesIT -0.2104590 0.1436538 -1.465 0.142909
salesmanagement -0.4990663 0.1932009 -2.583 0.009790 **
salesmarketing -0.1203607 0.1571524 -0.766 0.443745
salesproduct_mng -0.2351966 0.1533708 -1.534 0.125149
salesRandD -0.7177087 0.1727671 -4.154 3.26e-05 ***
salessales -0.1145991 0.1207182 -0.949 0.342463
salessupport -0.0620536 0.1287843 -0.482 0.629919
salestechnical -0.0922945 0.1262020 -0.731 0.464581
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 11595.1 on 10498 degrees of freedom
Residual deviance: 9038.5 on 10480 degrees of freedom
AIC: 9076.5
Number of Fisher Scoring iterations: 5
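The logistic coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are easier to read. For example, relative to the high-salary reference level, a low salary multiplies the odds of leaving by roughly exp(1.89) ≈ 6.6:
# Odds ratios for the fitted logistic model
exp(coef(glm.model))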
# round() classifies at the 0.5 probability threshold
lgm.predict <- round(predict(glm.model, data_test_1, type = "response"))
lgm.predict <- factor(lgm.predict)
confusionMatrix(lgm.predict, data_test_1$left)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 2542 231
1 196 142
Accuracy : 0.8627
95% CI : (0.8502, 0.8747)
No Information Rate : 0.8801
P-Value [Acc > NIR] : 0.99844
Kappa : 0.3222
Mcnemar's Test P-Value : 0.09989
Sensitivity : 0.9284
Specificity : 0.3807
Pos Pred Value : 0.9167
Neg Pred Value : 0.4201
Prevalence : 0.8801
Detection Rate : 0.8171
Detection Prevalence : 0.8914
Balanced Accuracy : 0.6546
'Positive' Class : 0
rf.model <- randomForest(left~., data = data_train_1)
rf.prediction <- predict(rf.model, data_test_1, type = "class")
confusionMatrix(rf.prediction, data_test_1$left)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 2733 29
1 5 344
Accuracy : 0.9891
95% CI : (0.9848, 0.9924)
No Information Rate : 0.8801
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9467
Mcnemar's Test P-Value : 7.998e-05
Sensitivity : 0.9982
Specificity : 0.9223
Pos Pred Value : 0.9895
Neg Pred Value : 0.9857
Prevalence : 0.8801
Detection Rate : 0.8785
Detection Prevalence : 0.8878
Balanced Accuracy : 0.9602
'Positive' Class : 0
lda.model <- train(left ~., data = data_train_1, method = "lda")
lda.predict <- predict(lda.model, data_test_1)
confusionMatrix(lda.predict, data_test_1$left)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 2517 239
1 221 134
Accuracy : 0.8521
95% CI : (0.8392, 0.8644)
No Information Rate : 0.8801
P-Value [Acc > NIR] : 1.000
Kappa : 0.2845
Mcnemar's Test P-Value : 0.428
Sensitivity : 0.9193
Specificity : 0.3592
Pos Pred Value : 0.9133
Neg Pred Value : 0.3775
Prevalence : 0.8801
Detection Rate : 0.8091
Detection Prevalence : 0.8859
Balanced Accuracy : 0.6393
'Positive' Class : 0
To compare the models, each one is refitted on 10 random 70/30 splits and the test-set accuracy of every run is recorded:
# Helper: repeat the 70/30 split n times and collect the test accuracy of the
# model built by fit_fn and scored by predict_fn.
evaluate_model <- function(data, fit_fn, predict_fn, n = 10) {
  sapply(seq_len(n), function(i) {
    data_train <- sample_frac(data, 0.7)
    data_test  <- setdiff(data, data_train)
    data_train$left <- factor(data_train$left)
    data_test$left  <- factor(data_test$left)
    model <- fit_fn(data_train)
    pred  <- factor(predict_fn(model, data_test), levels = levels(data_test$left))
    confusionMatrix(pred, data_test$left)$overall["Accuracy"]
  })
}
results_tree <- evaluate_model(raw_data,
                               function(d) rpart(left ~ ., data = d),
                               function(m, d) predict(m, d, type = "class"))
results_log  <- evaluate_model(raw_data,
                               function(d) glm(left ~ ., data = d, family = "binomial"),
                               function(m, d) round(predict(m, d, type = "response")))
results_rf   <- evaluate_model(raw_data,
                               function(d) randomForest(left ~ ., data = d),
                               function(m, d) predict(m, d, type = "class"))
results_lda  <- evaluate_model(raw_data,
                               function(d) train(left ~ ., data = d, method = "lda"),
                               function(m, d) predict(m, d))
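The comparison table below presumably reports the mean accuracy over the 10 runs; one way to aggregate the results (a sketch):
# Mean test accuracy per model across the 10 runs
data.frame(tree = mean(results_tree), logistic = mean(results_log),
           random_forest = mean(results_rf), lda = mean(results_lda))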
| Decision Tree | Logistic Regression | Random Forest | LDA |
|---|---|---|---|
| 0.9889021 | 0.9701466 | 0.9910586 | 0.971527 |
data_set <- raw_data
# One-hot encode the categorical variables and drop the original columns
data_set <- dummy_cols(data_set, select_columns = c("sales", "salary"))
data_set$sales <- NULL
data_set$salary <- NULL
# Keep the region outside the dense clusters of leavers seen in the earlier scatter plot
data_set <- data_set %>% filter(satisfaction_level > 0.48) %>% filter(last_evaluation > 0.50)
ggplot(data_set,
aes(x = satisfaction_level, y = last_evaluation, color = as.factor(left))) + geom_point()
corrplot(cor(data_set), type="upper", method="pie")
# Note: 'left' itself is passed among the regressors here, which is why the
# output below lists it and reports an R-square of 1; strictly it should be
# excluded, e.g. imcdiag(select(data_set, -left), data_set$left)
imcdiag(data_set, data_set$left)
Call:
imcdiag(x = data_set, y = data_set$left)
All Individual Multicollinearity Diagnostics Result
VIF TOL Wi Fi Leamer CVIF Klein
left 1.4331 0.6978 211.5604 222.7179 0.8353 0 0
satisfaction_level 1.0209 0.9795 10.2101 10.7486 0.9897 0 0
last_evaluation 1.1213 0.8918 59.2678 62.3935 0.9443 0 0
number_project 1.0763 0.9291 37.2578 39.2228 0.9639 0 0
average_montly_hours 1.0875 0.9195 42.7347 44.9885 0.9589 0 0
time_spend_company 1.1922 0.8388 93.8901 98.8419 0.9158 0 0
Work_accident 1.0143 0.9859 6.9662 7.3336 0.9929 0 0
promotion_last_5years 1.0349 0.9663 17.0316 17.9298 0.9830 0 0
sales_accounting Inf 0.0000 Inf Inf 0.0000 NaN 0
sales_hr Inf 0.0000 Inf Inf 0.0000 NaN 0
sales_IT Inf 0.0000 Inf Inf 0.0000 NaN 0
sales_management Inf 0.0000 Inf Inf 0.0000 NaN 0
sales_marketing Inf 0.0000 Inf Inf 0.0000 NaN 0
sales_product_mng Inf 0.0000 Inf Inf 0.0000 NaN 0
sales_RandD Inf 0.0000 Inf Inf 0.0000 NaN 0
sales_sales Inf 0.0000 Inf Inf 0.0000 NaN 0
sales_support Inf 0.0000 Inf Inf 0.0000 NaN 0
sales_technical Inf 0.0000 Inf Inf 0.0000 NaN 0
salary_high Inf 0.0000 Inf Inf 0.0000 NaN 0
salary_low Inf 0.0000 Inf Inf 0.0000 NaN 0
salary_medium Inf 0.0000 Inf Inf 0.0000 NaN 0
1 --> COLLINEARITY is detected by the test
0 --> COLLINEARITY is not detected by the test
satisfaction_level , last_evaluation , number_project , average_montly_hours , time_spend_company , Work_accident , promotion_last_5years , sales_accounting , sales_hr , sales_IT , sales_management , sales_marketing , sales_product_mng , sales_RandD , sales_sales , sales_support , sales_technical , salary_high , coefficient(s) are non-significant may be due to multicollinearity
R-square of y on all x: 1
* use method argument to check which regressors may be the reason of collinearity
===================================
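The infinite VIFs are the dummy-variable trap: each complete set of indicator columns sums to 1, an exact linear dependence. fastDummies can drop one reference level per variable; a sketch (the data_set_nd name is ours, and the remove_selected_columns argument is assumed to be available in the installed fastDummies version):
# Encoding with one reference level dropped per categorical variable
data_set_nd <- dummy_cols(raw_data,
                          select_columns = c("sales", "salary"),
                          remove_first_dummy = TRUE,
                          remove_selected_columns = TRUE)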
models <- regsubsets(left~., data = data_set, nvmax = 5)
2 linear dependencies found
Reordering variables and trying again:
summary(models)
Subset selection object
Call: regsubsets.formula(left ~ ., data = data_set, nvmax = 5)
20 Variables (and intercept)
Forced in Forced out
satisfaction_level FALSE FALSE
last_evaluation FALSE FALSE
number_project FALSE FALSE
average_montly_hours FALSE FALSE
time_spend_company FALSE FALSE
Work_accident FALSE FALSE
promotion_last_5years FALSE FALSE
sales_accounting FALSE FALSE
sales_hr FALSE FALSE
sales_IT FALSE FALSE
sales_management FALSE FALSE
sales_marketing FALSE FALSE
sales_product_mng FALSE FALSE
sales_RandD FALSE FALSE
sales_sales FALSE FALSE
sales_support FALSE FALSE
salary_high FALSE FALSE
salary_low FALSE FALSE
sales_technical FALSE FALSE
salary_medium FALSE FALSE
1 subsets of each size up to 6
Selection Algorithm: exhaustive
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company
1 ( 1 ) " " " " " " " " "*"
2 ( 1 ) " " "*" " " " " "*"
3 ( 1 ) " " "*" " " "*" "*"
4 ( 1 ) " " "*" "*" "*" "*"
5 ( 1 ) " " "*" "*" "*" "*"
6 ( 1 ) " " "*" "*" "*" "*"
Work_accident promotion_last_5years sales_accounting sales_hr sales_IT sales_management
1 ( 1 ) " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " "
3 ( 1 ) " " " " " " " " " " " "
4 ( 1 ) " " " " " " " " " " " "
5 ( 1 ) " " " " " " " " " " " "
6 ( 1 ) "*" " " " " " " " " " "
sales_marketing sales_product_mng sales_RandD sales_sales sales_support sales_technical
1 ( 1 ) " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " "
3 ( 1 ) " " " " " " " " " " " "
4 ( 1 ) " " " " " " " " " " " "
5 ( 1 ) " " " " " " " " " " " "
6 ( 1 ) " " " " " " " " " " " "
salary_high salary_low salary_medium
1 ( 1 ) " " " " " "
2 ( 1 ) " " " " " "
3 ( 1 ) " " " " " "
4 ( 1 ) " " " " " "
5 ( 1 ) " " "*" " "
6 ( 1 ) " " "*" " "
plot(models)
data_set_1 <- select(data_set, c("left", "satisfaction_level", "last_evaluation", "number_project", "average_montly_hours", "time_spend_company", "Work_accident"))
dim(data_set_1)/dim(raw_data)  # fraction of rows and of columns retained
[1] 0.6527102 0.7000000
corrplot(cor(data_set_1), type="upper", method="pie")
# As before, 'left' is included among the regressors, hence the R-square of 1 below
imcdiag(data_set_1, data_set_1$left)
Call:
imcdiag(x = data_set_1, y = data_set_1$left)
All Individual Multicollinearity Diagnostics Result
VIF TOL Wi Fi Leamer CVIF Klein
left 1.4014 0.7135 654.5625 785.5553 0.8447 0 0
satisfaction_level 1.0195 0.9808 31.8390 38.2107 0.9904 0 0
last_evaluation 1.1180 0.8945 192.4022 230.9063 0.9458 0 0
number_project 1.0749 0.9303 122.0834 146.5151 0.9645 0 0
average_montly_hours 1.0860 0.9208 140.3014 168.3789 0.9596 0 0
time_spend_company 1.1487 0.8705 242.4752 291.0000 0.9330 0 0
Work_accident 1.0121 0.9880 19.7275 23.6755 0.9940 0 0
1 --> COLLINEARITY is detected by the test
0 --> COLLINEARITY is not detected by the test
satisfaction_level , last_evaluation , number_project , average_montly_hours , time_spend_company , Work_accident , coefficient(s) are non-significant may be due to multicollinearity
R-square of y on all x: 1
* use method argument to check which regressors may be the reason of collinearity
===================================
# Same evaluation protocol as before, reusing evaluate_model on the reduced dataset
results_tree <- evaluate_model(data_set_1,
                               function(d) rpart(left ~ ., data = d),
                               function(m, d) predict(m, d, type = "class"))
results_log  <- evaluate_model(data_set_1,
                               function(d) glm(left ~ ., data = d, family = "binomial"),
                               function(m, d) round(predict(m, d, type = "response")))
results_rf   <- evaluate_model(data_set_1,
                               function(d) randomForest(left ~ ., data = d),
                               function(m, d) predict(m, d, type = "class"))
results_lda  <- evaluate_model(data_set_1,
                               function(d) train(left ~ ., data = d, method = "lda"),
                               function(m, d) predict(m, d))
| Decision Tree | Logistic Regression | Random Forest | LDA |
|---|---|---|---|
| 0.9889021 | 0.9701466 | 0.9910586 | 0.971527 |
After cleaning the data and selecting variables, acceptable accuracy values were obtained. However, the models should still be evaluated individually in order to select the most suitable one.