rm(list=ls(all=TRUE))
library(rpart)
library(partykit)
## Loading required package: grid
library(Matrix)
library(missMDA)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(FactoMineR)
library(foreach)
library(nFactors)
## Loading required package: MASS
## Loading required package: psych
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:lattice':
##
## melanoma
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
ruta <-"D:/Dropbox/SantoTomas/20161/Intersemestral/Competencia/"
setwd(ruta)
##################### Leyendo las bases #################################
base<-read.csv(file = "customer_satisfaction_train.csv",header = T,sep = ",")
rownames(base)<-base$ID
base$ID<-NULL
base2<-read.csv(file = "sample_submission.csv",header = T,sep = ",")
rownames(base2)<-base2$ID
base2$ID<-NULL
#######################################################################
##################### Preparando las bases #################################
# Base de entrenamiento
X_train <- subset(base,subset = base$TARGET !='NA',select =-TARGET)
y_train <- subset(base,subset = base$TARGET !='NA',select =TARGET)
# Base de prueba
X_test <- subset(base,subset = is.na(base$TARGET),select =-TARGET)
y_test <- base2
#######################################################################
###### Identificando variables con varianza casi nula ################
set.seed(1234)
nzv<-nearZeroVar(X_train,allowParallel = T)
X_train<-X_train[,-nzv]
X_test<-X_test[,-nzv]
#######################################################################
##### Removiendo variables identicas #####################################
features_pair <- combn(names(X_train), 2, simplify = F)
toRemove <- c()
for(pair in features_pair) {
f1 <- pair[1]
f2 <- pair[2]
if (!(f1 %in% toRemove) & !(f2 %in% toRemove)) {
if (all(X_train[[f1]] == X_train[[f2]])) {
cat(f1, "y", f2, "son iguales.\n")
toRemove <- c(toRemove, f2)
}
}
}
## ind_var37_0 y ind_var37 son iguales.
feature.names <- setdiff(names(X_train), toRemove)
#######################################################################
#### Filtrando por variables importantes ##############################
X_train <- X_train[, feature.names]
X_test <- X_test[, feature.names]
X<-rbind(X_train,X_test)
#######################################################################
############ Identificando variables con valores perdidos #############
perdidos<-matrix(NA,nrow = ncol(X))
for(i in 1:ncol(X)){
perdidos[i,]<-sum(is.na(X[,i]))
}
#perdidos
cat("Variables con valores perdidos", names(X[,perdidos>0]), "\n")
## Variables con valores perdidos var15 num_var4 var38
#######################################################################
########### Imputando datos ############################################
#Análisis de componentes principales
# Determine Number of Factors to Extract
res.pca<-PCA(X)
## Warning in PCA(X): Missing values are imputed by the mean of the variable:
## you should use the imputePCA function of the missMDA package


ev <- res.pca$eig$eigenvalue # get eigenvalues
ap <- parallel(subject=nrow(X), var=ncol(X), rep=100,cent=.05)
nS <- nScree(ev, aparallel=ap$eigen$qevpea)
vp <- min(nS$Components)+1
#Imputando datos
X.comp<-imputePCA(X,ncp = vp)
X<-data.frame(X.comp$completeObs)
row.names(X)<-row.names(base)
#var15
summary(base$var15)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.0 23.0 27.0 33.2 40.0 105.0 15119
X$var15 <- round(X$var15)
summary(X$var15)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 24.00 30.00 33.21 38.00 105.00
#num_var4
summary(base$num_var4)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 1.00 1.08 1.00 7.00 3430
X$num_var4 <- round(X$num_var4)
summary(X$num_var4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.078 1.000 7.000
#var38
summary(base$var38)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5164 67850 106500 117200 118800 22030000 5768
summary(X$var38)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5164 70450 114800 117200 118200 22030000
#######################################################################
## Preparando nuevamente los datos ####################################
X_train <- X[row.names(X_train),]
X_test <- X[row.names(X_test),]
train<-data.frame(X_train,y_train)
test <- data.frame(X_test,y_test)
bd<-train[complete.cases(train),]
bd<-bd[,apply(bd,2,sum)!=0]
bd$Class<-factor(bd$TARGET)
bd$TARGET<-NULL
levels(bd$Class)<-c("X0","X1")
table(bd$Class)
##
## X0 X1
## 43229 1771
test$Class<-factor(test$TARGET)
test$TARGET<-NULL
levels(test$Class)<-c("X0", "X1")
table(test$Class)
##
## X0 X1
## 31020 0
######################################################################
# Se preparan tres muestras con diferentes tipos de muestreo Datos originales,
# oversampling, undersampling, todas se van a comparar con la curva ROC
######################################################################
set.seed(1234)
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
orig_fit <- train(Class ~ ., data = bd,
method = "rpart",
metric = "ROC",
tuneLength = 30,
trControl = ctrl)
ctrl$sampling <- "down"
set.seed(1234)
down_outside <- train(Class~.,data=bd,
method = "rpart",
metric = "ROC",
tuneLength = 30,
trControl = ctrl)
ctrl$sampling <- "up"
set.seed(1234)
up_outside <- train(Class~.,data=bd,
method = "rpart",
metric = "ROC",
tuneLength = 30,
trControl = ctrl)
######################################################################
# Comparando los resultados
######################################################################
outside_models <- list(original = orig_fit,
down = down_outside,
up = up_outside)
outside_resampling <- resamples(outside_models)
summary(outside_resampling, metric = "ROC")
##
## Call:
## summary.resamples(object = outside_resampling, metric = "ROC")
##
## Models: original, down, up
## Number of resamples: 50
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## original 0.7275 0.7699 0.7845 0.7816 0.7974 0.8184 0
## down 0.6999 0.7550 0.7680 0.7653 0.7780 0.8034 0
## up 0.6788 0.7117 0.7390 0.7356 0.7541 0.7814 0
confusionMatrix(predict(orig_fit, test, type = "raw"),test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 30870 0
## X1 150 0
##
## Accuracy : 0.9952
## 95% CI : (0.9943, 0.9959)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9952
## Specificity : NA
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 1.0000
## Detection Rate : 0.9952
## Detection Prevalence : 0.9952
## Balanced Accuracy : NA
##
## 'Positive' Class : X0
##
confusionMatrix(predict(down_outside, test, type = "raw"),test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 20486 0
## X1 10534 0
##
## Accuracy : 0.6604
## 95% CI : (0.6551, 0.6657)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6604
## Specificity : NA
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 1.0000
## Detection Rate : 0.6604
## Detection Prevalence : 0.6604
## Balanced Accuracy : NA
##
## 'Positive' Class : X0
##
confusionMatrix(predict(up_outside, test, type = "raw"),test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 22904 0
## X1 8116 0
##
## Accuracy : 0.7384
## 95% CI : (0.7334, 0.7432)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7384
## Specificity : NA
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 1.0000
## Detection Rate : 0.7384
## Detection Prevalence : 0.7384
## Balanced Accuracy : NA
##
## 'Positive' Class : X0
##
resultados<-data.frame(TARGET=predict(orig_fit, test, type = "prob")[,"X1"])
rownames(resultados)<-rownames(test)
g_fit <- as.party(orig_fit$finalModel)
print(g_fit, header = FALSE)
## [1] root
## | [2] saldo_var30 >= 2.955
## | | [3] var15 < 25.5: X0 (n = 10697, err = 0.5%)
## | | [4] var15 >= 25.5
## | | | [5] num_var22_ult1 < 7.5
## | | | | [6] saldo_var30 >= 89.13
## | | | | | [7] imp_op_var41_ult1 < 3154.77
## | | | | | | [8] imp_op_var41_ult1 < 473.565: X0 (n = 10081, err = 1.3%)
## | | | | | | [9] imp_op_var41_ult1 >= 473.565
## | | | | | | | [10] saldo_medio_var5_ult3 >= 207.3: X0 (n = 1509, err = 2.2%)
## | | | | | | | [11] saldo_medio_var5_ult3 < 207.3
## | | | | | | | | [12] saldo_medio_var5_ult3 < 178.785
## | | | | | | | | | [13] saldo_var30 >= 170.655: X0 (n = 472, err = 3.2%)
## | | | | | | | | | [14] saldo_var30 < 170.655
## | | | | | | | | | | [15] num_meses_var39_vig_ult3 >= 1.5: X0 (n = 37, err = 2.7%)
## | | | | | | | | | | [16] num_meses_var39_vig_ult3 < 1.5: X1 (n = 9, err = 33.3%)
## | | | | | | | | [17] saldo_medio_var5_ult3 >= 178.785
## | | | | | | | | | [18] saldo_medio_var5_hace3 >= 0.705: X0 (n = 24, err = 12.5%)
## | | | | | | | | | [19] saldo_medio_var5_hace3 < 0.705: X1 (n = 9, err = 44.4%)
## | | | | | [20] imp_op_var41_ult1 >= 3154.77: X0 (n = 231, err = 9.1%)
## | | | | [21] saldo_var30 < 89.13
## | | | | | [22] saldo_medio_var5_ult3 >= 2.025
## | | | | | | [23] var15 < 38.5: X0 (n = 7014, err = 2.8%)
## | | | | | | [24] var15 >= 38.5
## | | | | | | | [25] saldo_var5 >= 30.015: X0 (n = 355, err = 3.4%)
## | | | | | | | [26] saldo_var5 < 30.015
## | | | | | | | | [27] imp_op_var41_ult1 < 1287.135: X0 (n = 992, err = 8.1%)
## | | | | | | | | [28] imp_op_var41_ult1 >= 1287.135
## | | | | | | | | | [29] saldo_medio_var5_ult1 >= 185.355: X0 (n = 20, err = 10.0%)
## | | | | | | | | | [30] saldo_medio_var5_ult1 < 185.355: X1 (n = 7, err = 28.6%)
## | | | | | [31] saldo_medio_var5_ult3 < 2.025
## | | | | | | [32] imp_op_var39_ult1 < 3283.005
## | | | | | | | [33] var38 >= 97116.285: X0 (n = 120, err = 6.7%)
## | | | | | | | [34] var38 < 97116.285
## | | | | | | | | [35] var38 < 92607: X0 (n = 109, err = 18.3%)
## | | | | | | | | [36] var38 >= 92607: X1 (n = 9, err = 44.4%)
## | | | | | | [37] imp_op_var39_ult1 >= 3283.005: X1 (n = 7, err = 42.9%)
## | | | [38] num_var22_ult1 >= 7.5
## | | | | [39] imp_op_var41_ult1 < 1483.41: X0 (n = 578, err = 6.7%)
## | | | | [40] imp_op_var41_ult1 >= 1483.41
## | | | | | [41] saldo_var30 >= 137.19
## | | | | | | [42] imp_op_var41_ult1 >= 2452.26: X0 (n = 43, err = 7.0%)
## | | | | | | [43] imp_op_var41_ult1 < 2452.26
## | | | | | | | [44] imp_op_var41_ult1 < 2155.68: X0 (n = 26, err = 15.4%)
## | | | | | | | [45] imp_op_var41_ult1 >= 2155.68: X1 (n = 7, err = 28.6%)
## | | | | | [46] saldo_var30 < 137.19
## | | | | | | [47] var15 < 44: X0 (n = 13, err = 38.5%)
## | | | | | | [48] var15 >= 44: X1 (n = 7, err = 0.0%)
## | [49] saldo_var30 < 2.955
## | | [50] var15 < 26.5: X0 (n = 5264, err = 2.3%)
## | | [51] var15 >= 26.5
## | | | [52] var38 >= 118837.4224: X0 (n = 1790, err = 6.9%)
## | | | [53] var38 < 118837.4224
## | | | | [54] var15 < 32.5: X0 (n = 2586, err = 11.9%)
## | | | | [55] var15 >= 32.5
## | | | | | [56] var36 < 2.5
## | | | | | | [57] saldo_var5 >= -147.045: X0 (n = 301, err = 7.3%)
## | | | | | | [58] saldo_var5 < -147.045: X1 (n = 7, err = 42.9%)
## | | | | | [59] var36 >= 2.5
## | | | | | | [60] ind_var41_0 < 0.5
## | | | | | | | [61] var38 >= 55353.135: X0 (n = 717, err = 13.9%)
## | | | | | | | [62] var38 < 55353.135
## | | | | | | | | [63] var15 >= 53.5: X0 (n = 30, err = 3.3%)
## | | | | | | | | [64] var15 < 53.5
## | | | | | | | | | [65] var15 < 51.5
## | | | | | | | | | | [66] num_var22_hace2 < 1.5
## | | | | | | | | | | | [67] var38 < 51212.22: X0 (n = 91, err = 16.5%)
## | | | | | | | | | | | [68] var38 >= 51212.22
## | | | | | | | | | | | | [69] var38 >= 52966.095: X0 (n = 13, err = 23.1%)
## | | | | | | | | | | | | [70] var38 < 52966.095: X1 (n = 14, err = 42.9%)
## | | | | | | | | | | [71] num_var22_hace2 >= 1.5: X1 (n = 8, err = 37.5%)
## | | | | | | | | | [72] var15 >= 51.5: X1 (n = 10, err = 40.0%)
## | | | | | | [73] ind_var41_0 >= 0.5
## | | | | | | | [74] saldo_medio_var5_ult1 >= 0.255: X0 (n = 89, err = 6.7%)
## | | | | | | | [75] saldo_medio_var5_ult1 < 0.255
## | | | | | | | | [76] num_var22_ult3 < 10.5
## | | | | | | | | | [77] var15 >= 80.5: X0 (n = 30, err = 0.0%)
## | | | | | | | | | [78] var15 < 80.5
## | | | | | | | | | | [79] var15 < 78.5
## | | | | | | | | | | | [80] ind_var37_0 < 0.5
## | | | | | | | | | | | | [81] num_var30_0 < 1.5: X0 (n = 24, err = 4.2%)
## | | | | | | | | | | | | [82] num_var30_0 >= 1.5
## | | | | | | | | | | | | | [83] num_var45_hace3 < 1.5
## | | | | | | | | | | | | | | [84] num_var30_0 < 4.5
## | | | | | | | | | | | | | | | [85] num_meses_var39_vig_ult3 < 1.5: X0 (n = 13, err = 0.0%)
## | | | | | | | | | | | | | | | [86] num_meses_var39_vig_ult3 >= 1.5
## | | | | | | | | | | | | | | | | [87] var15 < 48.5
## | | | | | | | | | | | | | | | | | [88] var15 >= 40.5: X0 (n = 351, err = 16.2%)
## | | | | | | | | | | | | | | | | | [89] var15 < 40.5
## | | | | | | | | | | | | | | | | | | [90] var38 >= 77746.155: X0 (n = 338, err = 19.8%)
## | | | | | | | | | | | | | | | | | | [91] var38 < 77746.155
## | | | | | | | | | | | | | | | | | | | [92] var38 < 75442.935
## | | | | | | | | | | | | | | | | | | | | [93] num_var22_hace3 >= 1.5: X0 (n = 7, err = 0.0%)
## | | | | | | | | | | | | | | | | | | | | [94] num_var22_hace3 < 1.5
## | | | | | | | | | | | | | | | | | | | | | [95] var15 >= 34.5
## | | | | | | | | | | | | | | | | | | | | | | [96] var38 >= 38555.55: X0 (n = 141, err = 19.9%)
## | | | | | | | | | | | | | | | | | | | | | | [97] var38 < 38555.55
## | | | | | | | | | | | | | | | | | | | | | | | [98] var38 < 36121.83: X0 (n = 29, err = 27.6%)
## | | | | | | | | | | | | | | | | | | | | | | | [99] var38 >= 36121.83: X1 (n = 8, err = 37.5%)
## | | | | | | | | | | | | | | | | | | | | | [100] var15 < 34.5: X0 (n = 39, err = 33.3%)
## | | | | | | | | | | | | | | | | | | | [101] var38 >= 75442.935: X1 (n = 9, err = 33.3%)
## | | | | | | | | | | | | | | | | [102] var15 >= 48.5
## | | | | | | | | | | | | | | | | | [103] var38 >= 118150.16656: X0 (n = 41, err = 9.8%)
## | | | | | | | | | | | | | | | | | [104] var38 < 118150.16656
## | | | | | | | | | | | | | | | | | | [105] var15 >= 66.5: X0 (n = 49, err = 16.3%)
## | | | | | | | | | | | | | | | | | | [106] var15 < 66.5
## | | | | | | | | | | | | | | | | | | | [107] num_var22_ult3 >= 1.5: X0 (n = 27, err = 11.1%)
## | | | | | | | | | | | | | | | | | | | [108] num_var22_ult3 < 1.5
## | | | | | | | | | | | | | | | | | | | | [109] var38 >= 33532.71
## | | | | | | | | | | | | | | | | | | | | | [110] var38 < 42311.775: X0 (n = 15, err = 6.7%)
## | | | | | | | | | | | | | | | | | | | | | [111] var38 >= 42311.775
## | | | | | | | | | | | | | | | | | | | | | | [112] var15 < 62.5
## | | | | | | | | | | | | | | | | | | | | | | | [113] var15 >= 52.5: X0 (n = 114, err = 22.8%)
## | | | | | | | | | | | | | | | | | | | | | | | [114] var15 < 52.5
## | | | | | | | | | | | | | | | | | | | | | | | | [115] var38 >= 98362.5: X0 (n = 34, err = 23.5%)
## | | | | | | | | | | | | | | | | | | | | | | | | [116] var38 < 98362.5
## | | | | | | | | | | | | | | | | | | | | | | | | | [117] var15 < 50.5
## | | | | | | | | | | | | | | | | | | | | | | | | | | [118] var38 >= 61897.92: X0 (n = 15, err = 26.7%)
## | | | | | | | | | | | | | | | | | | | | | | | | | | [119] var38 < 61897.92: X1 (n = 11, err = 45.5%)
## | | | | | | | | | | | | | | | | | | | | | | | | | [120] var15 >= 50.5: X1 (n = 16, err = 37.5%)
## | | | | | | | | | | | | | | | | | | | | | | [121] var15 >= 62.5
## | | | | | | | | | | | | | | | | | | | | | | | [122] var38 >= 117034.94951: X0 (n = 8, err = 37.5%)
## | | | | | | | | | | | | | | | | | | | | | | | [123] var38 < 117034.94951: X1 (n = 12, err = 41.7%)
## | | | | | | | | | | | | | | | | | | | | [124] var38 < 33532.71: X1 (n = 12, err = 41.7%)
## | | | | | | | | | | | | | | [125] num_var30_0 >= 4.5: X0 (n = 12, err = 50.0%)
## | | | | | | | | | | | | | [126] num_var45_hace3 >= 1.5
## | | | | | | | | | | | | | | [127] saldo_medio_var5_hace3 >= 19.77: X0 (n = 12, err = 0.0%)
## | | | | | | | | | | | | | | [128] saldo_medio_var5_hace3 < 19.77
## | | | | | | | | | | | | | | | [129] var15 >= 36.5
## | | | | | | | | | | | | | | | | [130] num_var4 < 0.5
## | | | | | | | | | | | | | | | | | [131] num_var22_ult3 >= 7.5: X0 (n = 10, err = 0.0%)
## | | | | | | | | | | | | | | | | | [132] num_var22_ult3 < 7.5
## | | | | | | | | | | | | | | | | | | [133] num_var45_hace2 >= 1.5: X0 (n = 40, err = 15.0%)
## | | | | | | | | | | | | | | | | | | [134] num_var45_hace2 < 1.5
## | | | | | | | | | | | | | | | | | | | [135] var38 >= 66294.09
## | | | | | | | | | | | | | | | | | | | | [136] var38 < 72870.285: X0 (n = 13, err = 7.7%)
## | | | | | | | | | | | | | | | | | | | | [137] var38 >= 72870.285
## | | | | | | | | | | | | | | | | | | | | | [138] var38 >= 76684.92
## | | | | | | | | | | | | | | | | | | | | | | [139] var38 < 82949.835: X0 (n = 12, err = 8.3%)
## | | | | | | | | | | | | | | | | | | | | | | [140] var38 >= 82949.835
## | | | | | | | | | | | | | | | | | | | | | | | [141] var38 >= 87674.115: X0 (n = 85, err = 22.4%)
## | | | | | | | | | | | | | | | | | | | | | | | [142] var38 < 87674.115: X1 (n = 7, err = 42.9%)
## | | | | | | | | | | | | | | | | | | | | | [143] var38 < 76684.92: X1 (n = 9, err = 44.4%)
## | | | | | | | | | | | | | | | | | | | [144] var38 < 66294.09
## | | | | | | | | | | | | | | | | | | | | [145] var38 < 59020.455: X0 (n = 33, err = 27.3%)
## | | | | | | | | | | | | | | | | | | | | [146] var38 >= 59020.455: X1 (n = 12, err = 33.3%)
## | | | | | | | | | | | | | | | | [147] num_var4 >= 0.5: X0 (n = 14, err = 50.0%)
## | | | | | | | | | | | | | | | [148] var15 < 36.5
## | | | | | | | | | | | | | | | | [149] num_var45_ult3 < 13.5: X0 (n = 43, err = 34.9%)
## | | | | | | | | | | | | | | | | [150] num_var45_ult3 >= 13.5: X1 (n = 7, err = 28.6%)
## | | | | | | | | | | | [151] ind_var37_0 >= 0.5: X0 (n = 17, err = 47.1%)
## | | | | | | | | | | [152] var15 >= 78.5: X1 (n = 8, err = 37.5%)
## | | | | | | | | [153] num_var22_ult3 >= 10.5
## | | | | | | | | | [154] ind_var30 < 0.5
## | | | | | | | | | | [155] var38 >= 84552.36: X0 (n = 13, err = 15.4%)
## | | | | | | | | | | [156] var38 < 84552.36: X1 (n = 7, err = 28.6%)
## | | | | | | | | | [157] ind_var30 >= 0.5: X1 (n = 7, err = 0.0%)
##
## Number of inner nodes: 78
## Number of terminal nodes: 79
plot(g_fit)

#Guardando resultados
write.csv(resultados,file="resultados.csv")
write.csv(bd,file="train.csv")
write.csv(test,file="test.csv")