Four algorithms will be run on a classification problem: a classification tree, a random forest, a gradient boosted model (GBM), and a support vector machine (SVM).

library(dplyr)        # Data manipulation
library(ggplot2)      # Plotting
library(caret)        # Partitioning, train() and confusionMatrix()
library(e1071)        # For SVM
library(rpart)        # For the classification tree
library(rpart.plot)   # For plotting rpart trees
library(randomForest) # For random forest
library(knitr)        # Report rendering
library(pander)       # Table formatting

Data:

For more detail: https://data.world/kramea/tv-commercial-detection
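
The step that reads the file into Base is not shown; the following is a minimal sketch, assuming the dataset was exported from data.world as a single CSV file (the file name below is a placeholder, not the actual path used).

Base <- read.csv("tv_commercial_detection.csv") # placeholder file name
dim(Base) # expected: 39252 rows and 204 columns, as shown below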

Data pre-processing

dim(Base)
## [1] 39252   204

Remove the variables that have more than 30% of their values missing.

pander(head(data.frame(ConteoNA=colSums(is.na(Base))) %>% arrange(-ConteoNA)))
Variable   ConteoNA
--------   --------
X128          39155
X212          38937
X223          38700
X882          38682
X229          38509
X621          38428
NA_razon=data.frame(Variable=names(Base), RazonNA=round(colSums(is.na(Base))/nrow(Base),4)*100) %>% arrange(-RazonNA)
Drop_var=NA_razon[NA_razon$RazonNA>30,]

variables=Drop_var$Variable
length(variables)
## [1] 96
Base2=Base %>% dplyr::select(-variables)
dim(Base2)
## [1] 39252   108

Impute the remaining variables that contain missing values by sampling random values from the observed data of the same variable.

impt=data.frame(variable=names(Base2), conteoNA=colSums(is.na(Base2))) %>% arrange(-conteoNA)
var_imp=impt[impt$conteoNA!=0,"variable"]
length(var_imp)
## [1] 80
impu_ale=function(x){
  # Replace the NAs of a vector with values sampled, without replacement,
  # from the observed values of that same vector.
  nas=is.na(x)
  n_na=sum(nas)
  x_obs=x[!nas]
  imp=x
  imp[nas]=sample(x = x_obs,size = n_na,replace = F)
  return(imp)
}

# Apply the imputation column by column, so that each variable is filled in
# from its own observed values.
datos_impu=as.data.frame(lapply(Base2[names(Base2) %in% var_imp], impu_ale))
dim(datos_impu)
## [1] 39252    80
Base3<-Base2 %>% dplyr::select(-var_imp) %>% cbind(datos_impu)
dim(Base3)
## [1] 39252   108
sum(colSums(is.na(Base3)))
## [1] 0
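
As a quick sanity check (not part of the original code), the distribution of an imputed variable can be compared before and after imputation; since the imputed values are drawn from the observed values of the same column, the two summaries should be very close.

v <- as.character(var_imp[1]) # first imputed variable, taken from var_imp
summary(Base2[[v]])           # before imputation (contains NAs)
summary(Base3[[v]])           # after imputation (no NAs)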

Explore the structure of the target variable.

summary(Base3$Label)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.0000 -1.0000  1.0000  0.2813  1.0000  1.0000
data=Base3 %>% mutate(Label=as.factor(recode(Label, "-1"="0", "1"="1")))
class(data$Label)
## [1] "factor"
# 1 : commercial
# 0 : not a commercial
contrasts(data$Label)
##   1
## 0 0
## 1 1
round(prop.table(table(data$Label)),4)*100
## 
##     0     1 
## 35.93 64.07

Data partitioning

set.seed(301020)
parti<-createDataPartition(y = data$Label,times = 1,p = 0.7,list = F)
data_train<-data[parti,]
data_test<-data[-parti,]

dim(data_train)
## [1] 27477   108
round(prop.table(table(data_train$Label)),2)
## 
##    0    1 
## 0.36 0.64
dim(data_test)
## [1] 11775   108
round(prop.table(table(data_test$Label)),2)
## 
##    0    1 
## 0.36 0.64

Classification tree

control_rpart<-rpart.control(minsplit = 50,minbucket = 8,cp = 0.02) # at least 50 obs. to attempt a split, at least 8 per terminal node, complexity parameter 0.02
modelo_rpart<-rpart(formula = Label~.,data = data_train,
                    method = "class",control =control_rpart)
rpart.plot(modelo_rpart)

y_pred_rpart<-predict(modelo_rpart,data_test,type = "class")
matrix_tree=confusionMatrix(y_pred_rpart,data_test$Label)
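
As an optional diagnostic (not part of the original analysis), the complexity-parameter table and the tree's variable importance can be inspected to judge whether cp = 0.02 prunes too aggressively.

printcp(modelo_rpart)                  # cross-validated error by tree size
head(modelo_rpart$variable.importance) # most influential predictors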

Random Forest

sqrt(dim(data)[2])
## [1] 10.3923
modelo_rf<-randomForest(Label~.,data=data_train,mtry=10,ntree=500) # mtry chosen as sqrt of the number of columns (≈ 10)

y_pred_rf<-predict(modelo_rf,data_test)
matrix_rf<- confusionMatrix(y_pred_rf,data_test$Label)
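
Optionally (not part of the original output), the out-of-bag error and the variable importance of the fitted forest can be examined:

print(modelo_rf)                  # OOB error estimate and OOB confusion matrix
varImpPlot(modelo_rf, n.var = 15) # top 15 variables by mean decrease in Gini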

Gradient Boosted Model

control_gbm<-trainControl(method = "cv",number = 5) # resampling: 5-fold cross-validation
model_gbm<-train(data=data_train,
                 Label~.,trControl=control_gbm,
                 method="gbm",verbose=F)

y_pred_gbm<-predict(model_gbm,data_test)
matrix_gbm<-confusionMatrix(y_pred_gbm,data_test$Label)
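
caret tunes gbm over a small default grid (number of trees, interaction depth, shrinkage and minimum node size) and keeps the combination with the best cross-validated accuracy; as an illustrative check (not part of the original output), the selected values and the tuning profile can be inspected:

model_gbm$bestTune # hyperparameters chosen by 5-fold CV
plot(model_gbm)    # CV accuracy across the tuning grid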

Support Vector Machine with a Linear Kernel

set.seed(301020)

modelo_svm<-svm(data=data_train,Label~.,
                kernel="linear",scale=T,cost=0.5)
y_pred_svm2<-predict(modelo_svm,data_test)
matrix_svm<-confusionMatrix(y_pred_svm2,data_test$Label)
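
The cost parameter was fixed at 0.5; as an optional (and computationally heavy) refinement, e1071::tune.svm can select it by cross-validation, and a radial kernel could be compared the same way. The grid below is only an example, not the one used here.

tuned_svm <- tune.svm(Label ~ ., data = data_train,
                      kernel = "linear", cost = c(0.1, 0.5, 1, 10))
summary(tuned_svm) # cross-validated error for each cost value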

Comparison of results

accuracy<-round(c(matrix_gbm$overall[[1]],
                  matrix_rf$overall[[1]],
                  matrix_svm$overall[[1]],
                  matrix_tree$overall[[1]]),4)
model_names<-c("GBM","RF","SVM","Tree")

accuracies<-data.frame(model_names,accuracy)
pander(arrange(accuracies,-accuracy))
model_names   accuracy
-----------   --------
RF              0.9237
GBM             0.9158
SVM             0.9073
Tree            0.8274
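
With a 36/64 class split, accuracy alone can be misleading; as an optional extension (not in the original output), the Kappa statistic from each confusion matrix can be added to the comparison.

kappas <- round(c(matrix_gbm$overall[["Kappa"]],
                  matrix_rf$overall[["Kappa"]],
                  matrix_svm$overall[["Kappa"]],
                  matrix_tree$overall[["Kappa"]]), 4)
pander(arrange(data.frame(model_names, accuracy, kappas), -accuracy))
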
matrix_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3596  264
##          1  635 7280
##                                           
##                Accuracy : 0.9237          
##                  95% CI : (0.9187, 0.9284)
##     No Information Rate : 0.6407          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8309          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8499          
##             Specificity : 0.9650          
##          Pos Pred Value : 0.9316          
##          Neg Pred Value : 0.9198          
##              Prevalence : 0.3593          
##          Detection Rate : 0.3054          
##    Detection Prevalence : 0.3278          
##       Balanced Accuracy : 0.9075          
##                                           
##        'Positive' Class : 0               
## 

Conclusions:

The Random Forest algorithm achieved the best accuracy (92.37% of the test observations correctly classified), followed by the GBM (91.58%), the SVM (90.73%) and the classification tree (82.74%). Its confusion matrix also shows good performance on both classes: 96.50% of commercial frames and 84.99% of non-commercial frames were identified correctly (with class 0 treated as the positive class).