Teoría

caret (Classification And REgression Training) es un paquete integral de R que reúne una amplia variedad de algoritmos para el aprendizaje automático.

Instalar paquetes y llamar librerías

{r message=FALSE, warning=FALSE}

library(ggplot2)
library(lattice)
library(caret)
library(DataExplorer)
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# Read the survey data from a CSV file into `df` and preview its first rows.
# NOTE(review): the path is absolute and tied to one machine; adjust it before
# running this script anywhere else.
ruta <- "/Users/samanthagarcia/Desktop/M1_data.csv"
df <- read.csv(file = ruta)
head(df, n = 6L)
##   trust_apple interest_computers age_computer user_pcmac appleproducts_count
## 1          No                  4            8         PC                   0
## 2         Yes                  2            4         PC                   1
## 3         Yes                  5            6         PC                   0
## 4         Yes                  2            6      Apple                   4
## 5         Yes                  4            4      Apple                   7
## 6         Yes                  3            1      Apple                   2
##   familiarity_m1 f_batterylife f_price f_size f_multitasking f_noise
## 1             No             5       4      3              4       4
## 2             No             5       5      5              3       4
## 3             No             3       4      2              4       1
## 4             No             4       3      3              4       4
## 5            Yes             5       3      3              4       4
## 6             No             5       5      4              4       5
##   f_performance f_neural f_synergy f_performanceloss m1_consideration
## 1             2        2         1                 1                1
## 2             5        2         2                 4                2
## 3             4        2         2                 2                4
## 4             4        4         4                 3                2
## 5             5        3         4                 4                4
## 6             5        5         4                 2                2
##   m1_purchase gender age_group income_group   status          domain
## 1         Yes   Male         2            2  Student         Science
## 2          No   Male         2            3 Employed         Finance
## 3         Yes   Male         2            2  Student IT & Technology
## 4          No Female         2            2  Student  Arts & Culture
## 5         Yes   Male         5            7 Employed     Hospitality
## 6          No Female         2            2  Student        Politics
# Name of the response column: the last column of `df`.
# A hard-coded assignment (target <- "m1_purchase") used to precede this line,
# but it was overwritten immediately and never took effect, so it was removed.
# NOTE(review): if "m1_purchase" is the intended response, assign it here
# explicitly instead of taking the last column.
target <- names(df)[ncol(df)]

# Store the response as a factor so its classes can be inspected and modelled.
df[[target]] <- as.factor(df[[target]])

# Show the distinct classes of the response.
levels(df[[target]])
##  [1] "Administration & Public Services" "Agriculture"                     
##  [3] "Arts & Culture"                   "Business"                        
##  [5] "Communication "                   "Consulting "                     
##  [7] "Economics"                        "Education"                       
##  [9] "Engineering"                      "Finance"                         
## [11] "Healthcare"                       "Hospitality"                     
## [13] "IT & Technology"                  "Law"                             
## [15] "Logistics"                        "Marketing"                       
## [17] "Politics"                         "Realestate"                      
## [19] "Retail"                           "Retired"                         
## [21] "Science"                          "Social Sciences"
# Count how many rows fall into each class of the response.
tabla <- table(df[[target]])

# Classes observed fewer than 5 times are considered too small to keep.
clases_pequenas <- names(tabla)[tabla < 5]

# Relabel every small class as "Other" and rebuild the factor so that its
# levels reflect the merged classes.
df[[target]] <- factor(
  replace(
    as.character(df[[target]]),
    df[[target]] %in% clases_pequenas,
    "Other"
  )
)

# Class counts after merging.
table(df[[target]])
## 
##  Arts & Culture        Business       Education     Engineering         Finance 
##               6              14               5               7               7 
##     Hospitality IT & Technology       Marketing           Other         Science 
##               6              33              21              21               7 
## Social Sciences 
##               6
# Model formula "<target> ~ .": the response against every other column.
formula_modelo <- as.formula(sprintf("%s ~ .", target))

# Per-column descriptive summary of the data.
summary(object = df)
##  trust_apple        interest_computers  age_computer    user_pcmac       
##  Length:133         Min.   :2.000      Min.   :0.000   Length:133        
##  Class :character   1st Qu.:3.000      1st Qu.:1.000   Class :character  
##  Mode  :character   Median :4.000      Median :3.000   Mode  :character  
##                     Mean   :3.812      Mean   :2.827                     
##                     3rd Qu.:5.000      3rd Qu.:5.000                     
##                     Max.   :5.000      Max.   :9.000                     
##                                                                          
##  appleproducts_count familiarity_m1     f_batterylife      f_price     
##  Min.   :0.000       Length:133         Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000       Class :character   1st Qu.:4.000   1st Qu.:3.000  
##  Median :3.000       Mode  :character   Median :5.000   Median :4.000  
##  Mean   :2.609                          Mean   :4.526   Mean   :3.872  
##  3rd Qu.:4.000                          3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :8.000                          Max.   :5.000   Max.   :5.000  
##                                                                        
##      f_size      f_multitasking    f_noise      f_performance      f_neural    
##  Min.   :1.000   Min.   :2.00   Min.   :1.000   Min.   :2.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:4.00   1st Qu.:3.000   1st Qu.:4.000   1st Qu.:2.000  
##  Median :3.000   Median :4.00   Median :4.000   Median :5.000   Median :3.000  
##  Mean   :3.158   Mean   :4.12   Mean   :3.729   Mean   :4.398   Mean   :3.165  
##  3rd Qu.:4.000   3rd Qu.:5.00   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.00   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                                
##    f_synergy     f_performanceloss m1_consideration m1_purchase       
##  Min.   :1.000   Min.   :1.000     Min.   :1.000    Length:133        
##  1st Qu.:3.000   1st Qu.:3.000     1st Qu.:3.000    Class :character  
##  Median :4.000   Median :4.000     Median :4.000    Mode  :character  
##  Mean   :3.466   Mean   :3.376     Mean   :3.609                      
##  3rd Qu.:4.000   3rd Qu.:4.000     3rd Qu.:5.000                      
##  Max.   :5.000   Max.   :5.000     Max.   :5.000                      
##                                                                       
##     gender            age_group      income_group     status         
##  Length:133         Min.   : 1.00   Min.   :1.00   Length:133        
##  Class :character   1st Qu.: 2.00   1st Qu.:1.00   Class :character  
##  Mode  :character   Median : 2.00   Median :2.00   Mode  :character  
##                     Mean   : 2.97   Mean   :2.97                     
##                     3rd Qu.: 3.00   3rd Qu.:4.00                     
##                     Max.   :10.00   Max.   :7.00                     
##                                                                      
##              domain  
##  IT & Technology:33  
##  Marketing      :21  
##  Other          :21  
##  Business       :14  
##  Engineering    : 7  
##  Finance        : 7  
##  (Other)        :30
# Compact overview of the column types and first values of `df`.
str(object = df)
## 'data.frame':    133 obs. of  22 variables:
##  $ trust_apple        : chr  "No" "Yes" "Yes" "Yes" ...
##  $ interest_computers : int  4 2 5 2 4 3 3 3 4 5 ...
##  $ age_computer       : int  8 4 6 6 4 1 2 0 2 0 ...
##  $ user_pcmac         : chr  "PC" "PC" "PC" "Apple" ...
##  $ appleproducts_count: int  0 1 0 4 7 2 7 0 6 7 ...
##  $ familiarity_m1     : chr  "No" "No" "No" "No" ...
##  $ f_batterylife      : int  5 5 3 4 5 5 4 5 4 5 ...
##  $ f_price            : int  4 5 4 3 3 5 3 5 4 3 ...
##  $ f_size             : int  3 5 2 3 3 4 4 4 3 5 ...
##  $ f_multitasking     : int  4 3 4 4 4 4 5 4 4 5 ...
##  $ f_noise            : int  4 4 1 4 4 5 5 3 4 5 ...
##  $ f_performance      : int  2 5 4 4 5 5 5 3 4 5 ...
##  $ f_neural           : int  2 2 2 4 3 5 3 2 3 3 ...
##  $ f_synergy          : int  1 2 2 4 4 4 3 2 3 5 ...
##  $ f_performanceloss  : int  1 4 2 3 4 2 2 3 4 5 ...
##  $ m1_consideration   : int  1 2 4 2 4 2 3 1 5 5 ...
##  $ m1_purchase        : chr  "Yes" "No" "Yes" "No" ...
##  $ gender             : chr  "Male" "Male" "Male" "Female" ...
##  $ age_group          : int  2 2 2 2 5 2 6 2 8 4 ...
##  $ income_group       : int  2 3 2 2 7 2 7 2 7 6 ...
##  $ status             : chr  "Student" "Employed" "Student" "Student" ...
##  $ domain             : Factor w/ 11 levels "Arts & Culture",..: 10 5 7 1 6 9 7 11 7 6 ...
# Plot the missing-value profile of every column (DataExplorer).
plot_missing(data = df)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DataExplorer package.
##   Please report the issue at
##   <https://github.com/boxuancui/DataExplorer/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Histograms of the columns of `df` (DataExplorer).
plot_histogram(df)

# Correlation plot over the whole data frame.
plot_correlation(df)

# Correlation plot restricted to the numeric columns.
# vapply() guarantees a logical mask whatever the input looks like, and
# drop = FALSE keeps `df_num` a data frame even if only one column is numeric.
df_num <- df[, vapply(df, is.numeric, logical(1)), drop = FALSE]
plot_correlation(df_num)

# Turn every character column into a factor; other columns stay untouched.
df[] <- lapply(df, function(columna) {
  if (is.character(columna)) factor(columna) else columna
})

# Remember the levels of each factor column (NULL for non-factor columns) so
# the same level sets can be re-applied to the data splits later on.
niveles_df <- lapply(df, function(columna) {
  if (!is.factor(columna)) {
    return(NULL)
  }
  levels(columna)
})


# Fix the random seed so the split below is reproducible.
set.seed(123)

# Draw the row indices of the training set: 80% of the rows, sampled on the
# response column. With list = FALSE the indices come back as a one-column
# matrix.
renglones_entrenamiento <- createDataPartition(
  y = df[[target]],
  p = 0.8,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form the test set.
entrenamiento <- df[renglones_entrenamiento[, 1], ]
prueba <- df[-renglones_entrenamiento[, 1], ]

# Give each factor column in both splits the level set recorded from the full
# data, so training and test data share identical factor levels.
for (col in names(df)[vapply(df, is.factor, logical(1))]) {
  entrenamiento[[col]] <- factor(
    entrenamiento[[col]],
    levels = niveles_df[[col]]
  )
  prueba[[col]] <- factor(
    prueba[[col]],
    levels = niveles_df[[col]]
  )
}
# Screen the training split for near-zero-variance columns and drop them from
# both splits. Only predictors are screened: the response is excluded so that
# it can never be removed by this filter.
columnas_predictoras <- setdiff(names(entrenamiento), target)

# `nzv` holds the positions, within `entrenamiento`, of the flagged columns
# (integer(0) when nothing is flagged).
nzv <- match(
  nearZeroVar(
    entrenamiento[, columnas_predictoras, drop = FALSE],
    names = TRUE
  ),
  names(entrenamiento)
)
if (length(nzv) > 0) {
  # drop = FALSE keeps both objects as data frames even if a single column
  # remains after the removal.
  entrenamiento <- entrenamiento[, -nzv, drop = FALSE]
  prueba <- prueba[, -nzv, drop = FALSE]
}

# Resampling scheme shared by all models: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)

Modelo 1. SVM Lineal

# Sanity check before modelling: print the name of the response column.
target
## [1] "domain"
# Structure of the response in the training split.
str(entrenamiento[[target]])
##  Factor w/ 11 levels "Arts & Culture",..: 10 5 7 6 7 11 6 11 7 7 ...
# Classes of the response available in the training split.
levels(entrenamiento[[target]])
##  [1] "Arts & Culture"  "Business"        "Education"       "Engineering"    
##  [5] "Finance"         "Hospitality"     "IT & Technology" "Marketing"      
##  [9] "Other"           "Science"         "Social Sciences"
# Model 1: SVM with a linear kernel. Predictors are scaled and centred, and
# the cost is fixed at C = 1, so cross-validation only estimates performance
# (the tuning grid has a single candidate).
modelo1 <- train(
  formula_modelo,
  data = entrenamiento,
  method = "svmLinear",
  preProcess = c("scale", "center"),
  trControl = ctrl,
  tuneGrid = data.frame(C = 1)
)

# Predicted classes for the training and the test split.
pred_train1 <- predict(modelo1, newdata = entrenamiento)
pred_test1 <- predict(modelo1, newdata = prueba)

# Confusion matrices: predictions against the observed response.
mcre1 <- confusionMatrix(
  data = pred_train1,
  reference = entrenamiento[[target]]
)
mcrp1 <- confusionMatrix(
  data = pred_test1,
  reference = prueba[[target]]
)

# Training-set results.
print(mcre1)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               5        0         0           0       0
##   Business                     0       10         0           0       0
##   Education                    0        0         4           0       0
##   Engineering                  0        0         0           6       0
##   Finance                      0        0         0           0       6
##   Hospitality                  0        0         0           0       0
##   IT & Technology              0        1         0           0       0
##   Marketing                    0        1         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               0         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               0         0     0       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               5               0         0     0       0
##   IT & Technology           0              26         1     2       0
##   Marketing                 0               0        15     2       1
##   Other                     0               1         1    13       0
##   Science                   0               0         0     0       5
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               0
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9091          
##                  95% CI : (0.8392, 0.9555)
##     No Information Rate : 0.2455          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8942          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        1.00000         0.83333          1.00000
## Specificity                        1.00000         1.00000          1.00000
## Pos Pred Value                     1.00000         1.00000          1.00000
## Neg Pred Value                     1.00000         0.98000          1.00000
## Prevalence                         0.04545         0.10909          0.03636
## Detection Rate                     0.04545         0.09091          0.03636
## Detection Prevalence               0.04545         0.09091          0.03636
## Balanced Accuracy                  1.00000         0.91667          1.00000
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     1.00000        1.00000            1.00000
## Specificity                     1.00000        1.00000            1.00000
## Pos Pred Value                  1.00000        1.00000            1.00000
## Neg Pred Value                  1.00000        1.00000            1.00000
## Prevalence                      0.05455        0.05455            0.04545
## Detection Rate                  0.05455        0.05455            0.04545
## Detection Prevalence            0.05455        0.05455            0.04545
## Balanced Accuracy               1.00000        1.00000            1.00000
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          0.9630           0.8824       0.7647
## Specificity                          0.9518           0.9570       0.9785
## Pos Pred Value                       0.8667           0.7895       0.8667
## Neg Pred Value                       0.9875           0.9780       0.9579
## Prevalence                           0.2455           0.1545       0.1545
## Detection Rate                       0.2364           0.1364       0.1182
## Detection Prevalence                 0.2727           0.1727       0.1364
## Balanced Accuracy                    0.9574           0.9197       0.8716
##                      Class: Science Class: Social Sciences
## Sensitivity                 0.83333                1.00000
## Specificity                 1.00000                1.00000
## Pos Pred Value              1.00000                1.00000
## Neg Pred Value              0.99048                1.00000
## Prevalence                  0.05455                0.04545
## Detection Rate              0.04545                0.04545
## Detection Prevalence        0.04545                0.04545
## Balanced Accuracy           0.91667                1.00000
# Test-set results for model 1.
print(mcrp1)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               0        0         0           0       1
##   Business                     1        0         0           0       0
##   Education                    0        0         0           0       0
##   Engineering                  0        0         0           0       0
##   Finance                      0        0         0           0       0
##   Hospitality                  0        0         0           0       0
##   IT & Technology              0        1         0           1       0
##   Marketing                    0        0         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        1         1           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               1         0     0       0
##   Business                  0               0         0     2       0
##   Education                 0               0         0     0       0
##   Engineering               0               0         1     1       0
##   Finance                   0               0         0     0       0
##   Hospitality               1               1         0     0       0
##   IT & Technology           0               3         1     0       0
##   Marketing                 0               1         2     0       1
##   Other                     0               0         0     1       0
##   Science                   0               0         0     0       0
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      1
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               0
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3043          
##                  95% CI : (0.1321, 0.5292)
##     No Information Rate : 0.2609          
##     P-Value [Acc > NIR] : 0.3925          
##                                           
##                   Kappa : 0.1947          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        0.00000         0.00000          0.00000
## Specificity                        0.90909         0.80952          1.00000
## Pos Pred Value                     0.00000         0.00000              NaN
## Neg Pred Value                     0.95238         0.89474          0.95652
## Prevalence                         0.04348         0.08696          0.04348
## Detection Rate                     0.00000         0.00000          0.00000
## Detection Prevalence               0.08696         0.17391          0.00000
## Balanced Accuracy                  0.45455         0.40476          0.50000
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     0.00000        0.00000            1.00000
## Specificity                     0.90909        1.00000            0.95455
## Pos Pred Value                  0.00000            NaN            0.50000
## Neg Pred Value                  0.95238        0.95652            1.00000
## Prevalence                      0.04348        0.04348            0.04348
## Detection Rate                  0.00000        0.00000            0.04348
## Detection Prevalence            0.08696        0.00000            0.08696
## Balanced Accuracy               0.45455        0.50000            0.97727
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          0.5000          0.50000      0.25000
## Specificity                          0.8235          0.89474      1.00000
## Pos Pred Value                       0.5000          0.50000      1.00000
## Neg Pred Value                       0.8235          0.89474      0.86364
## Prevalence                           0.2609          0.17391      0.17391
## Detection Rate                       0.1304          0.08696      0.04348
## Detection Prevalence                 0.2609          0.17391      0.04348
## Balanced Accuracy                    0.6618          0.69737      0.62500
##                      Class: Science Class: Social Sciences
## Sensitivity                 0.00000                0.00000
## Specificity                 0.90909                1.00000
## Pos Pred Value              0.00000                    NaN
## Neg Pred Value              0.95238                0.95652
## Prevalence                  0.04348                0.04348
## Detection Rate              0.00000                0.00000
## Detection Prevalence        0.08696                0.00000
## Balanced Accuracy           0.45455                0.50000

Modelo 2. SVM Radial

# Model 2: SVM with a radial kernel. Predictors are scaled and centred; the
# kernel width (sigma = 0.1) and the cost (C = 1) are fixed, so the tuning
# grid has a single candidate.
modelo2 <- train(
  formula_modelo,
  data = entrenamiento,
  method = "svmRadial",
  preProcess = c("scale", "center"),
  trControl = ctrl,
  tuneGrid = data.frame(sigma = 0.1, C = 1)
)

# Predicted classes for the training and the test split.
resultado_entrenamiento2 <- predict(modelo2, newdata = entrenamiento)
resultado_prueba2 <- predict(modelo2, newdata = prueba)

# Training-set confusion matrix: predictions against the observed response.
mcre2 <- confusionMatrix(
  data = resultado_entrenamiento2,
  reference = entrenamiento[[target]]
)
print(mcre2)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               4        0         0           0       0
##   Business                     0       10         0           0       0
##   Education                    0        0         4           0       0
##   Engineering                  0        0         0           5       0
##   Finance                      0        0         0           0       6
##   Hospitality                  0        0         0           0       0
##   IT & Technology              1        2         0           1       0
##   Marketing                    0        0         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               0         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               0         0     0       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               5               0         0     0       0
##   IT & Technology           0              27         0     0       1
##   Marketing                 0               0        17     0       0
##   Other                     0               0         0    17       1
##   Science                   0               0         0     0       4
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               0
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9455          
##                  95% CI : (0.8851, 0.9797)
##     No Information Rate : 0.2455          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9362          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        0.80000         0.83333          1.00000
## Specificity                        1.00000         1.00000          1.00000
## Pos Pred Value                     1.00000         1.00000          1.00000
## Neg Pred Value                     0.99057         0.98000          1.00000
## Prevalence                         0.04545         0.10909          0.03636
## Detection Rate                     0.03636         0.09091          0.03636
## Detection Prevalence               0.03636         0.09091          0.03636
## Balanced Accuracy                  0.90000         0.91667          1.00000
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     0.83333        1.00000            1.00000
## Specificity                     1.00000        1.00000            1.00000
## Pos Pred Value                  1.00000        1.00000            1.00000
## Neg Pred Value                  0.99048        1.00000            1.00000
## Prevalence                      0.05455        0.05455            0.04545
## Detection Rate                  0.04545        0.05455            0.04545
## Detection Prevalence            0.04545        0.05455            0.04545
## Balanced Accuracy               0.91667        1.00000            1.00000
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          1.0000           1.0000       1.0000
## Specificity                          0.9398           1.0000       0.9892
## Pos Pred Value                       0.8438           1.0000       0.9444
## Neg Pred Value                       1.0000           1.0000       1.0000
## Prevalence                           0.2455           0.1545       0.1545
## Detection Rate                       0.2455           0.1545       0.1545
## Detection Prevalence                 0.2909           0.1545       0.1636
## Balanced Accuracy                    0.9699           1.0000       0.9946
##                      Class: Science Class: Social Sciences
## Sensitivity                 0.66667                1.00000
## Specificity                 1.00000                1.00000
## Pos Pred Value              1.00000                1.00000
## Neg Pred Value              0.98113                1.00000
## Prevalence                  0.05455                0.04545
## Detection Rate              0.03636                0.04545
## Detection Prevalence        0.03636                0.04545
## Balanced Accuracy           0.83333                1.00000
# Test-set confusion matrix for model 2: predictions against the observed
# response of the held-out rows.
mcrp2 <- confusionMatrix(
  data = resultado_prueba2,
  reference = prueba[[target]]
)
print(mcrp2)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               0        0         0           0       0
##   Business                     0        0         0           0       0
##   Education                    0        0         0           0       0
##   Engineering                  0        0         0           0       0
##   Finance                      0        0         0           0       0
##   Hospitality                  0        0         0           0       0
##   IT & Technology              0        1         1           1       1
##   Marketing                    1        1         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               0         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               0         0     0       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               1               0         0     0       0
##   IT & Technology           0               6         1     4       1
##   Marketing                 0               0         3     0       0
##   Other                     0               0         0     0       0
##   Science                   0               0         0     0       0
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               1
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4348          
##                  95% CI : (0.2319, 0.6551)
##     No Information Rate : 0.2609          
##     P-Value [Acc > NIR] : 0.05323         
##                                           
##                   Kappa : 0.2635          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        0.00000         0.00000          0.00000
## Specificity                        1.00000         1.00000          1.00000
## Pos Pred Value                         NaN             NaN              NaN
## Neg Pred Value                     0.95652         0.91304          0.95652
## Prevalence                         0.04348         0.08696          0.04348
## Detection Rate                     0.00000         0.00000          0.00000
## Detection Prevalence               0.00000         0.00000          0.00000
## Balanced Accuracy                  0.50000         0.50000          0.50000
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     0.00000        0.00000            1.00000
## Specificity                     1.00000        1.00000            1.00000
## Pos Pred Value                      NaN            NaN            1.00000
## Neg Pred Value                  0.95652        0.95652            1.00000
## Prevalence                      0.04348        0.04348            0.04348
## Detection Rate                  0.00000        0.00000            0.04348
## Detection Prevalence            0.00000        0.00000            0.04348
## Balanced Accuracy               0.50000        0.50000            1.00000
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          1.0000           0.7500       0.0000
## Specificity                          0.3529           0.8947       1.0000
## Pos Pred Value                       0.3529           0.6000          NaN
## Neg Pred Value                       1.0000           0.9444       0.8261
## Prevalence                           0.2609           0.1739       0.1739
## Detection Rate                       0.2609           0.1304       0.0000
## Detection Prevalence                 0.7391           0.2174       0.0000
## Balanced Accuracy                    0.6765           0.8224       0.5000
##                      Class: Science Class: Social Sciences
## Sensitivity                 0.00000                0.00000
## Specificity                 1.00000                1.00000
## Pos Pred Value                  NaN                    NaN
## Neg Pred Value              0.95652                0.95652
## Prevalence                  0.04348                0.04348
## Detection Rate              0.00000                0.00000
## Detection Prevalence        0.00000                0.00000
## Balanced Accuracy           0.50000                0.50000

Modelo 3. SVM Polinómico

# Model 3: support vector machine with a polynomial kernel ("svmPoly").
# Predictors are scaled and centered before fitting; the resampling setup is
# taken from `ctrl`, which is defined outside this section.
# The one-row tuneGrid pins the hyperparameters (degree = 2, scale = 1,
# C = 1), so only this single candidate is evaluated.
modelo3 <- train(
  formula_modelo,
  data = entrenamiento,
  method = "svmPoly",
  tuneGrid = data.frame(degree = 2, scale = 1, C = 1),
  trControl = ctrl,
  preProcess = c("scale", "center")
)

# Predictions of model 3 on the training set and on the test set.
resultado_entrenamiento3 <- predict(modelo3, newdata = entrenamiento)
resultado_prueba3 <- predict(modelo3, newdata = prueba)

# Confusion matrix on the training data: predictions vs. the observed
# values of the `target` column.
mcre3 <- confusionMatrix(
  data = resultado_entrenamiento3,
  reference = entrenamiento[[target]]
)
mcre3
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               5        0         0           0       0
##   Business                     0       12         0           0       0
##   Education                    0        0         4           0       0
##   Engineering                  0        0         0           6       0
##   Finance                      0        0         0           0       6
##   Hospitality                  0        0         0           0       0
##   IT & Technology              0        0         0           0       0
##   Marketing                    0        0         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               0         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               0         0     0       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               5               0         0     0       0
##   IT & Technology           0              27         0     0       0
##   Marketing                 0               0        17     0       0
##   Other                     0               0         0    17       0
##   Science                   0               0         0     0       6
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               0
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               5
## 
## Overall Statistics
##                                     
##                Accuracy : 1         
##                  95% CI : (0.967, 1)
##     No Information Rate : 0.2455    
##     P-Value [Acc > NIR] : < 2.2e-16 
##                                     
##                   Kappa : 1         
##                                     
##  Mcnemar's Test P-Value : NA        
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        1.00000          1.0000          1.00000
## Specificity                        1.00000          1.0000          1.00000
## Pos Pred Value                     1.00000          1.0000          1.00000
## Neg Pred Value                     1.00000          1.0000          1.00000
## Prevalence                         0.04545          0.1091          0.03636
## Detection Rate                     0.04545          0.1091          0.03636
## Detection Prevalence               0.04545          0.1091          0.03636
## Balanced Accuracy                  1.00000          1.0000          1.00000
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     1.00000        1.00000            1.00000
## Specificity                     1.00000        1.00000            1.00000
## Pos Pred Value                  1.00000        1.00000            1.00000
## Neg Pred Value                  1.00000        1.00000            1.00000
## Prevalence                      0.05455        0.05455            0.04545
## Detection Rate                  0.05455        0.05455            0.04545
## Detection Prevalence            0.05455        0.05455            0.04545
## Balanced Accuracy               1.00000        1.00000            1.00000
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          1.0000           1.0000       1.0000
## Specificity                          1.0000           1.0000       1.0000
## Pos Pred Value                       1.0000           1.0000       1.0000
## Neg Pred Value                       1.0000           1.0000       1.0000
## Prevalence                           0.2455           0.1545       0.1545
## Detection Rate                       0.2455           0.1545       0.1545
## Detection Prevalence                 0.2455           0.1545       0.1545
## Balanced Accuracy                    1.0000           1.0000       1.0000
##                      Class: Science Class: Social Sciences
## Sensitivity                 1.00000                1.00000
## Specificity                 1.00000                1.00000
## Pos Pred Value              1.00000                1.00000
## Neg Pred Value              1.00000                1.00000
## Prevalence                  0.05455                0.04545
## Detection Rate              0.05455                0.04545
## Detection Prevalence        0.05455                0.04545
## Balanced Accuracy           1.00000                1.00000
# Confusion matrix of model 3 on the test data: predictions vs. the
# observed values of the `target` column.
mcrp3 <- confusionMatrix(
  data = resultado_prueba3,
  reference = prueba[[target]]
)
mcrp3
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               0        0         0           0       1
##   Business                     1        0         0           0       0
##   Education                    0        0         0           0       0
##   Engineering                  0        0         0           0       0
##   Finance                      0        0         0           0       0
##   Hospitality                  0        0         0           0       0
##   IT & Technology              0        1         1           1       0
##   Marketing                    0        1         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               1         0     0       0
##   Business                  0               1         0     0       0
##   Education                 0               0         0     1       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               1               1         0     0       0
##   IT & Technology           0               2         0     0       0
##   Marketing                 0               0         4     0       1
##   Other                     0               1         0     2       0
##   Science                   0               0         0     0       0
##   Social Sciences           0               0         0     1       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      1
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               0
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3913          
##                  95% CI : (0.1971, 0.6146)
##     No Information Rate : 0.2609          
##     P-Value [Acc > NIR] : 0.1196          
##                                           
##                   Kappa : 0.286           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        0.00000         0.00000          0.00000
## Specificity                        0.90909         0.85714          0.95455
## Pos Pred Value                     0.00000         0.00000          0.00000
## Neg Pred Value                     0.95238         0.90000          0.95455
## Prevalence                         0.04348         0.08696          0.04348
## Detection Rate                     0.00000         0.00000          0.00000
## Detection Prevalence               0.08696         0.13043          0.04348
## Balanced Accuracy                  0.45455         0.42857          0.47727
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     0.00000        0.00000            1.00000
## Specificity                     1.00000        1.00000            0.95455
## Pos Pred Value                      NaN            NaN            0.50000
## Neg Pred Value                  0.95652        0.95652            1.00000
## Prevalence                      0.04348        0.04348            0.04348
## Detection Rate                  0.00000        0.00000            0.04348
## Detection Prevalence            0.00000        0.00000            0.08696
## Balanced Accuracy               0.50000        0.50000            0.97727
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                         0.33333           1.0000      0.50000
## Specificity                         0.82353           0.8947      0.94737
## Pos Pred Value                      0.40000           0.6667      0.66667
## Neg Pred Value                      0.77778           1.0000      0.90000
## Prevalence                          0.26087           0.1739      0.17391
## Detection Rate                      0.08696           0.1739      0.08696
## Detection Prevalence                0.21739           0.2609      0.13043
## Balanced Accuracy                   0.57843           0.9474      0.72368
##                      Class: Science Class: Social Sciences
## Sensitivity                 0.00000                0.00000
## Specificity                 1.00000                0.95455
## Pos Pred Value                  NaN                0.00000
## Neg Pred Value              0.95652                0.95455
## Prevalence                  0.04348                0.04348
## Detection Rate              0.00000                0.00000
## Detection Prevalence        0.00000                0.04348
## Balanced Accuracy           0.50000                0.47727

Modelo 4. Árbol de Decisión

# Model 4: decision tree fitted through caret's "rpart" method.
# Predictors are scaled and centered before fitting; the resampling setup is
# taken from `ctrl`, which is defined outside this section.
# tuneLength = 10 asks caret to try 10 candidate settings of the method's
# tuning parameter instead of a hand-written grid.
modelo4 <- train(
  formula_modelo,
  data = entrenamiento,
  method = "rpart",
  tuneLength = 10,
  trControl = ctrl,
  preProcess = c("scale", "center")
)

# Predictions of model 4 on the training set and on the test set.
resultado_entrenamiento4 <- predict(modelo4, newdata = entrenamiento)
resultado_prueba4 <- predict(modelo4, newdata = prueba)

# Confusion matrix on the training data: predictions vs. the observed
# values of the `target` column.
mcre4 <- confusionMatrix(
  data = resultado_entrenamiento4,
  reference = entrenamiento[[target]]
)
mcre4
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               0        0         0           0       0
##   Business                     0        0         0           0       0
##   Education                    0        0         0           0       0
##   Engineering                  0        0         0           0       0
##   Finance                      0        0         0           0       0
##   Hospitality                  0        0         0           0       0
##   IT & Technology              3        7         3           4       2
##   Marketing                    2        5         1           2       4
##   Other                        0        0         0           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               0         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               0         0     0       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               0               0         0     0       0
##   IT & Technology           3              24         8    14       2
##   Marketing                 2               3         9     3       4
##   Other                     0               0         0     0       0
##   Science                   0               0         0     0       0
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               2
##   Marketing                     3
##   Other                         0
##   Science                       0
##   Social Sciences               0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3             
##                  95% CI : (0.2163, 0.3948)
##     No Information Rate : 0.2455          
##     P-Value [Acc > NIR] : 0.113           
##                                           
##                   Kappa : 0.1094          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        0.00000          0.0000          0.00000
## Specificity                        1.00000          1.0000          1.00000
## Pos Pred Value                         NaN             NaN              NaN
## Neg Pred Value                     0.95455          0.8909          0.96364
## Prevalence                         0.04545          0.1091          0.03636
## Detection Rate                     0.00000          0.0000          0.00000
## Detection Prevalence               0.00000          0.0000          0.00000
## Balanced Accuracy                  0.50000          0.5000          0.50000
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     0.00000        0.00000            0.00000
## Specificity                     1.00000        1.00000            1.00000
## Pos Pred Value                      NaN            NaN                NaN
## Neg Pred Value                  0.94545        0.94545            0.95455
## Prevalence                      0.05455        0.05455            0.04545
## Detection Rate                  0.00000        0.00000            0.00000
## Detection Prevalence            0.00000        0.00000            0.00000
## Balanced Accuracy               0.50000        0.50000            0.50000
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          0.8889          0.52941       0.0000
## Specificity                          0.4217          0.68817       1.0000
## Pos Pred Value                       0.3333          0.23684          NaN
## Neg Pred Value                       0.9211          0.88889       0.8455
## Prevalence                           0.2455          0.15455       0.1545
## Detection Rate                       0.2182          0.08182       0.0000
## Detection Prevalence                 0.6545          0.34545       0.0000
## Balanced Accuracy                    0.6553          0.60879       0.5000
##                      Class: Science Class: Social Sciences
## Sensitivity                 0.00000                0.00000
## Specificity                 1.00000                1.00000
## Pos Pred Value                  NaN                    NaN
## Neg Pred Value              0.94545                0.95455
## Prevalence                  0.05455                0.04545
## Detection Rate              0.00000                0.00000
## Detection Prevalence        0.00000                0.00000
## Balanced Accuracy           0.50000                0.50000
# Confusion matrix of model 4 on the test data: predictions vs. the
# observed values of the `target` column.
mcrp4 <- confusionMatrix(
  data = resultado_prueba4,
  reference = prueba[[target]]
)
mcrp4
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               0        0         0           0       0
##   Business                     0        0         0           0       0
##   Education                    0        0         0           0       0
##   Engineering                  0        0         0           0       0
##   Finance                      0        0         0           0       0
##   Hospitality                  0        0         0           0       0
##   IT & Technology              0        1         1           1       1
##   Marketing                    1        1         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               0         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               0         0     0       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               0               0         0     0       0
##   IT & Technology           1               6         2     2       0
##   Marketing                 0               0         2     2       1
##   Other                     0               0         0     0       0
##   Science                   0               0         0     0       0
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               1
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3478          
##                  95% CI : (0.1638, 0.5727)
##     No Information Rate : 0.2609          
##     P-Value [Acc > NIR] : 0.2325          
##                                           
##                   Kappa : 0.1481          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        0.00000         0.00000          0.00000
## Specificity                        1.00000         1.00000          1.00000
## Pos Pred Value                         NaN             NaN              NaN
## Neg Pred Value                     0.95652         0.91304          0.95652
## Prevalence                         0.04348         0.08696          0.04348
## Detection Rate                     0.00000         0.00000          0.00000
## Detection Prevalence               0.00000         0.00000          0.00000
## Balanced Accuracy                  0.50000         0.50000          0.50000
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     0.00000        0.00000            0.00000
## Specificity                     1.00000        1.00000            1.00000
## Pos Pred Value                      NaN            NaN                NaN
## Neg Pred Value                  0.95652        0.95652            0.95652
## Prevalence                      0.04348        0.04348            0.04348
## Detection Rate                  0.00000        0.00000            0.00000
## Detection Prevalence            0.00000        0.00000            0.00000
## Balanced Accuracy               0.50000        0.50000            0.50000
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          1.0000          0.50000       0.0000
## Specificity                          0.4118          0.73684       1.0000
## Pos Pred Value                       0.3750          0.28571          NaN
## Neg Pred Value                       1.0000          0.87500       0.8261
## Prevalence                           0.2609          0.17391       0.1739
## Detection Rate                       0.2609          0.08696       0.0000
## Detection Prevalence                 0.6957          0.30435       0.0000
## Balanced Accuracy                    0.7059          0.61842       0.5000
##                      Class: Science Class: Social Sciences
## Sensitivity                 0.00000                0.00000
## Specificity                 1.00000                1.00000
## Pos Pred Value                  NaN                    NaN
## Neg Pred Value              0.95652                0.95652
## Prevalence                  0.04348                0.04348
## Detection Rate              0.00000                0.00000
## Detection Prevalence        0.00000                0.00000
## Balanced Accuracy           0.50000                0.50000

Modelo 5. Redes Neuronales

# Model 5: neural network fitted through caret's "nnet" method.
# Predictors are scaled and centered before fitting; the resampling setup is
# taken from `ctrl`, which is defined outside this section.
# tuneLength = 5 lets caret build the candidate grid itself (5 values per
# tuning parameter); trace = FALSE is forwarded to the underlying fitting
# routine to silence its iteration log.
modelo5 <- train(
  formula_modelo,
  data = entrenamiento,
  method = "nnet",
  tuneLength = 5,
  trControl = ctrl,
  preProcess = c("scale", "center"),
  trace = FALSE
)

# Predictions of model 5 on the training set and on the test set.
resultado_entrenamiento5 <- predict(modelo5, newdata = entrenamiento)
resultado_prueba5 <- predict(modelo5, newdata = prueba)

# Confusion matrix on the training data: predictions vs. the observed
# values of the `target` column.
mcre5 <- confusionMatrix(
  data = resultado_entrenamiento5,
  reference = entrenamiento[[target]]
)
mcre5
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               5        0         0           0       0
##   Business                     0       12         0           0       0
##   Education                    0        0         4           0       0
##   Engineering                  0        0         0           6       0
##   Finance                      0        0         0           0       6
##   Hospitality                  0        0         0           0       0
##   IT & Technology              0        0         0           0       0
##   Marketing                    0        0         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               0         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               0         0     0       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               5               0         0     0       0
##   IT & Technology           0              27         0     0       0
##   Marketing                 0               0        17     0       0
##   Other                     0               0         0    17       0
##   Science                   0               0         0     0       6
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               0
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               5
## 
## Overall Statistics
##                                     
##                Accuracy : 1         
##                  95% CI : (0.967, 1)
##     No Information Rate : 0.2455    
##     P-Value [Acc > NIR] : < 2.2e-16 
##                                     
##                   Kappa : 1         
##                                     
##  Mcnemar's Test P-Value : NA        
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        1.00000          1.0000          1.00000
## Specificity                        1.00000          1.0000          1.00000
## Pos Pred Value                     1.00000          1.0000          1.00000
## Neg Pred Value                     1.00000          1.0000          1.00000
## Prevalence                         0.04545          0.1091          0.03636
## Detection Rate                     0.04545          0.1091          0.03636
## Detection Prevalence               0.04545          0.1091          0.03636
## Balanced Accuracy                  1.00000          1.0000          1.00000
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     1.00000        1.00000            1.00000
## Specificity                     1.00000        1.00000            1.00000
## Pos Pred Value                  1.00000        1.00000            1.00000
## Neg Pred Value                  1.00000        1.00000            1.00000
## Prevalence                      0.05455        0.05455            0.04545
## Detection Rate                  0.05455        0.05455            0.04545
## Detection Prevalence            0.05455        0.05455            0.04545
## Balanced Accuracy               1.00000        1.00000            1.00000
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          1.0000           1.0000       1.0000
## Specificity                          1.0000           1.0000       1.0000
## Pos Pred Value                       1.0000           1.0000       1.0000
## Neg Pred Value                       1.0000           1.0000       1.0000
## Prevalence                           0.2455           0.1545       0.1545
## Detection Rate                       0.2455           0.1545       0.1545
## Detection Prevalence                 0.2455           0.1545       0.1545
## Balanced Accuracy                    1.0000           1.0000       1.0000
##                      Class: Science Class: Social Sciences
## Sensitivity                 1.00000                1.00000
## Specificity                 1.00000                1.00000
## Pos Pred Value              1.00000                1.00000
## Neg Pred Value              1.00000                1.00000
## Prevalence                  0.05455                0.04545
## Detection Rate              0.05455                0.04545
## Detection Prevalence        0.05455                0.04545
## Balanced Accuracy           1.00000                1.00000
# Confusion matrix of model 5 on the test data: predictions vs. the
# observed values of the `target` column.
mcrp5 <- confusionMatrix(
  data = resultado_prueba5,
  reference = prueba[[target]]
)
mcrp5
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               0        0         0           0       1
##   Business                     0        1         0           0       0
##   Education                    0        0         0           0       0
##   Engineering                  0        0         0           0       0
##   Finance                      0        0         0           0       0
##   Hospitality                  0        1         0           0       0
##   IT & Technology              1        0         0           1       0
##   Marketing                    0        0         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        0         1           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               0         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               1         0     0       0
##   Engineering               0               1         0     0       0
##   Finance                   0               0         1     0       1
##   Hospitality               1               0         0     0       0
##   IT & Technology           0               2         1     0       0
##   Marketing                 0               0         2     1       0
##   Other                     0               1         0     3       0
##   Science                   0               1         0     0       0
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       1
##   Hospitality                   0
##   IT & Technology               0
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3913          
##                  95% CI : (0.1971, 0.6146)
##     No Information Rate : 0.2609          
##     P-Value [Acc > NIR] : 0.1196          
##                                           
##                   Kappa : 0.2985          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        0.00000         0.50000          0.00000
## Specificity                        0.95455         1.00000          0.95455
## Pos Pred Value                     0.00000         1.00000          0.00000
## Neg Pred Value                     0.95455         0.95455          0.95455
## Prevalence                         0.04348         0.08696          0.04348
## Detection Rate                     0.00000         0.04348          0.00000
## Detection Prevalence               0.04348         0.04348          0.04348
## Balanced Accuracy                  0.47727         0.75000          0.47727
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     0.00000        0.00000            1.00000
## Specificity                     0.95455        0.86364            0.95455
## Pos Pred Value                  0.00000        0.00000            0.50000
## Neg Pred Value                  0.95455        0.95000            1.00000
## Prevalence                      0.04348        0.04348            0.04348
## Detection Rate                  0.00000        0.00000            0.04348
## Detection Prevalence            0.04348        0.13043            0.08696
## Balanced Accuracy               0.47727        0.43182            0.97727
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                         0.33333          0.50000       0.7500
## Specificity                         0.82353          0.94737       0.9474
## Pos Pred Value                      0.40000          0.66667       0.7500
## Neg Pred Value                      0.77778          0.90000       0.9474
## Prevalence                          0.26087          0.17391       0.1739
## Detection Rate                      0.08696          0.08696       0.1304
## Detection Prevalence                0.21739          0.13043       0.1739
## Balanced Accuracy                   0.57843          0.72368       0.8487
##                      Class: Science Class: Social Sciences
## Sensitivity                 0.00000                0.00000
## Specificity                 0.90909                1.00000
## Pos Pred Value              0.00000                    NaN
## Neg Pred Value              0.95238                0.95652
## Prevalence                  0.04348                0.04348
## Detection Rate              0.00000                0.00000
## Detection Prevalence        0.08696                0.00000
## Balanced Accuracy           0.45455                0.50000

Modelo 6. Bosques Aleatorios

# Model 6: random forest fitted through caret::train().
# The candidate values for mtry are 2, 4 and 6; predictors are scaled and
# centered, and resampling follows `ctrl` (defined earlier in the file).
modelo6 <- train(
  formula_modelo,
  data = entrenamiento,
  method = "rf",
  tuneGrid = expand.grid(mtry = c(2, 4, 6)),
  trControl = ctrl,
  preProcess = c("scale", "center")
)

# Class predictions from the fitted model on each partition.
resultado_prueba6 <- predict(modelo6, newdata = prueba)
resultado_entrenamiento6 <- predict(modelo6, newdata = entrenamiento)

# Confusion matrix for model 6 on the training partition: predicted classes
# against the observed values of the `target` column.
mcre6 <- confusionMatrix(
  data = resultado_entrenamiento6,
  reference = entrenamiento[[target]]
)
print(mcre6)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               5        0         0           0       0
##   Business                     0       12         0           0       0
##   Education                    0        0         4           0       0
##   Engineering                  0        0         0           6       0
##   Finance                      0        0         0           0       6
##   Hospitality                  0        0         0           0       0
##   IT & Technology              0        0         0           0       0
##   Marketing                    0        0         0           0       0
##   Other                        0        0         0           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               0         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               0         0     0       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               5               0         0     0       0
##   IT & Technology           0              27         0     0       0
##   Marketing                 0               0        17     0       0
##   Other                     0               0         0    17       0
##   Science                   0               0         0     0       6
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               0
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               5
## 
## Overall Statistics
##                                     
##                Accuracy : 1         
##                  95% CI : (0.967, 1)
##     No Information Rate : 0.2455    
##     P-Value [Acc > NIR] : < 2.2e-16 
##                                     
##                   Kappa : 1         
##                                     
##  Mcnemar's Test P-Value : NA        
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        1.00000          1.0000          1.00000
## Specificity                        1.00000          1.0000          1.00000
## Pos Pred Value                     1.00000          1.0000          1.00000
## Neg Pred Value                     1.00000          1.0000          1.00000
## Prevalence                         0.04545          0.1091          0.03636
## Detection Rate                     0.04545          0.1091          0.03636
## Detection Prevalence               0.04545          0.1091          0.03636
## Balanced Accuracy                  1.00000          1.0000          1.00000
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     1.00000        1.00000            1.00000
## Specificity                     1.00000        1.00000            1.00000
## Pos Pred Value                  1.00000        1.00000            1.00000
## Neg Pred Value                  1.00000        1.00000            1.00000
## Prevalence                      0.05455        0.05455            0.04545
## Detection Rate                  0.05455        0.05455            0.04545
## Detection Prevalence            0.05455        0.05455            0.04545
## Balanced Accuracy               1.00000        1.00000            1.00000
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          1.0000           1.0000       1.0000
## Specificity                          1.0000           1.0000       1.0000
## Pos Pred Value                       1.0000           1.0000       1.0000
## Neg Pred Value                       1.0000           1.0000       1.0000
## Prevalence                           0.2455           0.1545       0.1545
## Detection Rate                       0.2455           0.1545       0.1545
## Detection Prevalence                 0.2455           0.1545       0.1545
## Balanced Accuracy                    1.0000           1.0000       1.0000
##                      Class: Science Class: Social Sciences
## Sensitivity                 1.00000                1.00000
## Specificity                 1.00000                1.00000
## Pos Pred Value              1.00000                1.00000
## Neg Pred Value              1.00000                1.00000
## Prevalence                  0.05455                0.04545
## Detection Rate              0.05455                0.04545
## Detection Prevalence        0.05455                0.04545
## Balanced Accuracy           1.00000                1.00000
# Confusion matrix for model 6 on the test partition: predicted classes
# against the observed values of the `target` column.
mcrp6 <- confusionMatrix(
  data = resultado_prueba6,
  reference = prueba[[target]]
)
print(mcrp6)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Arts & Culture Business Education Engineering Finance
##   Arts & Culture               0        0         0           0       1
##   Business                     0        0         0           0       0
##   Education                    0        0         0           0       0
##   Engineering                  0        0         0           0       0
##   Finance                      0        0         0           0       0
##   Hospitality                  0        0         0           0       0
##   IT & Technology              0        1         0           1       0
##   Marketing                    0        1         0           0       0
##   Other                        1        0         1           0       0
##   Science                      0        0         0           0       0
##   Social Sciences              0        0         0           0       0
##                  Reference
## Prediction        Hospitality IT & Technology Marketing Other Science
##   Arts & Culture            0               1         0     0       0
##   Business                  0               0         0     0       0
##   Education                 0               0         0     1       0
##   Engineering               0               0         0     0       0
##   Finance                   0               0         0     0       0
##   Hospitality               1               1         0     1       0
##   IT & Technology           0               4         2     1       0
##   Marketing                 0               0         2     1       0
##   Other                     0               0         0     0       0
##   Science                   0               0         0     0       1
##   Social Sciences           0               0         0     0       0
##                  Reference
## Prediction        Social Sciences
##   Arts & Culture                0
##   Business                      0
##   Education                     0
##   Engineering                   0
##   Finance                       0
##   Hospitality                   0
##   IT & Technology               1
##   Marketing                     0
##   Other                         0
##   Science                       0
##   Social Sciences               0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3478          
##                  95% CI : (0.1638, 0.5727)
##     No Information Rate : 0.2609          
##     P-Value [Acc > NIR] : 0.2325          
##                                           
##                   Kappa : 0.2123          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Arts & Culture Class: Business Class: Education
## Sensitivity                        0.00000         0.00000          0.00000
## Specificity                        0.90909         1.00000          0.95455
## Pos Pred Value                     0.00000             NaN          0.00000
## Neg Pred Value                     0.95238         0.91304          0.95455
## Prevalence                         0.04348         0.08696          0.04348
## Detection Rate                     0.00000         0.00000          0.00000
## Detection Prevalence               0.08696         0.00000          0.04348
## Balanced Accuracy                  0.45455         0.50000          0.47727
##                      Class: Engineering Class: Finance Class: Hospitality
## Sensitivity                     0.00000        0.00000            1.00000
## Specificity                     1.00000        1.00000            0.90909
## Pos Pred Value                      NaN            NaN            0.33333
## Neg Pred Value                  0.95652        0.95652            1.00000
## Prevalence                      0.04348        0.04348            0.04348
## Detection Rate                  0.00000        0.00000            0.04348
## Detection Prevalence            0.00000        0.00000            0.13043
## Balanced Accuracy               0.50000        0.50000            0.95455
##                      Class: IT & Technology Class: Marketing Class: Other
## Sensitivity                          0.6667          0.50000      0.00000
## Specificity                          0.6471          0.89474      0.89474
## Pos Pred Value                       0.4000          0.50000      0.00000
## Neg Pred Value                       0.8462          0.89474      0.80952
## Prevalence                           0.2609          0.17391      0.17391
## Detection Rate                       0.1739          0.08696      0.00000
## Detection Prevalence                 0.4348          0.17391      0.08696
## Balanced Accuracy                    0.6569          0.69737      0.44737
##                      Class: Science Class: Social Sciences
## Sensitivity                 1.00000                0.00000
## Specificity                 1.00000                1.00000
## Pos Pred Value              1.00000                    NaN
## Neg Pred Value              1.00000                0.95652
## Prevalence                  0.04348                0.04348
## Detection Rate              0.04348                0.00000
## Detection Prevalence        0.04348                0.00000
## Balanced Accuracy           1.00000                0.50000
# Frequency of each predicted class in `pred_test1`.
# NOTE(review): `pred_test1` is defined elsewhere and does not follow the
# `resultado_prueba6` naming used for model 6 -- confirm which model's
# predictions are meant to be tabulated here.
print(table(pred_test1))
## pred_test1
##  Arts & Culture        Business       Education     Engineering         Finance 
##               2               4               0               2               0 
##     Hospitality IT & Technology       Marketing           Other         Science 
##               2               6               4               1               2 
## Social Sciences 
##               0