Random forest para extraer las variables mas importantes dentro de un conjunto de datos.

library("readxl")
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(reshape)
## Warning: package 'reshape' was built under R version 4.0.3
my_data <- read_excel("data_e.xlsx", sheet = 2, col_names = TRUE, col_types = NULL, na = "", skip = 0)
head(my_data, 10)
## # A tibble: 10 x 32
##    SOC_TCH SOC_Modelado Aspect    BSI   EVI Ecosystm Elevation    IBI Litologia
##      <dbl>        <dbl>  <dbl>  <dbl> <dbl>    <dbl>     <dbl>  <dbl>     <dbl>
##  1   395.         310.   328.  NA     NA          NA      2.25 NA            NA
##  2   120.         295.   354.  NA     NA          NA   1171.   NA            NA
##  3   548.         438.   302.  -0.272  2.41        0      7.83 -0.251         0
##  4    57.2         83.3  353.  -0.367  2.62        0      1.04 -0.326         0
##  5   226.         271.    69.9 -0.534  2.56        0     11.7  -0.498         0
##  6   448.         376.   160.  -0.464  2.64        0     17.1  -0.441        30
##  7   437.         310.    NA   -0.492  2.32      398      3.46 -0.473        50
##  8   196.         297.   205.  -0.444  2.43        0      5.34 -0.471         0
##  9   245.         299.   214.  -0.410  2.22        0      4.82 -0.414        52
## 10   500.         411.   230.  -0.488  2.38        0      4.52 -0.491         5
## # ... with 23 more variables: MNDVI <dbl>, MeanCurvature <dbl>, NDFI <dbl>,
## #   NDSI <dbl>, NDVI <dbl>, NDWBI <dbl>, NDWI <dbl>, NIRv <dbl>,
## #   Precipitacionmean <dbl>, SAVI <dbl>, Slope <dbl>, Suelo_Map <dbl>,
## #   Temp_Cmean <dbl>, VH <dbl>, VV <dbl>, `clay_15-30cm_mean` <dbl>,
## #   `nitrogen_15-30cm_mean` <dbl>, `ocd_15-30cm_mean` <dbl>,
## #   `ocs_0-30cm_mean` <dbl>, `phh2o_15-30cm_mean` <dbl>,
## #   `sand_15-30cm_mean` <dbl>, `silt_15-30cm_mean` <dbl>,
## #   `soc_15-30cm_mean` <dbl>
my_data <- my_data[,c(1:24)]
head(my_data, 10)
## # A tibble: 10 x 24
##    SOC_TCH SOC_Modelado Aspect    BSI   EVI Ecosystm Elevation    IBI Litologia
##      <dbl>        <dbl>  <dbl>  <dbl> <dbl>    <dbl>     <dbl>  <dbl>     <dbl>
##  1   395.         310.   328.  NA     NA          NA      2.25 NA            NA
##  2   120.         295.   354.  NA     NA          NA   1171.   NA            NA
##  3   548.         438.   302.  -0.272  2.41        0      7.83 -0.251         0
##  4    57.2         83.3  353.  -0.367  2.62        0      1.04 -0.326         0
##  5   226.         271.    69.9 -0.534  2.56        0     11.7  -0.498         0
##  6   448.         376.   160.  -0.464  2.64        0     17.1  -0.441        30
##  7   437.         310.    NA   -0.492  2.32      398      3.46 -0.473        50
##  8   196.         297.   205.  -0.444  2.43        0      5.34 -0.471         0
##  9   245.         299.   214.  -0.410  2.22        0      4.82 -0.414        52
## 10   500.         411.   230.  -0.488  2.38        0      4.52 -0.491         5
## # ... with 15 more variables: MNDVI <dbl>, MeanCurvature <dbl>, NDFI <dbl>,
## #   NDSI <dbl>, NDVI <dbl>, NDWBI <dbl>, NDWI <dbl>, NIRv <dbl>,
## #   Precipitacionmean <dbl>, SAVI <dbl>, Slope <dbl>, Suelo_Map <dbl>,
## #   Temp_Cmean <dbl>, VH <dbl>, VV <dbl>
modelo1 <- randomForest(SOC_Modelado~.,data=my_data,
              na.action=na.exclude, ntree=500,importance=TRUE,maxnodes=10,mtry=15)

#Creamos un objeto con las "importancias" de las variables
importancia=data.frame(importance(modelo1))

importancia
##                     X.IncMSE IncNodePurity
## SOC_TCH           82.7975789   1236497.178
## Aspect             0.6757742      2844.196
## BSI                4.6984524      6898.734
## EVI                1.8391752      3585.374
## Ecosystm          -0.6317713      2485.744
## Elevation          4.4431005     10920.708
## IBI                7.1033365     29820.822
## Litologia          0.7528086      2036.331
## MNDVI              5.1785647     12031.317
## MeanCurvature     11.0511140     31182.950
## NDFI               1.7860819      2581.574
## NDSI               3.5711645     10694.038
## NDVI               3.9045482      4328.771
## NDWBI              5.1324803      8794.663
## NDWI               6.5644762     26598.865
## NIRv               4.2853375     10072.954
## Precipitacionmean -1.7314727      3778.785
## SAVI               2.6109916      3178.828
## Slope              8.6014344     85566.273
## Suelo_Map          4.4306046     14687.146
## Temp_Cmean         7.6030904     21324.051
## VH                 0.6145729      2790.763
## VV                 0.3827696      2185.542
#importancia<-sort_df(importancia,vars='MeanDecreaseGini')
importancia <- importancia[order(-importancia$X.IncMSE),] 
importancia
##                     X.IncMSE IncNodePurity
## SOC_TCH           82.7975789   1236497.178
## MeanCurvature     11.0511140     31182.950
## Slope              8.6014344     85566.273
## Temp_Cmean         7.6030904     21324.051
## IBI                7.1033365     29820.822
## NDWI               6.5644762     26598.865
## MNDVI              5.1785647     12031.317
## NDWBI              5.1324803      8794.663
## BSI                4.6984524      6898.734
## Elevation          4.4431005     10920.708
## Suelo_Map          4.4306046     14687.146
## NIRv               4.2853375     10072.954
## NDVI               3.9045482      4328.771
## NDSI               3.5711645     10694.038
## SAVI               2.6109916      3178.828
## EVI                1.8391752      3585.374
## NDFI               1.7860819      2581.574
## Litologia          0.7528086      2036.331
## Aspect             0.6757742      2844.196
## VH                 0.6145729      2790.763
## VV                 0.3827696      2185.542
## Ecosystm          -0.6317713      2485.744
## Precipitacionmean -1.7314727      3778.785
varImpPlot(modelo1)

dotchart(sort(importancia[,1]), xlim=c(0,90), xlab="%IncMSE")