Operaciones vectorizadas

# Create two character vectors to combine
names.first <- c("Juan Gabriel", "Zach", "Jack", "Sansa")
names.last <- c("Gomila", "Effron", "Sparrow", "Stark")

# paste() is vectorized: it joins the two vectors element by element,
# as if we had written an explicit for loop
paste(names.first, names.last)
## [1] "Juan Gabriel Gomila" "Zach Effron"         "Jack Sparrow"       
## [4] "Sansa Stark"

Supongamos que uno de los dos vectores es más largo:

single.surname <- c("Zuccherberg")

# paste() also combines vectors of different lengths: the length-1 vector
# is paired with every element of the longer one
paste(names.first, single.surname)
## [1] "Juan Gabriel Zuccherberg" "Zach Zuccherberg"        
## [3] "Jack Zuccherberg"         "Sansa Zuccherberg"

Una función que tomara dos vectores y los combinara:

# Build lowercase usernames: the surname followed by the first two
# characters of the first name. Works element-wise on both vectors.
username <- function(first, last){
  prefix <- substr(first, 1, 2)
  combined <- paste0(last, prefix)
  tolower(combined)
}

# Call username() on the whole vectors at once; one username per name pair
username(names.first, names.last)
## [1] "gomilaju"  "effronza"  "sparrowja" "starksa"

Uso de la función apply

Es una función para aplicar a un conjunto de filas o columnas de una matriz:

# Build a 4 x 4 matrix from the integers 1 to 16; the printed result shows
# the values filling the matrix column by column
m <- matrix(seq(1,16), 4, 4)
m
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16

Operaciones con apply:

# Second argument of apply():
# 1      -> function applied over rows
# 2      -> function applied over columns
# c(1,2) -> function applied to each individual element

apply(m, 1, min ) # minimum of each row
## [1] 1 2 3 4
apply(m, 2, max) # maximum of each column
## [1]  4  8 12 16
apply(m, 1, prod) # product of each row
## [1]  585 1680 3465 6144
apply(m, 2, sum) # sum of each column
## [1] 10 26 42 58
apply(m, c(1,2), function(x){x^2}) # square every entry (applied over both rows and columns)
##      [,1] [,2] [,3] [,4]
## [1,]    1   25   81  169
## [2,]    4   36  100  196
## [3,]    9   49  121  225
## [4,]   16   64  144  256
apply(m, 1, quantile, probs = c(0.4, 0.6, 0.8)) # 40%, 60% and 80% quantiles of each ROW (margin 1); each result column corresponds to one row of m
##     [,1] [,2] [,3] [,4]
## 40%  5.8  6.8  7.8  8.8
## 60%  8.2  9.2 10.2 11.2
## 80% 10.6 11.6 12.6 13.6
# column sums and row sums
colSums(m) 
## [1] 10 26 42 58
rowSums(m)
## [1] 28 32 36 40
# column means and row means
colMeans(m)
## [1]  2.5  6.5 10.5 14.5
rowMeans(m)
## [1]  7  8  9 10

Aplicando “apply” en un array. Primero creamos el array 3D:

# 4 x 4 x 2 array holding the integers 1 to 32
array3D <- array(seq(1, 32), dim = c(4,4,2))
array3D
## , , 1
## 
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16
## 
## , , 2
## 
##      [,1] [,2] [,3] [,4]
## [1,]   17   21   25   29
## [2,]   18   22   26   30
## [3,]   19   23   27   31
## [4,]   20   24   28   32

Aplicamos las operaciones:

apply(array3D, 3, sum) # margin 3: total of each 4 x 4 slice along the third dimension
## [1] 136 392
sum(1:16) # cross-check: the first slice holds 1..16, so this matches the first total above
## [1] 136
apply(array3D, 3, mean) # mean of each slice
## [1]  8.5 24.5
apply(array3D, c(1,2), sum) # for each (row, column) position, sum across the two slices
##      [,1] [,2] [,3] [,4]
## [1,]   18   26   34   42
## [2,]   20   28   36   44
## [3,]   22   30   38   46
## [4,]   24   32   40   48

lapply y sapply

Hay dos funciones mejores para trabajar con vectores, listas y dataframes:

lapply -> l = lista; se puede aplicar a vectores, listas y data frames, y devuelve siempre una lista.

sapply -> devuelve una lista o un vector, según pueda simplificarse el resultado.

# Load the auto-mpg data set, keeping text columns as character vectors.
# FALSE is spelled out: the shorthand F is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
auto <- read.csv("../DataSets/auto-mpg.csv", stringsAsFactors = FALSE)

Creamos un vector que aplicaremos directamente en el dataframe:

x <- c(1,2,3)
x
## [1] 1 2 3
# apply the square root to every element with lapply
lapply(x, sqrt) # returns a list
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 1.414214
## 
## [[3]]
## [1] 1.732051
class(lapply(x, sqrt)) # confirms that the result is a list
## [1] "list"
# x is now a named list whose elements have different lengths
x <- list(a = 1:10, b = c(1,10,100,1000), c = seq(5,50, by = 5))
x
## $a
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $b
## [1]    1   10  100 1000
## 
## $c
##  [1]  5 10 15 20 25 30 35 40 45 50
lapply(x, mean) # mean of each list element, returned as a list with the same names
## $a
## [1] 5.5
## 
## $b
## [1] 277.75
## 
## $c
## [1] 27.5
class(lapply(x,mean))
## [1] "list"

Miramos cómo es con sapply():

sapply(x, sqrt) # still a list here: the results have different lengths, so they are not simplified to a vector
## $a
##  [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751
##  [8] 2.828427 3.000000 3.162278
## 
## $b
## [1]  1.000000  3.162278 10.000000 31.622777
## 
## $c
##  [1] 2.236068 3.162278 3.872983 4.472136 5.000000 5.477226 5.916080
##  [8] 6.324555 6.708204 7.071068
class(sapply(x, sqrt))
## [1] "list"
sapply(x, mean) # one number per element, so the result is simplified to a named numeric vector
##      a      b      c 
##   5.50 277.75  27.50
class(sapply(x, mean))
## [1] "numeric"

Aplicando las funciones sobre el dataset:

# Minimum of columns 2 to 8 of the data set; lapply returns one list element per column
lapply(auto[,2:8], min)
## $mpg
## [1] 9
## 
## $cylinders
## [1] 3
## 
## $displacement
## [1] 68
## 
## $horsepower
## [1] 46
## 
## $weight
## [1] 1613
## 
## $acceleration
## [1] 8
## 
## $model_year
## [1] 70
# Minimum of columns 2 to 8 via sapply: the result is a named vector instead of a list
sapply(auto[,2:8], min)
##          mpg    cylinders displacement   horsepower       weight 
##            9            3           68           46         1613 
## acceleration   model_year 
##            8           70

La función summary devuelve una lista con lapply y una matriz con sapply:

# summary() of columns 2 to 8; lapply keeps the per-column summaries as a list
lapply(auto[,2:8], summary)
## $mpg
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.00   17.50   23.00   23.51   29.00   46.60 
## 
## $cylinders
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   4.000   4.000   5.455   8.000   8.000 
## 
## $displacement
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   104.2   148.5   193.4   262.0   455.0 
## 
## $horsepower
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    46.0    76.0    92.0   104.1   125.0   230.0 
## 
## $weight
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1613    2224    2804    2970    3608    5140 
## 
## $acceleration
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00   13.82   15.50   15.57   17.18   24.80 
## 
## $model_year
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   70.00   73.00   76.00   76.01   79.00   82.00
# With sapply the per-column summaries are arranged into a matrix, one column per variable
sapply(auto[,2:8], summary)
##              mpg cylinders displacement horsepower   weight acceleration
## Min.     9.00000  3.000000      68.0000    46.0000 1613.000      8.00000
## 1st Qu. 17.50000  4.000000     104.2500    76.0000 2223.750     13.82500
## Median  23.00000  4.000000     148.5000    92.0000 2803.500     15.50000
## Mean    23.51457  5.454774     193.4259   104.1281 2970.425     15.56809
## 3rd Qu. 29.00000  8.000000     262.0000   125.0000 3608.000     17.17500
## Max.    46.60000  8.000000     455.0000   230.0000 5140.000     24.80000
##         model_year
## Min.      70.00000
## 1st Qu.   73.00000
## Median    76.00000
## Mean      76.01005
## 3rd Qu.   79.00000
## Max.      82.00000
sapply(auto[,2:8], range) # range returns the minimum and the maximum of each column
##       mpg cylinders displacement horsepower weight acceleration model_year
## [1,]  9.0         3           68         46   1613          8.0         70
## [2,] 46.6         8          455        230   5140         24.8         82
sapply(auto[,2:8], min) 
##          mpg    cylinders displacement   horsepower       weight 
##            9            3           68           46         1613 
## acceleration   model_year 
##            8           70
sapply(auto[,2],   min)# does NOT give the column minimum: auto[,2] is a plain vector, so min is applied to each value separately and the values come back unchanged
##   [1] 28.0 19.0 36.0 28.0 21.0 23.0 15.5 32.9 16.0 13.0 12.0 30.7 13.0 27.9
##  [15] 13.0 23.8 29.0 14.0 14.0 29.0 20.5 26.6 20.0 20.0 26.4 16.0 40.8 15.0
##  [29] 18.0 35.0 26.5 13.0 25.8 39.1 25.0 14.0 19.4 30.0 32.0 26.0 20.6 17.5
##  [43] 18.0 14.0 27.0 25.1 14.0 19.1 17.0 23.5 21.5 19.0 22.0 19.4 20.0 32.0
##  [57] 30.9 29.0 14.0 14.0 38.0 24.0 14.0 14.0 16.5 31.0 19.9 12.0 16.0 17.0
##  [71] 33.5 15.0 19.0 31.3 18.0 13.0 20.5 21.0 14.0 15.5 32.0 33.8 36.1 22.0
##  [85] 21.0 17.6 40.9 15.5 26.0 24.0 15.0 13.0 36.0 37.2 43.4 25.0 31.5 15.0
##  [99] 27.0 11.0 26.0 26.0 26.0 12.0 24.5 26.0 34.5 15.0 27.5 18.0 32.3 18.0
## [113] 10.0 23.6 22.4 37.0 21.5 33.0 27.0 25.0 23.0 14.0 26.6 18.0 14.0 14.0
## [127] 34.0 29.5 16.9 21.0 26.0 33.0 16.0 15.0 10.0 28.0 20.2 29.8 14.0 11.0
## [141] 15.0 38.0 13.0 41.5 19.2 34.1 21.0 36.0 18.0 44.0 37.7 16.0 31.0 25.5
## [155] 34.3 26.0 16.0 13.0 27.2 23.0 24.2 30.0 14.0 13.0 30.0 17.5 17.5 29.0
## [169] 22.0 15.0 36.4 15.0 36.0 21.0 15.5 36.1 17.5 22.0 26.0 18.0 25.0 23.0
## [183] 15.0 22.0 23.0 12.0 15.0 21.6 22.0  9.0 28.0 22.0 14.5 20.0 19.0 22.3
## [197] 24.0 12.0 18.0 29.5 19.0 25.4 24.0 26.0 23.0 13.0 18.0 27.2 18.0 24.0
## [211] 29.0 26.0 13.0 31.0 20.6 33.5 27.4 30.0 18.0 24.0 25.0 12.0 20.0 13.0
## [225] 26.8 15.0 26.0 31.0 34.7 30.5 25.0 21.0 18.1 32.4 15.0 24.0 28.0 18.6
## [239] 19.8 13.0 19.0 25.0 23.0 27.0 25.0 35.7 20.0 16.5 34.0 20.0 23.7 18.2
## [253] 11.0 16.0 24.5 33.7 37.3 16.2 31.0 27.2 32.0 31.6 43.1 30.0 32.4 27.0
## [267] 17.0 29.0 46.6 20.5 34.2 32.0 18.5 17.7 25.0 20.2 17.0 28.0 28.0 28.0
## [281] 18.0 17.5 13.0 15.0 15.0 27.0 20.2 24.0 23.0 25.4 11.0 15.0 27.0 20.0
## [295] 29.9 36.0 23.2 37.0 24.0 38.0 35.0 29.8 26.0 27.0 17.6 13.0 24.0 22.0
## [309] 26.0 18.5 16.0 19.0 31.0 28.0 18.0 20.3 34.1 31.0 36.0 15.5 13.0 27.0
## [323] 19.0 18.0 18.0 32.2 31.8 17.0 33.5 25.5 30.0 33.0 21.0 19.0 44.3 22.0
## [337] 19.0 24.0 28.1 14.0 19.2 32.0 17.0 18.5 13.0 28.4 19.2 29.0 31.5 23.9
## [351] 13.0 34.4 28.8 23.0 38.0 20.0 16.0 16.0 25.0 30.0 32.8 20.2 18.1 32.7
## [365] 21.5 22.0 24.3 14.0 37.0 25.0 28.0 31.9 16.0 19.0 39.4 29.0 17.0 13.0
## [379] 23.0 22.5 20.8 16.0 39.0 21.1 18.0 35.1 14.0 44.6 16.0 14.0 32.1 23.9
## [393] 13.0 16.5 34.5 38.1 30.5 19.0
# Wrapping the single column in a data frame makes sapply treat it as one
# column, so min() receives all of its values together. FALSE is spelled out
# because the shorthand F is a reassignable variable, not a reserved word.
sapply(as.data.frame(auto[,2]),min, simplify = FALSE)
## $`auto[, 2]`
## [1] 9
# tapply
# Convert cylinders into a factor with readable labels so it can be used for grouping
auto$cylinders <- factor(auto$cylinders,
                         levels = c(3,4,5,6,8),
                         labels = c("3C", "4C", "5C", "6C", "8C"))

# Mean mpg for each cylinder group
tapply(auto$mpg, auto$cylinders, mean)
##       3C       4C       5C       6C       8C 
## 20.55000 29.28676 27.36667 19.98571 14.96311
# Same grouping passed as a named list, which labels the result with "cyl"
tapply(auto$mpg, list(cyl = auto$cylinders), mean)
## cyl
##       3C       4C       5C       6C       8C 
## 20.55000 29.28676 27.36667 19.98571 14.96311
# by() splits the data frame by cylinder group; despite its name, the `row`
# argument receives each group's subset, on which the correlation between
# mpg and acceleration is computed
by(auto, auto$cylinders,
   function(row){ cor(row$mpg, row$acceleration)} )
## auto$cylinders: 3C
## [1] -0.8188727
## -------------------------------------------------------- 
## auto$cylinders: 4C
## [1] 0.07488465
## -------------------------------------------------------- 
## auto$cylinders: 5C
## [1] 0.714897
## -------------------------------------------------------- 
## auto$cylinders: 6C
## [1] -0.339605
## -------------------------------------------------------- 
## auto$cylinders: 8C
## [1] 0.3212278