¿Qué tenemos aquí? - El análisis exploratorio de datos

## 
## /*----------------------------------------------------------------------------+
## |                                                                             |
## |                   copyright (c) 2020 by Kevin Hidalgo.                      |
## |                                                                             |
## +-------------+---------------------------------------------------------------+
## | producto:   | Curso completo de Machine Learning: Data Science con RStudio  |
## | Tema:       | Direcciones carpeta                                           |
## | programa:   | direcciones.r                                                 |
## | soporte:    | kfhidalgoh@unal.edu.co                                        |
## | version:    | version 4.0.1 See Things Now                                  |
## | lenguaje:   | R                                                             |
## +-------------+---------------------------------------------------------------+
## | proposito:  | seteo de rutas de trabajo                                     |
## +-------------+---------------------------------------------------------------+
## | parametros: | direcciones de carpetas de trabajo                            |
## +-------------+---------------------------------------------------------------+
## | Salidas   : | valores de direcciones                                        |
## | Generadas   |                                                               |
## +-------------+---------------------------------------------------------------+
## | comentarios:| se debe cambiar las rutas de archivos para su adecuada        |
## |             | ejecución                                                     |
## +-------------+---------------------------------------------------------------+
## | Autor(es):  | Kevin Hidalgo | JUN2020   | initial creation                  |
## +-------------+---------------------------------------------------------------|
## | Basado en:  |                                                               |
## +-------------+--------------+-----------+-----------------------------------*/

Resumiendo nuestros datos con summary y str

# Load the auto-mpg data set from the input folder, keeping text columns as
# character vectors (dir.Data.Input is not defined in this section).
data <- read.csv(paste0(dir.Data.Input, "/auto-mpg.csv"), stringsAsFactors = FALSE)

# Recode the cylinder count as a factor with readable labels.
data$cylinders <- factor(data$cylinders, levels = c(3, 4, 5, 6, 8), labels = c("3cil", 
    "4cil", "5cil", "6cil", "8cil"))

nosotros podemos resumir las variables con la función summary()

summary(data)

       No           mpg       cylinders   displacement   horsepower 
 Min.   :  1   Min.   : 9.0   3cil:  4   Min.   : 68   Min.   : 46  
 1st Qu.:100   1st Qu.:17.5   4cil:204   1st Qu.:104   1st Qu.: 76  
 Median :200   Median :23.0   5cil:  3   Median :148   Median : 92  
 Mean   :200   Mean   :23.5   6cil: 84   Mean   :193   Mean   :104  
 3rd Qu.:299   3rd Qu.:29.0   8cil:103   3rd Qu.:262   3rd Qu.:125  
 Max.   :398   Max.   :46.6              Max.   :455   Max.   :230  
     weight      acceleration    model_year   car_name        
 Min.   :1613   Min.   : 8.0   Min.   :70   Length:398        
 1st Qu.:2224   1st Qu.:13.8   1st Qu.:73   Class :character  
 Median :2804   Median :15.5   Median :76   Mode  :character  
 Mean   :2970   Mean   :15.6   Mean   :76                     
 3rd Qu.:3608   3rd Qu.:17.2   3rd Qu.:79                     
 Max.   :5140   Max.   :24.8   Max.   :82

la función str() nos da una idea de la estructura con la cual R ha generado el objeto

str(data)

'data.frame':   398 obs. of  9 variables:
 $ No          : int  1 2 3 4 5 6 7 8 9 10 ...
 $ mpg         : num  28 19 36 28 21 23 15.5 32.9 16 13 ...
 $ cylinders   : Factor w/ 5 levels "3cil","4cil",..: 2 1 2 2 4 2 5 2 4 5 ...
 $ displacement: num  140 70 107 97 199 115 304 119 250 318 ...
 $ horsepower  : int  90 97 75 92 90 95 120 100 105 150 ...
 $ weight      : int  2264 2330 2205 2288 2648 2694 3962 2615 3897 3755 ...
 $ acceleration: num  15.5 13.5 14.5 17 15 15 13.9 14.8 18.5 14 ...
 $ model_year  : int  71 72 82 72 70 75 76 81 75 76 ...
 $ car_name    : chr  "chevrolet vega 2300" "mazda rx2 coupe" "honda accord" "datsun 510 (sw)" ...

son dos herramientas muy útiles porque nos dan una idea inicial de la base de datos; la función summary también se puede usar sobre variables concretas

summary(data$cylinders)

3cil 4cil 5cil 6cil 8cil 
   4  204    3   84  103

str(data$cylinders)

 Factor w/ 5 levels "3cil","4cil",..: 2 1 2 2 4 2 5 2 4 5 ...

summary(data$mpg)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    9.0    17.5    23.0    23.5    29.0    46.6

Estadísticos y medidas básicas

pack <- c("modeest", "raster", "moments")
ipak(pack)

modeest  raster moments 
   TRUE    TRUE    TRUE

modeest: se usa para calcular la moda
raster: cuantiles y coeficiente de variación
moments: coeficiente de asimetría y curtosis

x <- data$mpg

Medidas de centralización

media aritmética \[ \bar{X}=\frac{\sum_{i\,=\,1}^{n}x_{i}}{n} \]

mean(x)  #sum(x)/length(x)

[1] 23.51

mediana \[ P\left(X\leq m\right)=0.5 \]

median(x)

[1] 23

moda \[ p\left(X=M\right)\geq p\left(x=x_{i}\right)\qquad\forall1\leq i\leq n \]

mfv(x)

[1] 13

percentiles \[ P\left(X\leq x_{p}\right)=p\qquad p\in\left[0,1\right] \]

quantile(x)

  0%  25%  50%  75% 100% 
 9.0 17.5 23.0 29.0 46.6

Medidas de dispersión

varianza \[ s^{2}=\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{2}}{n-1} \]

var(x)

[1] 61.09

desviación típica \[ s=+\sqrt{s^{2}} \]

sd(x)

[1] 7.816

coeficientes de variación \[ C_{v}=\frac{s}{\bar{x}}\cdot100 \]

cv(x)

[1] 33.24

Momento de orden r respecto de la media

\[ m_{r}=\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{r}}{n} \]

Medidas de asimetría

asimetría de fisher \[ CA_{F}=\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{3}}{n\cdot s^{3}} \]

skewness(x)

[1] 0.4553

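curtosis \[ c=\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{4}}{n\cdot s^{4}}-3 \] (nota: la función kurtosis() del paquete moments devuelve la curtosis de Pearson sin restar 3; el exceso de curtosis de la fórmula se obtiene con kurtosis(x) - 3)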

kurtosis(x)

[1] 2.481

Subconjunto de datos

vamos a ver dos métodos, uno con el nombre de las columnas(basado en índices) y el segundo con la función subset. Cargamos de nuevo los datos

data <- read.csv(paste0(dir.Data.Input, "/auto-mpg.csv"), stringsAsFactors = FALSE)

Index by position

data[1:5, 8:9]

  model_year            car_name
1         71 chevrolet vega 2300
2         72     mazda rx2 coupe
3         82        honda accord
4         72     datsun 510 (sw)
5         70         amc gremlin

data[1:5, c(8, 9)]

  model_year            car_name
1         71 chevrolet vega 2300
2         72     mazda rx2 coupe
3         82        honda accord
4         72     datsun 510 (sw)
5         70         amc gremlin

index por nombre

data[1:5, c("model_year", "car_name")]

  model_year            car_name
1         71 chevrolet vega 2300
2         72     mazda rx2 coupe
3         82        honda accord
4         72     datsun 510 (sw)
5         70         amc gremlin

& : AND
| : OR
! : NOT
== : igual comparación

min y max de mpg

data[data$mpg == max(data$mpg) | data$mpg == min(data$mpg), ]

     No  mpg cylinders displacement horsepower weight acceleration model_year
190 190  9.0         8          304        193   4732         18.5         70
269 269 46.6         4           86         65   2110         17.9         80
     car_name
190  hi 1200d
269 mazda glc

filtros con condiciones

data[data$mpg > 30 & data$cylinders == 6, c("car_name", "mpg")]

                             car_name  mpg
12                       volvo diesel 30.7
300 oldsmobile cutlass ciera (diesel) 38.0
364                     datsun 280-zx 32.7

# Same filter as the previous call: `data$cyl` resolves to the `cylinders`
# column through the partial matching done by `$`.
data[data$mpg > 30 & data$cyl == 6, c("car_name", "mpg")]

                             car_name  mpg
12                       volvo diesel 30.7
300 oldsmobile cutlass ciera (diesel) 38.0
364                     datsun 280-zx 32.7

subset

subset(data, mpg > 30 & cylinders == 6, select = c("car_name", "mpg"))

                             car_name  mpg
12                       volvo diesel 30.7
300 oldsmobile cutlass ciera (diesel) 38.0
364                     datsun 280-zx 32.7

Divisiones con split

sirve para dividir el data frame por medio de un factor y tenemos su función inversa que es unsplit

car.list <- split(data, data$cylinders)  # builds a list with one data frame per cylinder count
# first element
car.list[1]  # single bracket: a one-element list whose value is a data frame

$`3`
     No  mpg cylinders displacement horsepower weight acceleration model_year
2     2 19.0         3           70         97   2330         13.5         72
199 199 18.0         3           70         90   2124         13.5         73
251 251 23.7         3           70        100   2420         12.5         80
365 365 21.5         3           80        110   2720         13.5         77
           car_name
2   mazda rx2 coupe
199       maxda rx3
251   mazda rx-7 gs
365      mazda rx-4

car.list[[1]]  # double bracket: this one is the data frame itself

     No  mpg cylinders displacement horsepower weight acceleration model_year
2     2 19.0         3           70         97   2330         13.5         72
199 199 18.0         3           70         90   2124         13.5         73
251 251 23.7         3           70        100   2420         12.5         80
365 365 21.5         3           80        110   2720         13.5         77
           car_name
2   mazda rx2 coupe
199       maxda rx3
251   mazda rx-7 gs
365      mazda rx-4

str(car.list[1])

List of 1
 $ 3:'data.frame':  4 obs. of  9 variables:
  ..$ No          : int [1:4] 2 199 251 365
  ..$ mpg         : num [1:4] 19 18 23.7 21.5
  ..$ cylinders   : int [1:4] 3 3 3 3
  ..$ displacement: num [1:4] 70 70 70 80
  ..$ horsepower  : int [1:4] 97 90 100 110
  ..$ weight      : int [1:4] 2330 2124 2420 2720
  ..$ acceleration: num [1:4] 13.5 13.5 12.5 13.5
  ..$ model_year  : int [1:4] 72 73 80 77
  ..$ car_name    : chr [1:4] "mazda rx2 coupe" "maxda rx3" "mazda rx-7 gs" "mazda rx-4"

names(car.list[[1]])

[1] "No"           "mpg"          "cylinders"    "displacement" "horsepower"  
[6] "weight"       "acceleration" "model_year"   "car_name"

Partición de data frames con variables numéricas

para saber si el modelo es bueno, lo primero es hacer una partición donde una parte se usa para entrenar el modelo y la otra para evaluar su desempeño; para esto usaremos el paquete caret

pack <- c("caret")
ipak(pack)

caret 
 TRUE

ahora leeremos nuestra base de datos la cual será BostonHousing.csv

data <- read.csv(paste0(dir.Data.Input, "/BostonHousing.csv"))

vamos a hacer una partición de entrenamiento con el 80%

# Row ids for an 80% training partition, sampled on the MEDV column.
# `list = FALSE` is spelled out because `F` is an ordinary variable that can
# be reassigned.
training.ids <- createDataPartition(data$MEDV, p = 0.8, list = FALSE)
data.training <- data[training.ids, ]
str(data.training)

'data.frame':   407 obs. of  14 variables:
 $ CRIM   : num  0.00632 0.02729 0.03237 0.06905 0.02985 ...
 $ ZN     : num  18 0 0 0 0 12.5 12.5 12.5 12.5 12.5 ...
 $ INDUS  : num  2.31 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 7.87 ...
 $ CHAS   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NOX    : num  0.538 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 0.524 ...
 $ RM     : num  6.58 7.18 7 7.15 6.43 ...
 $ AGE    : num  65.2 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 94.3 ...
 $ DIS    : num  4.09 4.97 6.06 6.06 6.06 ...
 $ RAD    : int  1 2 3 3 3 5 5 5 5 5 ...
 $ TAX    : int  296 242 222 222 222 311 311 311 311 311 ...
 $ PTRATIO: num  15.3 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 15.2 ...
 $ B      : num  397 393 395 397 394 ...
 $ LSTAT  : num  4.98 4.03 2.94 5.33 5.21 ...
 $ MEDV   : num  24 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15 ...

data.validation <- data[-training.ids, ]
str(data.validation)

'data.frame':   99 obs. of  14 variables:
 $ CRIM   : num  0.0273 0.1175 0.6274 1.2325 1.3547 ...
 $ ZN     : num  0 12.5 0 0 0 0 0 0 0 0 ...
 $ INDUS  : num  7.07 7.87 8.14 8.14 8.14 8.14 5.96 5.96 6.91 6.91 ...
 $ CHAS   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NOX    : num  0.469 0.524 0.538 0.538 0.538 0.538 0.499 0.499 0.448 0.448 ...
 $ RM     : num  6.42 6.01 5.83 6.14 6.07 ...
 $ AGE    : num  78.9 82.9 56.5 91.7 100 96.9 61.4 30.2 6.6 33.3 ...
 $ DIS    : num  4.97 6.23 4.5 3.98 4.17 ...
 $ RAD    : int  2 5 4 4 4 4 5 5 3 3 ...
 $ TAX    : int  242 311 307 307 307 307 279 279 233 233 ...
 $ PTRATIO: num  17.8 15.2 21 21 21 21 19.2 19.2 17.9 17.9 ...
 $ B      : num  397 397 396 397 377 ...
 $ LSTAT  : num  9.14 13.27 8.47 18.72 13.04 ...
 $ MEDV   : num  21.6 18.9 19.9 15.2 14.5 13.5 20 24.7 25.3 20 ...

hay procesos que requieren tres particiones dos para construir el modelo y la tercera de test

# First step of the three-way split: 70% of the rows go to training.
training.ids.2 <- createDataPartition(data$MEDV, p = 0.7, list = FALSE)
data.training.2 <- data[training.ids.2, ]
str(data.training.2)

'data.frame':   356 obs. of  14 variables:
 $ CRIM   : num  0.0273 0.0299 0.0883 0.1446 0.17 ...
 $ ZN     : num  0 0 12.5 12.5 12.5 12.5 12.5 0 0 0 ...
 $ INDUS  : num  7.07 2.18 7.87 7.87 7.87 7.87 7.87 8.14 8.14 8.14 ...
 $ CHAS   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NOX    : num  0.469 0.458 0.524 0.524 0.524 0.524 0.524 0.538 0.538 0.538 ...
 $ RM     : num  7.18 6.43 6.01 6.17 6 ...
 $ AGE    : num  61.1 58.7 66.6 96.1 85.9 94.3 82.9 61.8 84.5 56.5 ...
 $ DIS    : num  4.97 6.06 5.56 5.95 6.59 ...
 $ RAD    : int  2 3 5 5 5 5 5 4 4 4 ...
 $ TAX    : int  242 222 311 311 311 311 311 307 307 307 ...
 $ PTRATIO: num  17.8 18.7 15.2 15.2 15.2 15.2 15.2 21 21 21 ...
 $ B      : num  393 394 396 397 387 ...
 $ LSTAT  : num  4.03 5.21 12.43 19.15 17.1 ...
 $ MEDV   : num  34.7 28.7 22.9 27.1 18.9 15 18.9 20.4 18.2 19.9 ...

# Second step: take the rows left out of training and send half of them to
# the validation partition.
temp <- data[-training.ids.2, ]
validation.ids.2 <- createDataPartition(temp$MEDV, p = 0.5, list = FALSE)
data.validation <- temp[validation.ids.2, ]
str(data.validation)

'data.frame':   75 obs. of  14 variables:
 $ CRIM   : num  0.0273 0.0324 0.069 0.2112 0.0938 ...
 $ ZN     : num  0 0 0 12.5 12.5 0 0 0 0 0 ...
 $ INDUS  : num  7.07 2.18 2.18 7.87 7.87 8.14 8.14 8.14 8.14 8.14 ...
 $ CHAS   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NOX    : num  0.469 0.458 0.458 0.524 0.524 0.538 0.538 0.538 0.538 0.538 ...
 $ RM     : num  6.42 7 7.15 5.63 5.89 ...
 $ AGE    : num  78.9 45.8 54.2 100 39 81.7 98.1 94.1 94.4 96.9 ...
 $ DIS    : num  4.97 6.06 6.06 6.08 5.45 ...
 $ RAD    : int  2 3 3 5 5 4 4 4 4 4 ...
 $ TAX    : int  242 222 222 311 311 307 307 307 307 307 ...
 $ PTRATIO: num  17.8 18.7 18.7 15.2 15.2 21 21 21 21 21 ...
 $ B      : num  397 395 397 387 390 ...
 $ LSTAT  : num  9.14 2.94 5.33 29.93 15.71 ...
 $ MEDV   : num  21.6 33.4 36.2 16.5 21.7 17.5 13.6 15.6 18.4 13.5 ...

data.testing <- temp[-validation.ids.2, ]
str(data.testing)

'data.frame':   75 obs. of  14 variables:
 $ CRIM   : num  0.00632 1.23247 0.0536 0.15445 0.10328 ...
 $ ZN     : num  18 0 21 25 25 17.5 0 0 25 0 ...
 $ INDUS  : num  2.31 8.14 5.64 5.13 5.13 ...
 $ CHAS   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NOX    : num  0.538 0.538 0.439 0.453 0.453 ...
 $ RM     : num  6.58 6.14 6.51 6.14 5.93 ...
 $ AGE    : num  65.2 91.7 21.1 29.2 47.2 59.5 7.8 74.5 33.5 45.1 ...
 $ DIS    : num  4.09 3.98 6.81 7.81 6.93 ...
 $ RAD    : int  1 4 4 8 8 3 4 5 4 3 ...
 $ TAX    : int  296 307 243 284 284 216 305 398 281 247 ...
 $ PTRATIO: num  15.3 21 16.8 19.7 19.7 18.6 19.2 18.7 19 18.5 ...
 $ B      : num  397 397 397 391 397 ...
 $ LSTAT  : num  4.98 18.72 5.28 6.86 9.22 ...
 $ MEDV   : num  24 15.2 25 23.3 19.6 33 22.8 20 28 22.5 ...

createDataPartition es una función que selecciona aleatoriamente índices de fila, haciendo un muestreo basado en grupos de percentiles

Partición de data frames con variables categóricas

cargamos los datos

data.2 <- read.csv(paste0(dir.Data.Input, "/boston-housing-classification.csv"))
str(data.2)

'data.frame':   363 obs. of  14 variables:
 $ CRIM    : num  0.00632 0.02729 0.03237 0.06905 0.02985 ...
 $ ZN      : num  18 0 0 0 0 12.5 12.5 12.5 12.5 12.5 ...
 $ INDUS   : num  2.31 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 7.87 ...
 $ CHAS    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NOX     : num  0.538 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 0.524 ...
 $ RM      : num  6.58 7.18 7 7.15 6.43 ...
 $ AGE     : num  65.2 61.1 45.8 54.2 58.7 96.1 100 85.9 94.3 82.9 ...
 $ DIS     : num  4.09 4.97 6.06 6.06 6.06 ...
 $ RAD     : int  1 2 3 3 3 5 5 5 5 5 ...
 $ TAX     : int  296 242 222 222 222 311 311 311 311 311 ...
 $ PTRATIO : num  15.3 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 15.2 ...
 $ B       : num  397 393 395 397 394 ...
 $ LSTAT   : num  4.98 4.03 2.94 5.33 5.21 ...
 $ MEDV_CAT: chr  "High" "High" "High" "High" ...

crearemos la partición por medio de la variable MEDV_CAT

# 70% training partition, sampled on the categorical MEDV_CAT column.
training.ids.3 <- createDataPartition(data.2$MEDV_CAT, p = 0.7, list = FALSE)
data.training.3 <- data.2[training.ids.3, ]
str(data.training.3)

'data.frame':   255 obs. of  14 variables:
 $ CRIM    : num  0.00632 0.02729 0.03237 0.06905 0.21124 ...
 $ ZN      : num  18 0 0 0 12.5 12.5 12.5 12.5 0 0 ...
 $ INDUS   : num  2.31 7.07 2.18 2.18 7.87 7.87 7.87 7.87 8.14 8.14 ...
 $ CHAS    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NOX     : num  0.538 0.469 0.458 0.458 0.524 0.524 0.524 0.524 0.538 0.538 ...
 $ RM      : num  6.58 7.18 7 7.15 5.63 ...
 $ AGE     : num  65.2 61.1 45.8 54.2 100 85.9 94.3 82.9 84.5 98.1 ...
 $ DIS     : num  4.09 4.97 6.06 6.06 6.08 ...
 $ RAD     : int  1 2 3 3 5 5 5 5 4 4 ...
 $ TAX     : int  296 242 222 222 311 311 311 311 307 307 ...
 $ PTRATIO : num  15.3 17.8 18.7 18.7 15.2 15.2 15.2 15.2 21 21 ...
 $ B       : num  397 393 395 397 387 ...
 $ LSTAT   : num  4.98 4.03 2.94 5.33 29.93 ...
 $ MEDV_CAT: chr  "High" "High" "High" "High" ...

# Validation rows are the ones not selected for training. The comma is
# required: `data.2[-training.ids.3]` (no comma) applies the index to the
# flattened cell values instead of dropping rows, which is what the
# `chr [1:4827]` output recorded below reflects.
data.validation.3 <- data.2[-training.ids.3, ]
str(data.validation.3)

 chr [1:4827] " 0.02985" " 0.14455" " 1.05393" " 0.78420" " 0.72580" ...

si queremos hacer tres particiones sería muy similar a como lo realizamos anteriormente; vamos a crear una función que nos ayudará a automatizar este proceso

# Two-way split of a data frame into training and validation partitions.
#
# Args:
#   dataframe:    data frame to split.
#   target.index: column (position or name) passed to createDataPartition()
#                 as the outcome to sample on.
#   prob:         value forwarded as `p` to createDataPartition().
#
# Returns:
#   A list with elements `train` (the selected rows) and `val` (all others).
rda.cb.partition2 <- function(dataframe, target.index, prob) {
    library(caret)
    outcome <- dataframe[, target.index]
    train.rows <- createDataPartition(outcome, p = prob, list = FALSE)
    train.part <- dataframe[train.rows, ]
    val.part <- dataframe[-train.rows, ]
    list(train = train.part, val = val.part)
}


# Three-way split of a data frame into train / validation / test partitions.
#
# Args:
#   dataframe:    data frame to split.
#   target.index: column (position or name) passed to createDataPartition()
#                 as the outcome to sample on.
#   prob.train:   value forwarded as `p` for the training partition.
#   prob.val:     value forwarded as `p` when splitting the rows left out of
#                 training; the selected rows form the validation partition
#                 and the remainder the test partition.
#
# Returns:
#   A list with data frames `train`, `val` and `test`.
rda.cb.partition3 <- function(dataframe, target.index, prob.train, prob.val) {
    library(caret)
    training.ids <- createDataPartition(dataframe[, target.index], p = prob.train, 
        list = FALSE)
    train.data <- dataframe[training.ids, ]
    temp <- dataframe[-training.ids, ]
    validation.ids <- createDataPartition(temp[, target.index], p = prob.val, list = FALSE)
    # The trailing comma in the `test` index matters: without it the negative
    # index is applied to the flattened cell values of `temp` instead of
    # dropping rows, so `test` would not be a data frame.
    list(train = train.data, val = temp[validation.ids, ], test = temp[-validation.ids, ])
}

Histogramas, boxplots y scatterplots

cargamos datos

auto <- read.csv(paste0(dir.Data.Input, "/auto-mpg.csv"))

vamos a sobreescribir la variable cilindros para que esta sea una categoría

auto$cylinders <- factor(auto$cylinders, levels = c(3, 4, 5, 6, 8), labels = c("3cil", 
    "4cil", "5cil", "6cil", "8cil"))

usaré attach para que auto forme parte de la carga principal de R

# attach() places a copy of the columns of `auto` on the search path so they
# can be referenced by bare name, as head(cylinders) does below.
# NOTE(review): the attached copy is not refreshed when `auto` is reassigned
# later in the script; `auto$cylinders` or with(auto, ...) avoids stale data.
attach(auto)
head(cylinders)

[1] 4cil 3cil 4cil 4cil 6cil 4cil
Levels: 3cil 4cil 5cil 6cil 8cil

R en automático reconoce la escala, títulos entre otros, estas opciones se pueden modificar facilmente

Histograma

hist(auto$acceleration, xlab = "Aceleración", ylab = "Frecuencias", main = "Histograma de las aceleraciones", 
    breaks = 16)

Boxplot

se especifica el parámetro data para que se pueda renderizar en Markdown

boxplot(auto$mpg, xlab = "Millas por galón")

boxplot(mpg ~ model_year, data = auto, xlab = "Millas por galón (año)")

boxplot(mpg ~ cylinders, data = auto, xlab = "Consumo por número de cilindros")

# ggplot2
ggplot(data = auto, aes(x = as.factor(model_year), y = mpg, fill = as.factor(model_year))) + 
    geom_boxplot() + ylab("Millas por galón") + xlab("Año")

ggplot(data = auto, aes(x = cylinders, y = mpg, fill = cylinders)) + geom_boxplot() + 
    ylab("Millas por galón") + xlab("Consumo por número de cilindros")

Scatterplot

plot(x = auto$horsepower, y = auto$mpg)

# matriz de scatterplot
pairs(~mpg + displacement + horsepower + weight, data = auto)

# ggplot usando el paquete GGally
library(GGally)
ggpairs(auto, columns = c("mpg", "displacement", "horsepower", "weight"))

Personalizando nuestros gráficos

# histograma
hist(auto$acceleration, col = rainbow(12), xlab = "Aceleración", ylab = "Frecuencias", 
    main = "Histograma de las aceleraciones", breaks = 12)

# Density-scaled histogram of mpg with a kernel density estimate drawn on top.
# The argument name is written in full (no partial matching) and TRUE replaces
# the reassignable shorthand T.
hist(auto$mpg, probability = TRUE)
lines(density(auto$mpg))

# scatterplot

plot(x = auto$horsepower, y = auto$mpg)
linearmodel <- lm(auto$mpg ~ auto$horsepower)
abline(linearmodel)

# Add one colour per cylinder class: draw an empty plot (type = "n"), add the
# regression line, then add each group's points separately.
plot(x = auto$horsepower, y = auto$mpg, type = "n")
linearmodel <- lm(auto$mpg ~ auto$horsepower)
abline(linearmodel)
with(subset(auto, cylinders == "8cil"), points(horsepower, mpg, col = "red"))
with(subset(auto, cylinders == "6cil"), points(horsepower, mpg, col = "yellow"))
with(subset(auto, cylinders == "5cil"), points(horsepower, mpg, col = "green"))
with(subset(auto, cylinders == "4cil"), points(horsepower, mpg, col = "blue"))
with(subset(auto, cylinders == "3cil"), points(horsepower, mpg))  # no colour given, so the default (black) is used

ggplot(data = auto, aes(x = horsepower, y = mpg)) + geom_point(aes(colour = cylinders)) + 
    geom_smooth(method = lm, se = FALSE)

El paquete lattice

gráficos bastante chulos para representar relaciones multivariantes entre las diferentes variables del data set; es uno de los paquetes que maneja las relaciones más complejas

pack <- c("lattice")
ipak(pack)

lattice 
   TRUE

# Reload auto-mpg and recode cylinders with short labels for the lattice
# panels. FALSE is spelled out because `F` can be reassigned.
auto <- read.csv(paste0(dir.Data.Input, "/auto-mpg.csv"), stringsAsFactors = FALSE)
auto$cylinders <- factor(auto$cylinders, levels = c(3, 4, 5, 6, 8), labels = c("3c", 
    "4c", "5c", "6c", "8c"))

para graficar un boxplot usamos la función bwplot

bwplot(~auto$mpg | auto$cylinders, main = "MPG según cilindrada", xlab = "Millas por galón")

para un scatterplot usaremos la función xyplot

xyplot(mpg ~ weight | cylinders, data = auto, main = "Peso VS Consumo VS Cilindraje", 
    xlab = "Peso(kg)", ylab = "Consumo(mpg)")

los gráficos del paquete lattice se forman en 4 pasos

tipo de gráfico (bwplot,xyplot,densityplot,splom)
formula separada por | que viene a ser los factores
data
parametros adicionales

se puede cambiar el esquema (personalización) con la siguiente sentencia

trellis.par.set(theme = col.whitebg())
xyplot(mpg ~ weight | cylinders, data = auto, main = "Peso VS Consumo VS Cilindraje", 
    xlab = "Peso(kg)", ylab = "Consumo(mpg)")

tenemos también los parámetros layout y aspect

bwplot(~auto$mpg | auto$cylinders, main = "MPG según cilindrada", xlab = "Millas por galón", 
    layout = c(2, 3), aspect = 1)

Comparación a través de representaciones

data <- read.csv(paste0(dir.Data.Input, "/daily-bike-rentals.csv"))
str(data)

'data.frame':   731 obs. of  16 variables:
 $ instant   : int  1 2 3 4 5 6 7 8 9 10 ...
 $ dteday    : chr  "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" ...
 $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ yr        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ mnth      : int  1 1 1 1 1 1 1 1 1 1 ...
 $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ weekday   : int  6 0 1 2 3 4 5 6 0 1 ...
 $ workingday: int  0 0 1 1 1 1 1 0 0 1 ...
 $ weathersit: int  2 2 1 1 1 1 2 2 1 1 ...
 $ temp      : num  0.344 0.363 0.196 0.2 0.227 ...
 $ atemp     : num  0.364 0.354 0.189 0.212 0.229 ...
 $ hum       : num  0.806 0.696 0.437 0.59 0.437 ...
 $ windspeed : num  0.16 0.249 0.248 0.16 0.187 ...
 $ casual    : int  331 131 120 108 82 88 148 68 54 41 ...
 $ registered: int  654 670 1229 1454 1518 1518 1362 891 768 1280 ...
 $ cnt       : int  985 801 1349 1562 1600 1606 1510 959 822 1321 ...

# Recode the integer-coded season, workingday and weathersit columns as
# labelled factors.
data$season <- factor(data$season, levels = c(1, 2, 3, 4), labels = c("Invierno", 
    "Primavera", "Verano", "Otoño"))
data$workingday <- factor(data$workingday, levels = c(0, 1), labels = c("Festivo", 
    "De trabajo"))
data$weathersit <- factor(data$weathersit, levels = c(1, 2, 3), labels = c("Despejado", 
    "Nublado", "Lluvia/Nieve ligera"))

# Parse the year-month-day strings in dteday into Date objects.
data$dteday <- as.Date(data$dteday, format = "%Y-%m-%d")

# Daily rental counts for each season.
winter <- subset(data, season == "Invierno")$cnt
spring <- subset(data, season == "Primavera")$cnt
summer <- subset(data, season == "Verano")$cnt
# Autumn is labelled "Otoño" in the season factor; filtering on "Invierno"
# here would only duplicate the winter series.
fall <- subset(data, season == "Otoño")$cnt

media de color rojo y la mediana de color azul

# Draw a 2 x 2 grid with one density-scaled histogram per season. Each panel
# gets a density curve, a red vertical line at the mean and a blue one at the
# median.
par(mfrow = c(2, 2))
invisible(Map(
    function(rentals, axis.label) {
        hist(rentals, prob = TRUE, xlab = axis.label, main = "")
        lines(density(rentals))
        abline(v = mean(rentals), col = "red")
        abline(v = median(rentals), col = "blue")
    },
    list(winter, spring, summer, fall),
    c("Alquiler diario en invierno", "Alquiler diario en primavera",
      "Alquiler diario en Verano", "Alquiler diario en otoño")
))

El gráfico de las judias

se puede utilizar para mostrar una distribución como el boxplot, pero además muestra las frecuencias en forma de histograma dentro de ella

pack <- c("beanplot")
ipak(pack)

beanplot 
    TRUE

par(mfrow = c(1, 1))
beanplot(data$cnt ~ data$season, col = c("blue", "red", "yellow"))

Análisis de causalidad

bwplot(cnt ~ weathersit, data = data, layout = c(1, 1), xlab = "Pronostico del tiempo", 
    ylab = "Frecuencias", par.settings = list(box.rectangle = list(fill = c("red", 
        "yellow", "green"))))

# Box plot of cnt by weathersit with a custom panel function: the standard
# box-and-whisker panel is drawn first, then the individual observations are
# overlaid as a jittered strip plot.
bwplot(cnt ~ weathersit, data = data, layout = c(1, 1), xlab = "Pronostico del tiempo", 
    ylab = "Frecuencias", panel = function(x, y, ...) {
        panel.bwplot(x, y, ...)
        panel.stripplot(x, y, jitter.data = TRUE, ...)
    }, par.settings = list(box.rectangle = list(fill = c("red", "yellow", "green"))))