Comandos_R

Carga de datos

#Carga normal de un archivo .csv 
matricula <-read.csv("ds_educacion/matricula_de_educacion_formal.csv", header = TRUE, sep = ";", encoding="UTF-8")

#- libreria necesaria para cargar archivos .xlsx
library(readxl)

#- identificamos las pestanas (hojas) del archivo de excel
excel_sheets("MEDMUN2019.xlsx")

## [1] "asignacionfinal" "consolidado"

#- importamos el archivo .xlsx sin parametros adicionales, con lo cual importaremos la primera pestana
medmun_excel <- read_excel("MEDMUN2019.xlsx")

#- analizamos su estructura con head, str y summary
head(medmun_excel)

## # A tibble: 6 x 5
##     nro comision delegacion                     ie             formador
##   <dbl> <chr>    <chr>                          <chr>          <chr>   
## 1     1 UNODC    REINO UNIDO DE GRAN BRETAÑA    MADRE LAURA 18 JUANJO  
## 2     2 UNODC    ESTADOS UNIDOS MEXICANOS       REY 17         MP      
## 3     3 UNODC    REPÚBLICA FEDERAL DE ALEMANIA  REY 18         MP      
## 4     4 UNODC    REPÚBLICA PORTUGUESA           CONCEJO 13     LAURA   
## 5     5 UNODC    ESTADOS UNIDOS DE NORTEAMÉRICA PAÚL 17        MP      
## 6     6 UNODC    REPÚBLICA FRANCESA             REY 19         MP

str(medmun_excel)

## Classes 'tbl_df', 'tbl' and 'data.frame':    323 obs. of  5 variables:
##  $ nro       : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ comision  : chr  "UNODC" "UNODC" "UNODC" "UNODC" ...
##  $ delegacion: chr  "REINO UNIDO DE GRAN BRETAÑA" "ESTADOS UNIDOS MEXICANOS" "REPÚBLICA FEDERAL DE ALEMANIA" "REPÚBLICA PORTUGUESA" ...
##  $ ie        : chr  "MADRE LAURA 18" "REY 17" "REY 18" "CONCEJO 13" ...
##  $ formador  : chr  "JUANJO" "MP" "MP" "LAURA" ...

summary(medmun_excel)

##       nro          comision          delegacion             ie           
##  Min.   :  1.0   Length:323         Length:323         Length:323        
##  1st Qu.: 81.5   Class :character   Class :character   Class :character  
##  Median :162.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :171.2                                                           
##  3rd Qu.:242.5                                                           
##  Max.   :394.0                                                           
##    formador        
##  Length:323        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

#- consultamos las variables del dataframe
names(medmun_excel)

## [1] "nro"        "comision"   "delegacion" "ie"         "formador"

#- importamos la segunda pestana del archivo
medmun_excel2 <- read_excel("MEDMUN2019.xlsx", sheet ="consolidado")

#- analizamos su estructura con head, str y summary
head(medmun_excel2)

## # A tibble: 6 x 7
##   formador      ie        delegados presidentes secretarios comunicaciones  kids
##   <chr>         <chr>         <dbl>       <dbl>       <dbl>          <dbl> <dbl>
## 1 Luis Guiller… IE Santa…        15           4           3              0    22
## 2 Luis Guiller… IE Lucre…        25           3           0              0    10
## 3 Luis Guiller… IE Mater…        20           2           1              0    25
## 4 Angie Marcel… IE Monse…        11           2           0              0     0
## 5 Angie Marcel… IE Lola …        13           3           0              0     0
## 6 Angie Marcel… IE CEFA          31           0           0              2     0

str(medmun_excel2)

## Classes 'tbl_df', 'tbl' and 'data.frame':    21 obs. of  7 variables:
##  $ formador      : chr  "Luis Guillermo Brand Rendón" "Luis Guillermo Brand Rendón" "Luis Guillermo Brand Rendón" "Angie Marcela Nanez Alvarado" ...
##  $ ie            : chr  "IE Santa Catalina de Siena" "IE Lucrecio Jaramillo Vélez" "IE Mater Dei" "IE Monseñor Gerardo Valencia" ...
##  $ delegados     : num  15 25 20 11 13 31 16 24 23 16 ...
##  $ presidentes   : num  4 3 2 2 3 0 0 0 0 0 ...
##  $ secretarios   : num  3 0 1 0 0 0 0 0 0 0 ...
##  $ comunicaciones: num  0 0 0 0 0 2 2 2 2 0 ...
##  $ kids          : num  22 10 25 0 0 0 0 12 0 0 ...

summary(medmun_excel2)

##    formador              ie              delegados      presidentes   
##  Length:21          Length:21          Min.   : 0.00   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:15.00   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :20.00   Median :0.000  
##                                        Mean   :17.95   Mean   :1.095  
##                                        3rd Qu.:22.00   3rd Qu.:2.000  
##                                        Max.   :31.00   Max.   :4.000  
##   secretarios     comunicaciones        kids       
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 0.000  
##  Median :0.0000   Median :0.0000   Median : 0.000  
##  Mean   :0.1905   Mean   :0.9048   Mean   : 3.429  
##  3rd Qu.:0.0000   3rd Qu.:2.0000   3rd Qu.: 0.000  
##  Max.   :3.0000   Max.   :6.0000   Max.   :25.000

#- consultamos las variables del dataframe
names(medmun_excel2)

## [1] "formador"       "ie"             "delegados"      "presidentes"   
## [5] "secretarios"    "comunicaciones" "kids"

#-----------------------------

#- cargar varios .csv que se encuentran en la carpeta "ds_varios" del directorio de trabajo seleccionado en R

library(purrr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.2.1     ✓ readr   1.3.1
## ✓ tibble  2.1.3     ✓ stringr 1.4.0
## ✓ tidyr   1.0.0     ✓ forcats 0.4.0

## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Read every .csv file found in the "ds_varios" folder (relative to the
# working directory) and stack them row-wise into a single data frame.
# The pattern keeps only .csv files, so any other file sitting in the folder
# is not handed to read.csv().
files <- list.files(
  "ds_varios",
  pattern = "\\.csv$",
  full.names = TRUE,
  ignore.case = TRUE
)
# Each file is parsed as a semicolon-separated table with a header row.
tmp <- lapply(files, read.csv, header = TRUE, sep = ";")
# rbind() on data frames needs every file to share the same column names.
matriz <- do.call(rbind, tmp)
names(matriz)

##  [1] "anio"          "mes"           "dane"          "tipo_servicio"
##  [5] "comuna"        "nucleo"        "nombre"        "pre_jardin"   
##  [9] "jardin"        "transicion"    "primero"       "segundo"      
## [13] "tercero"       "cuarto"        "quinto"        "sexto"        
## [17] "septimo"       "octavo"        "noveno"        "decimo"       
## [21] "once"          "doce"          "trece"         "catorce"      
## [25] "clei0"         "clei1"         "clei2"         "clei3"        
## [29] "clei4"         "clei5"         "clei6"         "aceleracion"

Transformacion de los datos

#- Carga normal de un archivo .csv 
matricula <-read.csv("ds_educacion/matricula_de_educacion_formal.csv", header = TRUE, sep = ";", encoding = "UTF-8")
orangeec <- read.csv("orangeec.csv", header = TRUE, sep = ",", encoding = "UTF-8")
cars <- mtcars

#- Conocer la clase de una variable
class(cars$vs)

## [1] "numeric"

#- transformar la clase de una variable
cars$vs <- as.logical(cars$vs)
cars$am <- as.logical(cars$am)

str(cars)

## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : logi  FALSE FALSE TRUE TRUE FALSE TRUE ...
##  $ am  : logi  TRUE TRUE TRUE FALSE FALSE FALSE ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

#- resumen del dataset
summary(orangeec)

##       Country       GDP.PC       GDP.US.bill         GDP.Growth..  
##  Argentina: 1   Min.   : 5600   Min.   :     13.7   Min.   :0.800  
##  Belize   : 1   1st Qu.: 8300   1st Qu.:     37.1   1st Qu.:2.000  
##  Bolivia  : 1   Median :13300   Median :     75.7   Median :2.800  
##  Brazil   : 1   Mean   :14053   Mean   : 188693.0   Mean   :2.959  
##  Chile    : 1   3rd Qu.:19900   3rd Qu.:    309.2   3rd Qu.:4.200  
##  Colombia : 1   Max.   :25400   Max.   :2055000.0   Max.   :5.400  
##  (Other)  :11                                                      
##  Services...GDP  Creat.Ind...GDP   Inflation       Unemployment   
##  Min.   :50.00   Min.   :1.000   Min.   : 0.400   Min.   : 2.300  
##  1st Qu.:56.90   1st Qu.:2.000   1st Qu.: 1.600   1st Qu.: 5.500  
##  Median :62.20   Median :2.600   Median : 3.400   Median : 6.700  
##  Mean   :62.64   Mean   :3.291   Mean   : 4.365   Mean   : 6.794  
##  3rd Qu.:64.90   3rd Qu.:3.950   3rd Qu.: 4.300   3rd Qu.: 8.100  
##  Max.   :82.00   Max.   :7.400   Max.   :25.700   Max.   :11.800  
##                  NA's   :6                                        
##  X..pop.below.poverty.line Internet.penetration...population   Median.age   
##  Min.   : 4.20             Min.   :38.20                     Min.   :22.10  
##  1st Qu.:21.70             1st Qu.:57.70                     1st Qu.:25.70  
##  Median :25.70             Median :69.70                     Median :28.20  
##  Mean   :27.65             Mean   :68.42                     Mean   :28.28  
##  3rd Qu.:32.70             3rd Qu.:79.90                     3rd Qu.:31.30  
##  Max.   :59.30             Max.   :93.10                     Max.   :35.00  
##                                                                             
##   X..pop.25.54   Education.invest...GDP
##  Min.   :34.12   Min.   :2.800         
##  1st Qu.:39.23   1st Qu.:4.400         
##  Median :40.19   Median :5.000         
##  Mean   :39.88   Mean   :5.082         
##  3rd Qu.:41.08   3rd Qu.:5.900         
##  Max.   :44.03   Max.   :7.400         
##

summary(cars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec           vs         
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Mode :logical  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   FALSE:18       
##  Median :3.695   Median :3.325   Median :17.71   TRUE :14       
##  Mean   :3.597   Mean   :3.217   Mean   :17.85                  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90                  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90                  
##      am               gear            carb      
##  Mode :logical   Min.   :3.000   Min.   :1.000  
##  FALSE:19        1st Qu.:3.000   1st Qu.:2.000  
##  TRUE :13        Median :4.000   Median :2.000  
##                  Mean   :3.688   Mean   :2.812  
##                  3rd Qu.:4.000   3rd Qu.:4.000  
##                  Max.   :5.000   Max.   :8.000

# Convert the weight column of cars from pounds to kilograms: wt is scaled by
# 1000 and divided by 2.204623 (pounds per kilogram). Assumes wt is stored in
# thousands of pounds, as in mtcars -- TODO confirm.
cars.new <- cars
cars.new$wt <- cars.new$wt * 1000 / 2.204623
# Keep working with the converted data under the original name.
cars <- cars.new

Vectores en R

#- para los vectores utilizamos contenedores tipo c

# Numeric vectors with five entries each, one per weekday
# (units are presumably minutes -- confirm).
tiempo_gym <- c(60, 120, 120, 60, 120)
tiempo_lectura <- c(60, 60, 120, 120, 120)
# Vectorized, element-wise sum of both vectors.
tiempo_calidad <- tiempo_lectura + tiempo_gym

#- vector con letras
dias_semana <- c("Lunes", "Martes", "Miercoles", "Jueves", "Viernes")

#- Total tiempo gym y tiempo lectura
tiempo_total_gym <- sum(tiempo_gym)
tiempo_total_gym

## [1] 480

tiempo_total_lectura <- sum(tiempo_lectura)
tiempo_total_lectura

## [1] 480

Matrices o tablas en R

#- si una matriz tiene datos del mismo tipo son matrices como tal y si tiene datos de diferentes tipos son dataframes
#- Sobre matrices es importante tener los conceptos de algebra lineal para entender, por ejemplo, la transpuesta
#- mtcars por ejemplo es un dataset con diferentes tipos de datos. 

#- Construccion de una matriz

# Labels for the two dimensions of the matrix: weekdays as columns and
# activities as rows.
dias <- c("Lunes", "Martes", "Miercoles", "Jueves", "Viernes")
tiempo <- c("Tiempo gym", "Tiempo lecturas")

# Build a 2 x 5 matrix filled row by row (first row gym, second row reading),
# attaching the row and column names in the same call.
tiempo_matrix <- matrix(
  c(tiempo_gym, tiempo_lectura),
  nrow = 2,
  byrow = TRUE,
  dimnames = list(tiempo, dias)
)

tiempo_matrix

##                 Lunes Martes Miercoles Jueves Viernes
## Tiempo gym         60    120       120     60     120
## Tiempo lecturas    60     60       120    120     120

#- Suma de las columnas de la matriz
colSums(tiempo_matrix)

##     Lunes    Martes Miercoles    Jueves   Viernes 
##       120       180       240       180       240

#- Agregar una fila a la matriz

final_matrix <- rbind(tiempo_matrix, c(30,40,30,50,25))
colSums(final_matrix)

##     Lunes    Martes Miercoles    Jueves   Viernes 
##       150       220       270       230       265

# Add a column: one new value per existing row of final_matrix.
# `<-` is used for assignment, following R convention (`=` is reserved for
# function arguments).
Sabado <- c(80, 50, 75)
# cbind() takes the new column's name from the variable name ("Sabado").
final_matrix <- cbind(final_matrix, Sabado)
# Label all three rows, including the one that was appended without a name.
tiempo2 <- c("Tiempo gym", "Tiempo lecturas", "Audiolibros")
rownames(final_matrix) <- tiempo2
final_matrix

##                 Lunes Martes Miercoles Jueves Viernes Sabado
## Tiempo gym         60    120       120     60     120     80
## Tiempo lecturas    60     60       120    120     120     50
## Audiolibros        30     40        30     50      25     75

Operadores para comparar y ubicar datos

#- ==     igual
#- !=     no igual(diferente)
#- <      menor que
#- <=     menor o igual que
#- >      mayor que
#- >=     mayor o igual que
#- |      o
#- &      y
#- !      no
#- %in%   que este en el dataset

#- Ejemplo, con el dataset de mtcars podemos averiguar que carros tienen menos de 6 cilindros

cars[cars$cyl < 6, ]

##                 mpg cyl  disp  hp drat        wt  qsec    vs    am gear carb
## Datsun 710     22.8   4 108.0  93 3.85 1052.3341 18.61  TRUE  TRUE    4    1
## Merc 240D      24.4   4 146.7  62 3.69 1446.9594 20.00  TRUE FALSE    4    2
## Merc 230       22.8   4 140.8  95 3.92 1428.8157 22.90  TRUE FALSE    4    2
## Fiat 128       32.4   4  78.7  66 4.08  997.9030 19.47  TRUE  TRUE    4    1
## Honda Civic    30.4   4  75.7  52 4.93  732.5516 18.52  TRUE  TRUE    4    2
## Toyota Corolla 33.9   4  71.1  65 4.22  832.3419 19.90  TRUE  TRUE    4    1
## Toyota Corona  21.5   4 120.1  97 3.70 1118.1050 20.01  TRUE FALSE    3    1
## Fiat X1-9      27.3   4  79.0  66 4.08  877.7011 18.90  TRUE  TRUE    4    1
## Porsche 914-2  26.0   4 120.3  91 4.43  970.6875 16.70 FALSE  TRUE    5    2
## Lotus Europa   30.4   4  95.1 113 3.77  686.2851 16.90  TRUE  TRUE    5    2
## Volvo 142E     21.4   4 121.0 109 4.11 1260.9866 18.60  TRUE  TRUE    4    2

#- Ejemplo, en el de economia naranja, que paises tienen un PIB per capita mayor o igual a 15000

orangeec[orangeec$GDP.PC >= 15000, ]

##       Country GDP.PC GDP.US.bill GDP.Growth.. Services...GDP Creat.Ind...GDP
## 1   Argentina  20900       637.7          2.9           60.9             3.8
## 4      Brazil  15600   2055000.0          1.0           72.8             2.6
## 5       Chile  24500       277.0          1.5           64.3             2.2
## 7  Costa Rica  16900        58.1          3.2           73.5             2.0
## 12     Mexico  19900   1149000.0          2.0           64.0             7.4
## 14     Panama  25400        61.8          5.4           82.0             6.3
## 17    Uruguay  22400        58.4          3.1           68.8             1.0
##    Inflation Unemployment X..pop.below.poverty.line
## 1       25.7          8.1                      25.7
## 4        3.4         11.8                       4.2
## 5        2.2          7.0                      14.4
## 7        1.6          8.1                      21.7
## 12       6.0          3.6                      46.2
## 14       0.9          5.5                      23.0
## 17       6.2          7.3                       9.7
##    Internet.penetration...population Median.age X..pop.25.54
## 1                               93.1       31.7        39.38
## 4                               70.7       32.0        43.86
## 5                               77.5       34.4        43.08
## 7                               86.7       31.3        44.03
## 12                              65.0       28.3        40.81
## 14                              69.7       29.2        40.35
## 17                              88.2       35.0        39.34
##    Education.invest...GDP
## 1                     5.9
## 4                     5.9
## 5                     4.9
## 7                     7.1
## 12                    5.3
## 14                    3.2
## 17                    4.4

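#- Ejemplo: paises cuyo aporte de la economia naranja al PIB es menor o igual a 2. Ojo: como la variable Creat.Ind...GDP tiene valores NA, el filtro con corchetes devuelve tambien filas llenas de NA.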

orangeec[orangeec$Creat.Ind...GDP <=2,  ]

##         Country GDP.PC GDP.US.bill GDP.Growth.. Services...GDP Creat.Ind...GDP
## NA         <NA>     NA          NA           NA             NA              NA
## NA.1       <NA>     NA          NA           NA             NA              NA
## 7    Costa Rica  16900        58.1          3.2           73.5             2.0
## 8       Ecuador  11500       102.3          2.7           56.9             2.0
## NA.2       <NA>     NA          NA           NA             NA              NA
## NA.3       <NA>     NA          NA           NA             NA              NA
## NA.4       <NA>     NA          NA           NA             NA              NA
## NA.5       <NA>     NA          NA           NA             NA              NA
## 16         Peru  13300       215.2          2.5           56.8             1.5
## 17      Uruguay  22400        58.4          3.1           68.8             1.0
##      Inflation Unemployment X..pop.below.poverty.line
## NA          NA           NA                        NA
## NA.1        NA           NA                        NA
## 7          1.6          8.1                      21.7
## 8          0.4          4.6                      21.5
## NA.2        NA           NA                        NA
## NA.3        NA           NA                        NA
## NA.4        NA           NA                        NA
## NA.5        NA           NA                        NA
## 16         2.8          6.7                      22.7
## 17         6.2          7.3                       9.7
##      Internet.penetration...population Median.age X..pop.25.54
## NA                                  NA         NA           NA
## NA.1                                NA         NA           NA
## 7                                 86.7       31.3        44.03
## 8                                 79.9       27.7        39.59
## NA.2                                NA         NA           NA
## NA.3                                NA         NA           NA
## NA.4                                NA         NA           NA
## NA.5                                NA         NA           NA
## 16                                67.6       28.0        40.19
## 17                                88.2       35.0        39.34
##      Education.invest...GDP
## NA                       NA
## NA.1                     NA
## 7                       7.1
## 8                       5.0
## NA.2                     NA
## NA.3                     NA
## NA.4                     NA
## NA.5                     NA
## 16                      3.8
## 17                      4.4

#- Una mejor forma es utilizar subset que nos permite crear un nuevo dataset con el filtro que queremos aplicarle

neworangeec <- subset(orangeec, Internet.penetration...population >80 & Education.invest...GDP >= 4.5)
neworangeec

##       Country GDP.PC GDP.US.bill GDP.Growth.. Services...GDP Creat.Ind...GDP
## 1   Argentina  20900       637.7          2.9           60.9             3.8
## 7  Costa Rica  16900        58.1          3.2           73.5             2.0
## 15   Paraguay   9800        29.6          4.3           54.5             4.1
##    Inflation Unemployment X..pop.below.poverty.line
## 1       25.7          8.1                      25.7
## 7        1.6          8.1                      21.7
## 15       3.6          6.5                      22.2
##    Internet.penetration...population Median.age X..pop.25.54
## 1                               93.1       31.7        39.38
## 7                               86.7       31.3        44.03
## 15                              89.6       28.2        41.08
##    Education.invest...GDP
## 1                     5.9
## 7                     7.1
## 15                    5.0

#- Podemos agregarle el argumento select para seleccionar columnas (variables), por ejemplo, queremos ver la seleccion anterior desde el aporte de la economia naranja
neworangeec <- subset(orangeec, Internet.penetration...population >80 & Education.invest...GDP >= 4.5, select = Creat.Ind...GDP)
neworangeec

##    Creat.Ind...GDP
## 1              3.8
## 7              2.0
## 15             4.1

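#- Podemos renombrar una variable de nuestro dataset orangeec, para ello debemos tener instalado el paquete plyr. En caso de no tener el paquete instalado solamente corremos en la consola el código install.packages("plyr"). Ojo: cargar plyr despues de dplyr enmascara funciones de dplyr (mutate, rename, summarise, ...); si se necesitan ambos paquetes conviene cargar plyr primero. Ademas, rename devuelve una copia: para conservar el cambio hay que asignar el resultado.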

library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

rename(orangeec, c("Creat.Ind...GDP" = "AporteEcNja"))

##        Country GDP.PC GDP.US.bill GDP.Growth.. Services...GDP AporteEcNja
## 1    Argentina  20900       637.7          2.9           60.9         3.8
## 2       Belize   8300      1854.0          0.8           62.2          NA
## 3      Bolivia   7500        37.1          4.2           50.0          NA
## 4       Brazil  15600   2055000.0          1.0           72.8         2.6
## 5        Chile  24500       277.0          1.5           64.3         2.2
## 6     Colombia  14500       309.2          1.8           61.4         3.3
## 7   Costa Rica  16900        58.1          3.2           73.5         2.0
## 8      Ecuador  11500       102.3          2.7           56.9         2.0
## 9  El Salvador   8900        28.0          2.4           64.9          NA
## 10   Guatemala   8100        75.7          2.8           63.2          NA
## 11    Honduras   5600        22.9          4.8           57.8          NA
## 12      Mexico  19900   1149000.0          2.0           64.0         7.4
## 13   Nicaragua   5800        13.7          4.9           50.8          NA
## 14      Panama  25400        61.8          5.4           82.0         6.3
## 15    Paraguay   9800        29.6          4.3           54.5         4.1
## 16        Peru  13300       215.2          2.5           56.8         1.5
## 17     Uruguay  22400        58.4          3.1           68.8         1.0
##    Inflation Unemployment X..pop.below.poverty.line
## 1       25.7          8.1                      25.7
## 2        1.1         10.1                      41.0
## 3        2.8          4.0                      38.6
## 4        3.4         11.8                       4.2
## 5        2.2          7.0                      14.4
## 6        4.3         10.5                      28.0
## 7        1.6          8.1                      21.7
## 8        0.4          4.6                      21.5
## 9        1.0          7.0                      32.7
## 10       4.4          2.3                      59.3
## 11       3.9          5.9                      29.6
## 12       6.0          3.6                      46.2
## 13       3.9          6.5                      29.6
## 14       0.9          5.5                      23.0
## 15       3.6          6.5                      22.2
## 16       2.8          6.7                      22.7
## 17       6.2          7.3                       9.7
##    Internet.penetration...population Median.age X..pop.25.54
## 1                               93.1       31.7        39.38
## 2                               52.3       22.7        36.62
## 3                               78.6       24.3        37.48
## 4                               70.7       32.0        43.86
## 5                               77.5       34.4        43.08
## 6                               63.2       30.0        41.91
## 7                               86.7       31.3        44.03
## 8                               79.9       27.7        39.59
## 9                               57.7       27.1        39.23
## 10                              42.1       22.1        34.12
## 11                              38.2       23.0        36.63
## 12                              65.0       28.3        40.81
## 13                              43.0       25.7        40.24
## 14                              69.7       29.2        40.35
## 15                              89.6       28.2        41.08
## 16                              67.6       28.0        40.19
## 17                              88.2       35.0        39.34
##    Education.invest...GDP
## 1                     5.9
## 2                     7.4
## 3                     7.3
## 4                     5.9
## 5                     4.9
## 6                     4.5
## 7                     7.1
## 8                     5.0
## 9                     3.5
## 10                    2.8
## 11                    5.9
## 12                    5.3
## 13                    4.5
## 14                    3.2
## 15                    5.0
## 16                    3.8
## 17                    4.4

#Factores

#- Se usa para almacenar palabras, labels o etiquetas

# Store the course levels as a factor rather than a plain character vector.
# The explicit `levels` keeps them in course order instead of the alphabetical
# order factor() would use by default.
nivel_curso <- factor(
  c("Basico", "Intermedio", "Avanzado"),
  levels = c("Basico", "Intermedio", "Avanzado")
)

#- head: es una función que nos retorna los primeros elementos de un dataset, por defecto nos retorna los primeros 6.
#- tail: función similar a head solamente que esta función nos retorna los últimos elementos.
head(cars)

##                    mpg cyl disp  hp drat       wt  qsec    vs    am gear carb
## Mazda RX4         21.0   6  160 110 3.90 1188.412 16.46 FALSE  TRUE    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 1304.078 17.02 FALSE  TRUE    4    4
## Datsun 710        22.8   4  108  93 3.85 1052.334 18.61  TRUE  TRUE    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 1458.299 19.44  TRUE FALSE    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 1560.357 17.02 FALSE FALSE    3    2
## Valiant           18.1   6  225 105 2.76 1569.429 20.22  TRUE FALSE    3    1

tail(cars)

##                 mpg cyl  disp  hp drat        wt qsec    vs   am gear carb
## Porsche 914-2  26.0   4 120.3  91 4.43  970.6875 16.7 FALSE TRUE    5    2
## Lotus Europa   30.4   4  95.1 113 3.77  686.2851 16.9  TRUE TRUE    5    2
## Ford Pantera L 15.8   8 351.0 264 4.22 1437.8876 14.5 FALSE TRUE    5    4
## Ferrari Dino   19.7   6 145.0 175 3.62 1256.4506 15.5 FALSE TRUE    5    6
## Maserati Bora  15.0   8 301.0 335 3.54 1619.3245 14.6 FALSE TRUE    5    8
## Volvo 142E     21.4   4 121.0 109 4.11 1260.9866 18.6  TRUE TRUE    4    2

#- Además de poder visualizar un dataset con str podemos instalar el paquete dplyr: install.packages("dplyr"). Una vez instalado usamos la función glimpse. Pronunciacion: diplaier

library(dplyr)

glimpse(cars)

## Observations: 32
## Variables: 11
## $ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8…
## $ cyl  <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8…
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 1…
## $ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 18…
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92…
## $ wt   <dbl> 1188.4118, 1304.0778, 1052.3341, 1458.2992, 1560.3575, 1569.4293…
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 1…
## $ vs   <lgl> FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, …
## $ am   <lgl> TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3…
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2…

glimpse(orangeec)

## Observations: 17
## Variables: 13
## $ Country                           <fct> Argentina, Belize, Bolivia, Brazil,…
## $ GDP.PC                            <int> 20900, 8300, 7500, 15600, 24500, 14…
## $ GDP.US.bill                       <dbl> 637.7, 1854.0, 37.1, 2055000.0, 277…
## $ GDP.Growth..                      <dbl> 2.9, 0.8, 4.2, 1.0, 1.5, 1.8, 3.2, …
## $ Services...GDP                    <dbl> 60.9, 62.2, 50.0, 72.8, 64.3, 61.4,…
## $ Creat.Ind...GDP                   <dbl> 3.8, NA, NA, 2.6, 2.2, 3.3, 2.0, 2.…
## $ Inflation                         <dbl> 25.7, 1.1, 2.8, 3.4, 2.2, 4.3, 1.6,…
## $ Unemployment                      <dbl> 8.1, 10.1, 4.0, 11.8, 7.0, 10.5, 8.…
## $ X..pop.below.poverty.line         <dbl> 25.7, 41.0, 38.6, 4.2, 14.4, 28.0, …
## $ Internet.penetration...population <dbl> 93.1, 52.3, 78.6, 70.7, 77.5, 63.2,…
## $ Median.age                        <dbl> 31.7, 22.7, 24.3, 32.0, 34.4, 30.0,…
## $ X..pop.25.54                      <dbl> 39.38, 36.62, 37.48, 43.86, 43.08, …
## $ Education.invest...GDP            <dbl> 5.9, 7.4, 7.3, 5.9, 4.9, 4.5, 7.1, …

Listas

#- Hacemos un vector, una matriz y un df y todo eso lo almacenamos en una lista.

# Integer vector holding the numbers 1 through 8.
my_vector <- seq_len(8)
my_vector

## [1] 1 2 3 4 5 6 7 8

# 3 x 3 integer matrix holding 1 through 9, filled column by column
# (the default fill order of matrix()).
my_matrix <- matrix(seq_len(9), nrow = 3)
my_matrix

##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9

my_df <- cars[1:4, ]
my_df

##                 mpg cyl disp  hp drat       wt  qsec    vs    am gear carb
## Mazda RX4      21.0   6  160 110 3.90 1188.412 16.46 FALSE  TRUE    4    4
## Mazda RX4 Wag  21.0   6  160 110 3.90 1304.078 17.02 FALSE  TRUE    4    4
## Datsun 710     22.8   4  108  93 3.85 1052.334 18.61  TRUE  TRUE    4    1
## Hornet 4 Drive 21.4   6  258 110 3.08 1458.299 19.44  TRUE FALSE    3    1

my_list <- list(my_vector, my_matrix, my_df)
my_list

## [[1]]
## [1] 1 2 3 4 5 6 7 8
## 
## [[2]]
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
## 
## [[3]]
##                 mpg cyl disp  hp drat       wt  qsec    vs    am gear carb
## Mazda RX4      21.0   6  160 110 3.90 1188.412 16.46 FALSE  TRUE    4    4
## Mazda RX4 Wag  21.0   6  160 110 3.90 1304.078 17.02 FALSE  TRUE    4    4
## Datsun 710     22.8   4  108  93 3.85 1052.334 18.61  TRUE  TRUE    4    1
## Hornet 4 Drive 21.4   6  258 110 3.08 1458.299 19.44  TRUE FALSE    3    1

Tipos de visualizaciones en EDA

#- Qué es EDA: Exploratory Data Analysis - Analisis exploratorio de Datos: importancia de visualizar los datos antes de enfocarnos en las formulas estadisticas
#- El cuarteto de Anscombe- Anscombe’s quartet- nos dice la importancia de visualizar los datos antes de enfocarnos en las formulas estadisticas

#- las principales visualizaciones en EDA son el HISTOGRAMA, EL GRAFICO DE DISPERSION -Scatterplot, BOX PLOT

#- El histograma nos permite visualizar la distribucion de las frecuencias de una variable, el histograma organiza de menor a mayor, diferente al grafico de barras que organiza en el orden como queramos. 
#- Con el Scatterplot cruzamos variables continuas o datos numericos, y los puntos no los podemos unir como si lo hacemos en graficos de lineas, el scatterplot obedece a un x,y de coordenada, ejemplo podria ser un conjuntos de datos de un supermercado que nos indica el numero de cajas y el tiempo de espera. Ubicamos la variable independiente en el eje X y la variable dependiente en el eje Y, en el ejemplo anterior ubicariamos el numero de cajas en el eje X y el tiempo de espera en el eje Y.

#- el Boxplot nos muestra 5 elementos claves en estadistica descriptiva que son el minimo, el maximo, el primer cuartil, la mediana o segundo cuartil y el tercer cuartil

#- Los 5 puntos clave en estadística descriptiva se pueden visualizar en el box plot:
#- Primer cuartil: es el piso de la caja o línea inferior.
#- Tercer cuartil: es el techo de la caja o línea superior.
#- Mediana: es la línea que se encuentra dentro de la caja.
#- Mínimo: la extensión inferior de la caja.
#- Máximo: la extensión superior de la caja.

#- Ejemplos

# EDA scatterplots on the mtcars-based `cars` data frame.

# Miles per gallon against number of cylinders. The formula + `data =` form
# avoids repeating `cars$`; the title typo "Clindros" is corrected.
plot(mpg ~ cyl, data = cars,
     xlab = "Cilindros", ylab = "Millas por galon",
     main = "Relacion Cilindros y Millas por Galon")

# Miles per gallon against horsepower.
plot(mpg ~ hp, data = cars,
     xlab = "caballos de fuerza", ylab = "Millas por galon",
     main = "Relacion caballos de fuerza y Millas por Galon")

#- Ojo, la correlacion negativa del anterior plot, y en general cualquier correlacion, no implica causalidad

plot(orangeec$Unemployment ~ orangeec$Education.invest...GDP, xlab= "Inversion Educacion (%PIB)", ylab = "Desempleo", main = "Relacion inversion en educacion y desempleo" )

#- Los datos estan muy dispersos

plot(orangeec$GDP.PC ~ orangeec$Creat.Ind...GDP, 
                                              xlab= "Aporte economia naranja al PIB (%)", 
                                              ylab = "PIB per capita", 
                                              main = "Relacion Economia Naranja y PIB per capita")

#- vemos que estan muy dispersos los datos

#- Histograms with qplot
#- NOTE(review): qplot() belongs to ggplot2, which this section only attaches
#- further down -- presumably it was already loaded earlier in the file; confirm.

qplot(
  x = cars$hp,
  geom = "histogram",
  main = "Carros segun caballos de fuerza",
  xlab = "Caballos de fuerza"
)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Ahora vamos a usar el paquete ggplot2 

library(ggplot2)

#- Basic histogram of horsepower (no binwidth given, so the default binning
#- is used).
ggplot(data = cars, mapping = aes(x = hp)) +
  geom_histogram() +
  labs(
    x = "Caballos de fuerza",
    y = "Cantidad de fuerza en carros seleccionados"
  )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#- Histogram with style changes: panel background and grid lines removed.
ggplot(data = cars, mapping = aes(x = hp)) +
  geom_histogram() +
  labs(
    x = "Caballos de fuerza",
    y = "Cantidad de fuerza en carros seleccionados"
  ) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#- Histogram with style changes: wider bars, obtained with a binwidth of 30.
ggplot(data = cars, mapping = aes(x = hp)) +
  geom_histogram(binwidth = 30) +
  labs(
    x = "Caballos de fuerza",
    y = "Cantidad de fuerza en carros seleccionados"
  ) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#- Another way: the data and the aesthetics are given to the layer itself
#- instead of to ggplot().

ggplot() +
  geom_histogram(
    mapping = aes(x = hp),
    data = cars,
    binwidth = 20,
    fill = "blue",
    color = "red"
  ) +
  labs(
    title = "Caballos de fuerza en carros seleccionados",
    x = "Caballos de fuerza",
    y = "Cantidad de fuerza en carros seleccionados"
  ) +
  #- Restrict the x axis to 80-280; observations outside it are dropped with
  #- a warning.
  xlim(80, 280) +
  theme(
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## Warning: Removed 6 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

#- Visualisations with the Orange Economy dataset

#- Distribution of GDP per capita, in bins of width 2000.
ggplot() +
  geom_histogram(
    mapping = aes(x = GDP.PC),
    data = orangeec,
    binwidth = 2000,
    fill = "blue",
    color = "red"
  ) +
  labs(
    title = "PIB per capita en paises LATAM",
    x = "PIB per capita",
    y = "Cantidad de paises"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#- Distribution of the orange-economy contribution to GDP, in bins of width 1.
ggplot() +
  geom_histogram(
    mapping = aes(x = Creat.Ind...GDP),
    data = orangeec,
    binwidth = 1,
    fill = "blue",
    color = "red"
  ) +
  labs(
    title = "Contribucion Economia Naranja al PIB en paises LATAM",
    x = "Aporte Economia Naranja al PIB(%)",
    y = "Cantidad de paises"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## Warning: Removed 6 rows containing non-finite values (stat_bin).

#- Distribution of internet penetration, in bins of width 5.
ggplot() +
  geom_histogram(
    mapping = aes(x = Internet.penetration...population),
    data = orangeec,
    binwidth = 5,
    fill = "red",
    color = "yellow"
  ) +
  labs(
    title = "Penetracion de internet en paises LATAM",
    x = "Penetracion internet(%) poblacion",
    y = "Cantidad de paises"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#- Boxplot

#- Base-graphics boxplot of horsepower.
boxplot(
  x = cars$hp,
  main = "Caballos de fuerza en dataset",
  ylab = "caballos de fuerza"
)

#- Horsepower by number of cylinders. cyl is converted to a factor on the x
#- axis so that one box is drawn per cylinder count.
#- Columns are referenced by bare name inside aes(): ggplot2 looks them up in
#- the data passed to ggplot(), and discourages `cars$cyl` there.
ggplot(cars, aes(x = as.factor(cyl), y = hp, fill = cyl)) +
  geom_boxplot() +
  labs(x = "Cilindros", y = "Caballos de fuerza",
       title = "Caballos de fuerza segun cilindros en cars") +
  theme(legend.position = "none") +
  theme(panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

#- Change the look of the boxes: alpha = 0.6 makes the fill semi-transparent.
#- Columns are referenced by bare name inside aes(): ggplot2 looks them up in
#- the data passed to ggplot(), and discourages `cars$cyl` there.

ggplot(cars, aes(x = as.factor(cyl), y = hp, fill = cyl)) +
  geom_boxplot(alpha = 0.6) +
  labs(x = "Cilindros", y = "Caballos de fuerza",
       title = "Caballos de fuerza segun cilindros en cars") +
  theme(legend.position = "none") +
  theme(panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

#- Miles per gallon by transmission type, using am as it is at this point
#- (before it is relabelled below).
ggplot(data = cars, mapping = aes(x = am, y = mpg, fill = am)) +
  geom_boxplot() +
  labs(
    title = "Millas por galon segun tipo de caja",
    x = "Tipo de caja",
    y = "Millas por galon"
  ) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#- Replace the False/True labels of am with "Automatico"/"Manual" so the plot
#- is easier to read.
#- The levels are taken in order of first appearance in the column, so "Manual"
#- is given to whichever value occurs first.
#- NOTE(review): confirm that the first value in cars$am is the manual one.

cars[["am"]] <- factor(
  cars[["am"]],
  levels = unique(cars[["am"]]),
  labels = c("Manual", "Automatico")
)

#- Plot again, now that am carries the descriptive labels.

ggplot(data = cars, mapping = aes(x = am, y = mpg, fill = am)) +
  geom_boxplot() +
  labs(
    title = "Millas por galon segun tipo de caja",
    x = "Tipo de caja",
    y = "Millas por galon"
  ) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#- Average GDP per capita; it is used below as the threshold that splits the
#- countries into two groups.
#- na.rm = TRUE keeps the average defined if GDP.PC ever contains missing
#- values; without it a single NA would make the mean (and every comparison
#- against it) NA. The result is unchanged when the column has no NA.
promedio_economy <- mean(orangeec$GDP.PC, na.rm = TRUE)
promedio_economy

## [1] 14052.94

#dplyr
library(dplyr)

#- Create a new categorical variable with mutate: each country is tagged
#- according to whether its GDP per capita is below the average computed
#- above or not.
orangeec <- mutate(
  orangeec,
  strong_economy = ifelse(
    GDP.PC < promedio_economy,
    "Por debajo del promedio PIB per capita",
    "Sobre - Arriba del promedio PIB per capita"
  )
)

#- The new categorical variable lets us draw one box per group of countries.

ggplot(
  data = orangeec,
  mapping = aes(x = strong_economy, y = Creat.Ind...GDP, fill = strong_economy)
) +
  geom_boxplot(alpha = 0.4) +
  labs(
    title = "Aporte Economia Naranja en PIB paises LATAM con alto y bajo PIB per capita",
    x = "Tipo de pais",
    y = "Aporte Economia Naranja al PIB"
  ) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

#- Now look at internet penetration in the same two groups of countries.

ggplot(orangeec,
       aes(x = strong_economy, y = Internet.penetration...population,
           fill = strong_economy)) +
  geom_boxplot(alpha = 0.4) +
  labs(x = "Tipo de pais", y = "Penetracion de Internet (%)",
       title = "Penetracion de internet en paises LATAM con alto y bajo PIB per capita") +
  theme(legend.position = "none") +
  theme(panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

#- Scatter plot with ggplot on cars, two variables: horsepower on x, miles per
#- gallon on y.

ggplot(data = cars, mapping = aes(x = hp, y = mpg)) +
  geom_point() +
  labs(
    title = "Relacion caballos de fuerza y millas por galon",
    x = "Caballos de fuerza",
    y = "Millas por Galon"
  ) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#- Weight and power: wt on x, hp on y.

ggplot(data = cars, mapping = aes(x = wt, y = hp)) +
  geom_point() +
  labs(
    title = "Relacion Peso-Potencia",
    x = "Peso",
    y = "Potencia"
  ) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#- Relate four variables in one plot: hp and qsec give the position, am the
#- point colour and cyl the point size.

ggplot(data = cars, mapping = aes(x = hp, y = qsec)) +
  geom_point(mapping = aes(color = am, size = cyl)) +
  labs(
    title = "Relacion Caballos de fuerza entre velocidad segun cilindraje y caja",
    x = "Caballos de fuerza",
    y = "Tiempo en 1/4 de milla"
  )

#- The same idea with the orange economy dataset: position from internet
#- penetration and orange-economy contribution, colour from strong_economy
#- and point size from GDP growth.

ggplot(
  data = orangeec,
  mapping = aes(x = Internet.penetration...population, y = Creat.Ind...GDP)
) +
  geom_point(
    mapping = aes(color = factor(strong_economy), size = GDP.Growth..)
  ) +
  labs(
    title = "Relacion entre penetracion de internet y aporte economia naranja segun economia y crecimiento del PIB",
    x = "Penetracion Internet",
    y = "Aporte Economia Naranja al PIB"
  )

## Warning: Removed 6 rows containing missing values (geom_point).

#- Finally, build the previous plot again and keep it in a variable so that it
#- can be handed to plotly for a better (interactive) visualisation.


my_graph <- ggplot(
  data = orangeec,
  mapping = aes(x = Internet.penetration...population, y = Creat.Ind...GDP)
) +
  geom_point(
    mapping = aes(color = factor(strong_economy), size = GDP.Growth..)
  ) +
  labs(
    title = "Relacion entre penetracion de internet y aporte economia naranja segun economia y crecimiento del PIB",
    x = "Penetracion Internet",
    y = "Aporte Economia Naranja al PIB"
  )

library(plotly)

## 
## Attaching package: 'plotly'

## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

#- Show the static ggplot version first.
my_graph

## Warning: Removed 6 rows containing missing values (geom_point).

#- Convert the ggplot object with ggplotly() and display the result.
p <- plotly::ggplotly(my_graph)
p

Buscando correlaciones con pairs

#- Scatterplot matrix of columns 2 to 6 of cars.
pairs(cars[2:6])

#- Several scatterplots are drawn. Read them as follows: in the first column
#- of panels cyl is on the X axis, and disp, hp, drat and wt are on the Y axis.

Comandos_R_0574

Alejandro Henao Ruiz

12/17/2019

Carga de datos

Transformacion de los datos

Vectores en R

Matrices o tablas en R

Operadores para comparar y ubicar datos

Listas

Tipos de visualizaciones en EDA

Buscando correlaciones con pairs