Grupo de estudio de R

Introducción a R

R es un lenguaje de programación y un entorno de software libre para computación estadística y gráficos, respaldado por la Fundación R para Computación Estadística. El lenguaje R se usa ampliamente entre estadísticos y mineros de datos para desarrollar software estadístico y análisis de datos.

Descarga de R

[https://cloud.r-project.org]

[https://www.rstudio.com/products/rstudio/download/]

Consola de R

R studio

Material original.

Revísenlo:

[https://www.youtube.com/c/RafaGonzalezGouveia/videos]

Medio de contacto:

Podcast de economía:

[https://open.spotify.com/show/5MXu2PgxTyQ5KCgy6XfEUe?si=6bb7mMwzR2Ou_4O3J31-eg] [luisalbertoruedatapia1@gmail.com]

Instalación de paquetes y librerías

#install.packages("readxl")
library(readxl)
library(ggplot2)
#help(ggplot2)

Creación de objetos en R

Objetos y Vectores

Al ser R un lenguaje de programación orientado a objetos, se crearán objetos con el símbolo “<-” (el operador de asignación), que en RStudio puede escribirse de manera rápida con Alt + “-”.

# Unit counts for each item.
tortillas <- 2
arepas <- 3
tomate <- 1

# Weighted total: 3 per tortilla, 5 per arepa and 4 per tomato.
resultado <- sum(c(3, 5, 4) * c(tortillas, arepas, tomate))
resultado

## [1] 25

# class() reports what kind of object a value is.
class(resultado)

## [1] "numeric"

nombre <- "rafa"
class(nombre)

## [1] "character"

lolipop <- FALSE
class(lolipop)

## [1] "logical"

# VECTORS

# A vector is a collection of values; think of it as one column of a table.

pelicula <- paste("pelicula", 1:4)

pelicula

## [1] "pelicula 1" "pelicula 2" "pelicula 3" "pelicula 4"

puntacion <- c(7, 9, 7, 2)

posterior_2005 <- c(FALSE, FALSE, TRUE, TRUE)

# Operations on vectors are applied element by element.

puntacion * 2

## [1] 14 18 14  4

p2 <- as.numeric(1:8)

# length() returns the number of elements in a vector.
length(p2)

## [1] 8

length(puntacion)

## [1] 4

# puntacion has 4 elements and p2 has 8, so puntacion is recycled (used twice)
# to match the longer vector before subtracting.
puntacion - p2

## [1]  6  7  4 -2  2  3  0 -6

# Look up values by position

pelicula[3]

## [1] "pelicula 3"

pelicula[c(1, 4)]

## [1] "pelicula 1" "pelicula 4"

# A comparison yields a logical vector with one TRUE/FALSE per element.
pbaja <- puntacion < 7
pbaja

## [1] FALSE FALSE FALSE  TRUE

# Use the logical vector to keep only the matching films.
pelicula[pbaja]

## [1] "pelicula 4"

Matrices

Una matriz se puede entender como un conjunto de vectores del mismo tipo y de la misma longitud.

warner <- c(20, 21, 16, 17, 17, 22, 17, 18, 19)
d <- c(11, 13, 11, 8, 12, 11, 12, 8, 10)
f <- c(18, 15, 15, 15, 16, 17, 15, 13, 11)

# Check that the three vectors have the same length.
length(warner)

## [1] 9

length(d)

## [1] 9

length(f)

## [1] 9

# Build the matrix with matrix(): the values fill it column by column, so each
# vector ends up as one column.
pelis <- matrix(
  c(warner, d, f),
  nrow = length(warner),
  ncol = 3
)

pelis

##       [,1] [,2] [,3]
##  [1,]   20   11   18
##  [2,]   21   13   15
##  [3,]   16   11   15
##  [4,]   17    8   15
##  [5,]   17   12   16
##  [6,]   22   11   17
##  [7,]   17   12   15
##  [8,]   18    8   13
##  [9,]   19   10   11

# Name the columns with colnames().

colnames(pelis) <- c("warner", "disney", "fox")

# The matrix now prints with column names.
pelis

##       warner disney fox
##  [1,]     20     11  18
##  [2,]     21     13  15
##  [3,]     16     11  15
##  [4,]     17      8  15
##  [5,]     17     12  16
##  [6,]     22     11  17
##  [7,]     17     12  15
##  [8,]     18      8  13
##  [9,]     19     10  11

# Name the rows with rownames(): one label per year, "2001" to "2009".
rownames(pelis) <- as.character(2001:2009)

#Nuestra nueva base ya con nombres totales
pelis

##      warner disney fox
## 2001     20     11  18
## 2002     21     13  15
## 2003     16     11  15
## 2004     17      8  15
## 2005     17     12  16
## 2006     22     11  17
## 2007     17     12  15
## 2008     18      8  13
## 2009     19     10  11

# Operations

# `*` between two matrices multiplies element by element (first value with
# first value, and so on); it is not the matrix product.

pelis * pelis

##      warner disney fox
## 2001    400    121 324
## 2002    441    169 225
## 2003    256    121 225
## 2004    289     64 225
## 2005    289    144 256
## 2006    484    121 289
## 2007    289    144 225
## 2008    324     64 169
## 2009    361    100 121

# Multiplying by a vector is element-wise too: the 9 values of d are matched
# against each 9-row column in turn. A scalar multiplies every cell.

pelis * d

##      warner disney fox
## 2001    220    121 198
## 2002    273    169 195
## 2003    176    121 165
## 2004    136     64 120
## 2005    204    144 192
## 2006    242    121 187
## 2007    204    144 180
## 2008    144     64 104
## 2009    190    100 110

# Look up values in a matrix
# By position: row 3, column 2
pelis[3, 2]

## [1] 11

# By row name and column name
pelis["2003", "disney"]

## [1] 11

# Several cells by position
pelis[3:4, 1:2]

##      warner disney
## 2003     16     11
## 2004     17      8

# Positions and names can be mixed
pelis[3:4, c("disney", "fox")]

##      disney fox
## 2003     11  15
## 2004      8  15

# A whole row: leave the column index empty
pelis[3, ]

## warner disney    fox 
##     16     11     15

Factores

Cuando nuestros vectores no contienen números, sino categorías (por ejemplo, tallas), se les llama factores.

tallas <- c("m", "g", "s", "s", "M", "m")

# Convert the character vector with factor() so it is treated as categories.

plot(factor(tallas))

# Categories can be regrouped while recoding: levels "m" and "M" both receive
# the label "M", so they are merged into a single level.
tallas_recodificado <- factor(
  tallas,
  levels = c("g", "m", "M", "s"),
  labels = c("G", "M", "M", "S")
)
# Plot the recoded variable.
plot(factor(tallas_recodificado))

# ordered = TRUE tells R the levels are ranked, in the order given in `levels`.
tallas_recodificado1 <- factor(
  tallas,
  levels = c("s", "m", "M", "g"),
  labels = c("S", "M", "M", "G"),
  ordered = TRUE
)
# Printing it shows the "less than" relation between the sizes.
tallas_recodificado1

## [1] M G S S M M
## Levels: S < M < G

plot(factor(tallas_recodificado1))

DataFrame

Un DataFrame es una tabla que engloba variables de diferentes tipos y es la base del análisis en R.

# Build the data frame from the three vectors; each becomes a column.
pelis_df <- data.frame(pelicula, puntacion, posterior_2005)
pelis_df

##     pelicula puntacion posterior_2005
## 1 pelicula 1         7          FALSE
## 2 pelicula 2         9          FALSE
## 3 pelicula 3         7           TRUE
## 4 pelicula 4         2           TRUE

# Rename the columns of the data frame.
names(pelis_df) <- c("NOMBRE", "PUNTUACION", "POSTERIOR")


# A single column can be extracted with $.
pelis_df$NOMBRE

## [1] "pelicula 1" "pelicula 2" "pelicula 3" "pelicula 4"

# ------ Sorting the rows by a column, here the score


# order() returns row positions, by default from the lowest to the highest value.
order(pelis_df$PUNTUACION)

## [1] 4 1 3 2

# Positions from the highest to the lowest score.
orden_a <- order(pelis_df$PUNTUACION, decreasing = TRUE)
orden_a

## [1] 2 1 3 4

# Index the rows with those positions to obtain the sorted data frame.
df_ord <- pelis_df[orden_a, ]
df_ord

##       NOMBRE PUNTUACION POSTERIOR
## 2 pelicula 2          9     FALSE
## 1 pelicula 1          7     FALSE
## 3 pelicula 3          7      TRUE
## 4 pelicula 4          2      TRUE

Listas

Las listas son colecciones de objetos de diferentes tipos (por ejemplo, vectores, matrices y DataFrames).

Sirven para guardar y relacionar objetos.

# list() creates the list; naming the arguments names its elements.
lista_curso <- list(vector = pelicula, matriz = pelis)

names(lista_curso)

## [1] "vector" "matriz"

# [[ ]] extracts a single element of the list.
lista_curso[["vector"]]

## [1] "pelicula 1" "pelicula 2" "pelicula 3" "pelicula 4"

# Add [ ] after it to pick a value inside that element.

lista_curso[["vector"]][2]

## [1] "pelicula 2"

# The same works for the matrix: row 2, column 1.
lista_curso[["matriz"]][2, 1]

## [1] 21

# Add an element to the list by assigning to a new name.

lista_curso[["nevo"]] <- pelis_df

#Visualizamos la lista
  
lista_curso

## $vector
## [1] "pelicula 1" "pelicula 2" "pelicula 3" "pelicula 4"
## 
## $matriz
##      warner disney fox
## 2001     20     11  18
## 2002     21     13  15
## 2003     16     11  15
## 2004     17      8  15
## 2005     17     12  16
## 2006     22     11  17
## 2007     17     12  15
## 2008     18      8  13
## 2009     19     10  11
## 
## $nevo
##       NOMBRE PUNTUACION POSTERIOR
## 1 pelicula 1          7     FALSE
## 2 pelicula 2          9     FALSE
## 3 pelicula 3          7      TRUE
## 4 pelicula 4          2      TRUE

#Eliminar objetos de una lista
  
lista_curso[["nevo"]] <- NULL

#Visualizamos la lista
lista_curso

## $vector
## [1] "pelicula 1" "pelicula 2" "pelicula 3" "pelicula 4"
## 
## $matriz
##      warner disney fox
## 2001     20     11  18
## 2002     21     13  15
## 2003     16     11  15
## 2004     17      8  15
## 2005     17     12  16
## 2006     22     11  17
## 2007     17     12  15
## 2008     18      8  13
## 2009     19     10  11

Importación de datos de Excel

XLS

# The readxl package is required; install it once if it is missing.
# install.packages("readxl")

# Load the package.
library(readxl)

# Path of the workbook to import.
# file.choose() (note the parentheses) opens a file-picker dialog and returns
# the chosen path as a string that R can use directly:
# ruta <- file.choose()


ruta <- "C:\\Users\\LUIS\\Downloads\\gapminder_importar_a_r.xlsx"

# List the sheets contained in the workbook.
excel_sheets(ruta)

## [1] "Hoja1" "Hoja2" "Hoja3"

# Import the first sheet (no `sheet` argument is given).
# The results are stored under regular names (hoja_1, hoja_2, hoja_3) instead
# of "1a", "2a", "3a": names starting with a digit can only be used afterwards
# by wrapping them in backticks.
hoja_1 <- read_excel(ruta)

# Import a specific sheet by name.

hoja_2 <- read_excel(ruta, sheet = "Hoja2")


# Import a table that is neither on the first sheet nor at the first row:
# `range` restricts the read to the given cell rectangle.

hoja_3 <- read_excel(ruta, sheet = "Hoja3", range = "C7:F17")



# Data can also be imported from the RStudio menu:
# File > Import Dataset

CSV

library(readr)

# ruta <- file.choose()
ruta <- "C:\\Users\\LUIS\\Downloads\\gapminder.csv"

# read_csv() reads comma-delimited files.
pl <- read_csv(file = ruta)
pl

## # A tibble: 10 x 4
##    pais       anio esperanza_de_vida poblacion
##    <chr>     <dbl>             <dbl>     <dbl>
##  1 Argentina  2007              75.3  40301927
##  2 Brasil     2007              72.4 190010647
##  3 Chile      2007              78.6  16284741
##  4 Colombia   2007              72.9  44227550
##  5 Ecuador    2007              75.0  13755680
##  6 Mexico     2007              76.2 108700891
##  7 Nicaragua  2007              72.9   5675356
##  8 Peru       2007              71.4  28674757
##  9 Uruguay    2007              76.4   3447496
## 10 Venezuela  2007              73.7  26084662

#cuando no tiene titulos

#aa <- read_csv(ruta,
         #col_names = FALSE)


#si queremos agregarle un titulo


#colnames(aa) <- c("pais","año","vida","pob")

#aa

#cuando la separacion esta en puntos y comas


#conpyc <- read_csv2(ruta)
#conpyc


#tambien se puede importar tesde file

Gráficos

[https://www.r-graph-gallery.com/index.html]

year <- sprintf("%02d", 0:4)
disney <- c(11, 13, 11, 8, 12)

# plot() is the most basic function for scatter plots.

plot(
  x = year,
  y = disney,
  pch = 16,
  col = "red",
  main = "disney",
  xlab = "year",
  ylab = "disney",
  panel.first = grid() # background grid lines
)

# Simplest bar chart.
barplot(disney)

# Simplest pie chart.
pie(disney)

# More elaborate charts with ggplot2

library(ggplot2)

a <- data.frame(year, disney)

ggplot(data = a, mapping = aes(x = year, y = disney)) +
  geom_point() +
  labs(title = "Peliculas")

#--------histogramas--------
hist(disney,3)

data("mtcars")
#Listas de coches
head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# Simplest histogram
hist(mtcars$hp)


# `breaks` given as a single number requests that many cells.
hist(mtcars$hp, breaks = 5)

# `breaks` can also be the vector of cut points, built here with seq():
#   first argument: where the cuts start
#   second argument: where they end
#   third argument: the width of each interval

cortes_hp <- seq(from = 50, to = 350, by = 50)

hist(mtcars$hp, breaks = cortes_hp)

# Same histogram with colours, a title and axis labels.
hist(
  mtcars$hp,
  breaks = cortes_hp,
  main = "histograma de autos",
  xlab = "caballos de potencia",
  ylab = "conteo",
  col = "red",
  border = "green"
)

## ggplot2 versions
# Histogram with ggplot2
# (the stray trailing comma in aes(x = hp, ) was removed: it passed an empty
# argument to aes())

ggplot(data = mtcars, mapping = aes(x = hp)) +
  geom_histogram(bins = 6) # bins: how many bars the histogram is divided into

# Histogram of hp coloured by a second variable (vs, converted to a factor).
# position = "identity" draws the groups on top of each other instead of
# stacking them, and alpha makes the bars partly transparent.

ggplot(
  data = mtcars,
  mapping = aes(x = hp, fill = factor(vs))
) +
  geom_histogram(
    bins = 9,
    position = "identity",
    alpha = 0.8
  ) +
  labs(
    title = "Histograma de dos variables",
    subtitle = "ejercicio ggplot2",
    caption = "fuente de datos R",
    fill = "tipo de motor",
    x = "caballos de fuerza",
    y = "conteos"
  )

#Que tipo de formato tiene mtcars
class(mtcars)

## [1] "data.frame"

#Mas graficos de barras

#genera una  tabla con las variables y su conteo
table(mtcars$cyl)

## 
##  4  6  8 
## 11  7 14

# Bar chart of the number of cars per cylinder count, with custom colours,
# title, subtitle and axis labels. The typos in the subtitle ("Subtitutlo")
# and in the x-axis label ("cantidadad") are corrected.
barplot(
  table(mtcars$cyl),
  col = "green",
  border = "red",
  main = "Mi grafico",
  sub = "Subtitulo del grafico",
  xlab = "cantidad de cilindros del motor",
  ylab = "conteo",
  horiz = FALSE # vertical bars
)

# The same kind of bar chart with ggplot2
library(ggplot2)

ggplot(data = mtcars, mapping = aes(x = factor(cyl))) +
  geom_bar() +
  coord_flip() # flips the axes to rotate the chart

# Bars classified by two factors: cylinders on the x axis, gears as fill colour

p <- ggplot(
  data = mtcars,
  mapping = aes(x = factor(cyl), fill = factor(gear))
)

# 1) stacked counts
p + geom_bar(stat = "count", position = "stack")

# 2) one bar per gear value, side by side
p + geom_bar(stat = "count", position = "dodge")

# 3) stacked and scaled to proportions

p + geom_bar(stat = "count", position = "fill")

#Graficas mas complejas


library(ggplot2)

data("iris")

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# geom_point

ggplot(
  iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  geom_smooth() # local regression ("loess"); geom_smooth(method = "lm") gives a linear fit

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Overlaying lines

## Single line: petal width of the setosa rows, in row order.
## seq_along() numbers the rows of whatever subset is plotted, instead of the
## hard-coded 1:50, which only worked for exactly 50 rows.
ggplot(
  data = iris[iris$Species == "setosa", ],
  mapping = aes(x = seq_along(Petal.Width), y = Petal.Width)
) +
  geom_line() +
  labs(x = "observacion")

## Overlay: one line per species.
## ave() numbers the rows within each species, replacing rep(1:50, 3), which
## assumed three species of exactly 50 consecutive rows each.

iris_idx <- transform(
  iris,
  observacion = ave(seq_along(Species), Species, FUN = seq_along)
)

ggplot(
  data = iris_idx,
  mapping = aes(x = observacion, y = Petal.Width, color = Species)
) +
  geom_line()

# boxplot

ggplot(
  iris,
  mapping = aes(x = Species, y = Petal.Width, fill = Species)
) +
  geom_boxplot() +
  geom_jitter() # draws the individual points on top of the boxes

Gráficos dinámicos

library(tidyverse)
library(gganimate)
library(gifski)
library(gapminder)

data("gapminder")

head(gapminder)

## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

# Life expectancy over the years

# Mean life expectancy per year and continent. It is computed once and reused
# by the three charts below instead of repeating the same aggregation each
# time; ungroup() drops the grouping that summarise() leaves on the result.
vida_continente <- gapminder %>%
  group_by(year, continent) %>%
  summarise(mean_life = mean(lifeExp)) %>%
  ungroup()

# Shared base plot: year on x, mean life expectancy on y, one colour per continent.
grafico_vida <- ggplot(
  vida_continente,
  aes(x = year, y = mean_life, color = continent)
)

# Static line chart
grafico_vida +
  geom_line()

# Animated version: transition_reveal(year) comes from gganimate and reveals
# the lines along the year axis.

grafico_vida +
  geom_line() +
  transition_reveal(year)


# Polished version: thicker lines, large points, labels and a minimal theme.

grafico_vida +
  geom_line(size = 2) +
  geom_point(size = 10) +
  labs(
    title = "Esperanza de vida en el tiempo",
    x = "Fecha",
    y = "Años de vida",
    color = "continente"
  ) +
  theme_minimal() +
  transition_reveal(year)

# Another chart: GDP per capita against life expectancy, one panel per continent.
# `year` is an integer column, so it is compared with the number 2007; the
# previous comparison with the string "2007" only worked because R coerced the
# column to character.
gapminder %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  facet_wrap(~continent)

# Data source:
library(gapminder)

# Plotting and animation packages:
library(ggplot2)
library(gganimate)

# Bubble chart of GDP per capita against life expectancy, one panel per
# continent, animated over `year`.
# NOTE(review): country_colors is not defined in this file; presumably it is
# the colour palette shipped with the gapminder package - confirm.
ggplot(
  gapminder,
  aes(x = gdpPercap, y = lifeExp, size = pop, colour = country)
) +
  geom_point(alpha = 0.7, show.legend = FALSE) +
  scale_colour_manual(values = country_colors) +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  facet_wrap(~continent) +
  # gganimate-specific part: {frame_time} in the title shows the current year
  labs(title = 'Year: {frame_time}', x = 'GDP per capita', y = 'life expectancy') +
  transition_time(year) +
  ease_aes('linear')

# anim_save("271-ggplot2-animated-gif-chart-with-gganimate2.gif")





# Get data:
library(gapminder)

head(gapminder)

## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

# Load libraries:
library(ggplot2)
library(gganimate)

# Bubble chart coloured by continent and animated over `year`.
ggplot(
  gapminder,
  aes(x = gdpPercap, y = lifeExp, size = pop, color = continent)
) +
  geom_point() +
  scale_x_log10() +
  theme_bw() +
  # gganimate-specific part: {frame_time} in the title shows the current year
  labs(title = 'Year: {frame_time}', x = 'GDP per capita', y = 'life expectancy') +
  transition_time(year) +
  ease_aes('linear')

# Save as gif:
# anim_save("271-ggplot2-animated-gif-chart-with-gganimate1.gif")

Gráficos interactivos

Lo sentimos, esta sección estará lista en unos días.

Tidyverse

library(tidyverse)
library(gapminder)

data("gapminder")
head(gapminder)

## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

#filtrado normal
filter(gapminder, country=="Mexico")

## # A tibble: 12 x 6
##    country continent  year lifeExp       pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Mexico  Americas   1952    50.8  30144317     3478.
##  2 Mexico  Americas   1957    55.2  35015548     4132.
##  3 Mexico  Americas   1962    58.3  41121485     4582.
##  4 Mexico  Americas   1967    60.1  47995559     5755.
##  5 Mexico  Americas   1972    62.4  55984294     6809.
##  6 Mexico  Americas   1977    65.0  63759976     7675.
##  7 Mexico  Americas   1982    67.4  71640904     9611.
##  8 Mexico  Americas   1987    69.5  80122492     8688.
##  9 Mexico  Americas   1992    71.5  88111030     9472.
## 10 Mexico  Americas   1997    73.7  95895146     9767.
## 11 Mexico  Americas   2002    74.9 102479927    10742.
## 12 Mexico  Americas   2007    76.2 108700891    11978.

#filtrado tidyverse
gapminder %>% 
  filter(country=="Mexico")

## # A tibble: 12 x 6
##    country continent  year lifeExp       pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Mexico  Americas   1952    50.8  30144317     3478.
##  2 Mexico  Americas   1957    55.2  35015548     4132.
##  3 Mexico  Americas   1962    58.3  41121485     4582.
##  4 Mexico  Americas   1967    60.1  47995559     5755.
##  5 Mexico  Americas   1972    62.4  55984294     6809.
##  6 Mexico  Americas   1977    65.0  63759976     7675.
##  7 Mexico  Americas   1982    67.4  71640904     9611.
##  8 Mexico  Americas   1987    69.5  80122492     8688.
##  9 Mexico  Americas   1992    71.5  88111030     9472.
## 10 Mexico  Americas   1997    73.7  95895146     9767.
## 11 Mexico  Americas   2002    74.9 102479927    10742.
## 12 Mexico  Americas   2007    76.2 108700891    11978.

# Rows for a single year. `year` is an integer column, so it is compared with
# the number 2002 rather than the string "2002", which relied on coercion.
gapminder %>%
  filter(year == 2002)

## # A tibble: 142 x 6
##    country     continent  year lifeExp       pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Afghanistan Asia       2002    42.1  25268405      727.
##  2 Albania     Europe     2002    75.7   3508512     4604.
##  3 Algeria     Africa     2002    71.0  31287142     5288.
##  4 Angola      Africa     2002    41.0  10866106     2773.
##  5 Argentina   Americas   2002    74.3  38331121     8798.
##  6 Australia   Oceania    2002    80.4  19546792    30688.
##  7 Austria     Europe     2002    79.0   8148312    32418.
##  8 Bahrain     Asia       2002    74.8    656397    23404.
##  9 Bangladesh  Asia       2002    62.0 135656790     1136.
## 10 Belgium     Europe     2002    78.3  10311970    30486.
## # ... with 132 more rows

# Double filter: conditions separated by commas must all hold.
# `year` is an integer column, so it is compared with the number 2002 rather
# than the string "2002", which relied on coercion.
gapminder %>%
  filter(
    year == 2002,
    lifeExp <= 40
  )

## # A tibble: 2 x 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zambia   Africa     2002    39.2 10595811     1072.
## 2 Zimbabwe Africa     2002    40.0 11926563      672.

##----Resumenes de datos----

#Resumen general
summary(gapminder)

##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
##

#Resumen general 2
library(Hmisc)
a <- describe(gapminder)
class(a)

## [1] "describe"

## gapminder 
## 
##  6  Variables      1704  Observations
## --------------------------------------------------------------------------------
## country 
##        n  missing distinct 
##     1704        0      142 
## 
## lowest : Afghanistan        Albania            Algeria            Angola             Argentina         
## highest: Vietnam            West Bank and Gaza Yemen, Rep.        Zambia             Zimbabwe          
## --------------------------------------------------------------------------------
## continent 
##        n  missing distinct 
##     1704        0        5 
## 
## lowest : Africa   Americas Asia     Europe   Oceania 
## highest: Africa   Americas Asia     Europe   Oceania 
##                                                        
## Value        Africa Americas     Asia   Europe  Oceania
## Frequency       624      300      396      360       24
## Proportion    0.366    0.176    0.232    0.211    0.014
## --------------------------------------------------------------------------------
## year 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1704        0       12    0.993     1980    19.87     1952     1957 
##      .25      .50      .75      .90      .95 
##     1966     1980     1993     2002     2007 
## 
## lowest : 1952 1957 1962 1967 1972, highest: 1987 1992 1997 2002 2007
##                                                                             
## Value       1952  1957  1962  1967  1972  1977  1982  1987  1992  1997  2002
## Frequency    142   142   142   142   142   142   142   142   142   142   142
## Proportion 0.083 0.083 0.083 0.083 0.083 0.083 0.083 0.083 0.083 0.083 0.083
##                 
## Value       2007
## Frequency    142
## Proportion 0.083
## --------------------------------------------------------------------------------
## lifeExp 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1704        0     1626        1    59.47    14.82    38.49    41.51 
##      .25      .50      .75      .90      .95 
##    48.20    60.71    70.85    75.10    77.44 
## 
## lowest : 23.599 28.801 30.000 30.015 30.331, highest: 81.701 81.757 82.000 82.208 82.603
## --------------------------------------------------------------------------------
## pop 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1704        0     1704        1 29601212 46384459   475459   946367 
##      .25      .50      .75      .90      .95 
##  2793664  7023596 19585222 54801370 89822054 
## 
## lowest :      60011      61325      63149      65345      70787
## highest: 1110396331 1164970000 1230075000 1280400000 1318683096
## --------------------------------------------------------------------------------
## gdpPercap 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1704        0     1704        1     7215     8573    548.0    687.7 
##      .25      .50      .75      .90      .95 
##   1202.1   3531.8   9325.5  19449.1  26608.3 
## 
## lowest :    241.1659    277.5519    298.8462    299.8503    312.1884
## highest:  80894.8833  95458.1118 108382.3529 109347.8670 113523.1329
## --------------------------------------------------------------------------------

names(a)

## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"

#Conseguir un dato del resumen

# Number of rows for Asia in 2007; n() counts the rows left after the filter.
# `year` is an integer column, so it is compared with the number 2007 rather
# than the string "2007", which relied on coercion.
gapminder %>%
  filter(
    continent == "Asia",
    year == 2007
  ) %>%
  summarise(conteo = n())

## # A tibble: 1 x 1
##   conteo
##    <int>
## 1     33

#maxima esperanza de vida entre todos los datos

gapminder %>% 
  filter(lifeExp==max(lifeExp))

## # A tibble: 1 x 6
##   country continent  year lifeExp       pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Japan   Asia       2007    82.6 127467972    31656.

max(gapminder$lifeExp)

## [1] 82.603

gapminder %>% 
  summarise(max_lifeExp=max(lifeExp)) #guardamos en la variable max_lifeexp

## # A tibble: 1 x 1
##   max_lifeExp
##         <dbl>
## 1        82.6

#Agrupandopor esperanza de vida promedio

gapminder %>% 
  group_by(year) %>% 
  summarise(prom_vida=mean(lifeExp))

## # A tibble: 12 x 2
##     year prom_vida
##    <int>     <dbl>
##  1  1952      49.1
##  2  1957      51.5
##  3  1962      53.6
##  4  1967      55.7
##  5  1972      57.6
##  6  1977      59.6
##  7  1982      61.5
##  8  1987      63.2
##  9  1992      64.2
## 10  1997      65.0
## 11  2002      65.7
## 12  2007      67.0

#histogramas
hist(gapminder$lifeExp,50) #50 intervalos de clase

head(gapminder)

## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

#bloxplot
boxplot(gapminder$lifeExp)

#correlacion
#pairs()
#cor()

library(corrplot)

Series de tiempo

library(lubridate)

#file.choose()

data <- read_csv("C:\\Users\\LUIS\\Downloads\\2020-02.csv")


head(data)

## # A tibble: 6 x 9
##   Genero_Usuario Edad_Usuario  Bici Ciclo_Estacion_Ret~ Fecha_Retiro Hora_Retiro
##   <chr>                 <dbl> <dbl>               <dbl> <chr>        <time>     
## 1 M                        44  4357                 442 01/02/2020   00'38"     
## 2 M                        22 12083                  66 01/02/2020   00'53"     
## 3 M                        29 11562                 331 01/02/2020   00'55"     
## 4 M                        27 10206                 164 01/02/2020   01'18"     
## 5 M                        27 10101                 120 01/02/2020   01'18"     
## 6 M                        26  8458                  69 01/02/2020   01'30"     
## # ... with 3 more variables: Ciclo_Estacion_Arribo <dbl>, Fecha_Arribo <chr>,
## #   Hora_Arribo <time>

# Trips per hour between 2020-02-24 and 2020-02-27.
# The bounds are built as POSIXct values so that fecha_h (a date-time produced
# by dmy_hms()) is compared with values of the same class; the original
# compared it with Date objects, which depends on how mixed Date/POSIXct
# comparisons are dispatched.
# NOTE(review): tz = "UTC" assumes dmy_hms() is using its default time zone
# here - confirm.
inicio_periodo <- as.POSIXct("2020-02-24", tz = "UTC")
fin_periodo <- as.POSIXct("2020-02-27", tz = "UTC")

viajes_diarios <- data %>%
  # Parse "day/month/year hour:minute:second" into a single date-time column.
  mutate(fecha_h = dmy_hms(paste(Fecha_Retiro, Hora_Retiro))) %>%
  filter(
    fecha_h >= inicio_periodo,
    fecha_h <= fin_periodo
  ) %>%
  # Round each timestamp down to its hour and count the rows per hour.
  group_by(horas = floor_date(fecha_h, unit = "hour")) %>%
  summarise(conteo = n())


head(viajes_diarios)

## # A tibble: 6 x 2
##   horas               conteo
##   <dttm>               <int>
## 1 2020-02-24 00:00:00     34
## 2 2020-02-24 01:00:00      7
## 3 2020-02-24 02:00:00      5
## 4 2020-02-24 05:00:00    145
## 5 2020-02-24 06:00:00    697
## 6 2020-02-24 07:00:00   1809

#Tenemos un hueco en las horas. Tenemos que poner 0 a las horas faltantes

# Build a table with every hour between the first and the last observed hour.
rango_horas <- floor_date(range(viajes_diarios$horas), unit = "hour")

h_compl <- data.frame(
  horas = seq(from = rango_horas[1], to = rango_horas[2], by = "hour")
)

head(h_compl)

##                 horas
## 1 2020-02-24 00:00:00
## 2 2020-02-24 01:00:00
## 3 2020-02-24 02:00:00
## 4 2020-02-24 03:00:00
## 5 2020-02-24 04:00:00
## 6 2020-02-24 05:00:00

# Join the full table of hours with the observed counts. Hours without trips
# have no match in viajes_diarios and come back as NA, which is replaced by 0.
# Compared with the original: the join key is stated explicitly, and the
# redundant group_by() is gone (it duplicated `horas` as `h_red` and left the
# result grouped in one-row groups).
v_horas <- h_compl %>%
  left_join(viajes_diarios, by = "horas") %>%
  mutate(conteo = coalesce(conteo, 0L))


# Line chart of trips per hour

ggplot(
  data = v_horas,
  aes(x = horas, y = conteo)
) +
  geom_line()

### SARIMA model
# First convert the hourly counts to a ts (time series) object.

conteo_ts <- ts(
  data = v_horas$conteo,
  start = 1,
  frequency = 24 # 24 hourly observations per period
)

conteo_ts

## Time Series:
## Start = c(1, 1) 
## End = c(3, 24) 
## Frequency = 24 
##  [1]   34    7    5    0    0  145  697 1809 3158 2251 1155  968 1020 1338 1740
## [16] 1794 1657 2463 3509 2615 1504  986  525  208   57   10    7    0    0  206
## [31]  778 2080 3448 2341 1339 1121 1131 1458 1844 1801 1726 2519 3476 2640 1554
## [46] 1074  547  248   68    6    3    0    0  176  799 2052 3394 2356 1369 1068
## [61] 1147 1502 1901 1795 1724 2489 3520 2572 1515 1026  497  205

##modelo
library(forecast)

ajuste <- auto.arima(y=conteo_ts)

summary(ajuste)

## Series: conteo_ts 
## ARIMA(3,1,0)(1,1,0)[24] 
## 
## Coefficients:
##           ar1      ar2     ar3    sar1
##       -0.3454  -0.2328  0.0394  -0.272
## s.e.   0.1452   0.1491  0.1419   0.194
## 
## sigma^2 estimated as 3136:  log likelihood=-254.81
## AIC=519.62   AICc=521.08   BIC=528.87
## 
## Training set error measures:
##                     ME    RMSE      MAE MPE MAPE      MASE         ACF1
## Training set -1.164243 43.2755 24.45498 NaN  Inf 0.4549763 -0.006269555

# FORECAST
# No horizon is passed, so forecast() falls back to its default number of
# steps ahead.
predicciones <- forecast(object = ajuste)

# Show the leading components of the forecast object.
head(x = predicciones)

## $method
## [1] "ARIMA(3,1,0)(1,1,0)[24]"
## 
## $model
## Series: conteo_ts 
## ARIMA(3,1,0)(1,1,0)[24] 
## 
## Coefficients:
##           ar1      ar2     ar3    sar1
##       -0.3454  -0.2328  0.0394  -0.272
## s.e.   0.1452   0.1491  0.1419   0.194
## 
## sigma^2 estimated as 3136:  log likelihood=-254.81
## AIC=519.62   AICc=521.08   BIC=528.87
## 
## $level
## [1] 80 95
## 
## $mean
## Time Series:
## Start = c(4, 1) 
## End = c(5, 24) 
## Frequency = 24 
##  [1]   33.476772  -28.201866  -29.572155  -33.324826  -33.968099  150.400274
##  [7]  759.619174 2025.842021 3374.937455 2318.189312 1327.092775 1048.670930
## [13] 1108.906721 1456.288184 1851.752164 1762.889039 1690.800679 2463.416697
## [19] 3474.288650 2556.752866 1491.864783 1005.312825  476.856824  182.952804
## [25]    9.123877  -52.642172  -54.455747  -58.003689  -58.471990  123.620165
## [31]  736.587557 1999.213759 3346.379218 2294.730614 1304.748344 1020.185188
## [37] 1085.524888 1434.978611 1831.404396 1737.880002 1666.087678 2436.632121
## [43] 3452.978950 2527.156828 1464.414322  977.196491  448.592521  155.206399
## 
## $lower
## Time Series:
## Start = c(4, 1) 
## End = c(5, 24) 
## Frequency = 24 
##                  80%        95%
## 4.000000  -38.287818  -76.27770
## 4.041667 -113.973714 -159.37859
## 4.083333 -123.723874 -173.56478
## 4.125000 -140.042539 -196.53548
## 4.166667 -150.665702 -212.44168
## 4.208333   25.223989  -41.04033
## 4.250000  625.872777  555.07171
## 4.291667 1884.141144 1809.12923
## 4.333333 3225.829615 3146.89669
## 4.375000 2161.942521 2079.23047
## 4.416667 1164.023651 1077.70007
## 4.458333  879.074313  789.29529
## 4.500000  933.013972  839.90198
## 4.541667 1274.316271 1177.98616
## 4.583333 1663.901036 1564.45866
## 4.625000 1569.335862 1466.87500
## 4.666667 1491.708385 1386.31529
## 4.708333 2258.935797 2150.69015
## 4.750000 3264.557393 3153.53237
## 4.791667 2341.899442 2228.16291
## 4.833333 1272.008573 1155.62373
## 4.875000  780.565146  661.59091
## 4.916667  247.321876  125.81341
## 4.958333  -51.271576 -175.26247
## 5.000000 -245.116344 -379.70299
## 5.041667 -319.422840 -460.64799
## 5.083333 -331.551892 -478.23773
## 5.125000 -347.323744 -500.48053
## 5.166667 -358.919499 -517.96680
## 5.208333 -187.193610 -351.72848
## 5.250000  415.410837  245.39015
## 5.291667 1668.051050 1492.74410
## 5.333333 3005.591314 2825.18909
## 5.375000 1944.535425 1759.15329
## 5.416667  945.394556  755.16415
## 5.458333  651.912455  456.96065
## 5.500000  708.537866  508.97299
## 5.541667 1049.473793  845.39987
## 5.583333 1437.567954 1229.08354
## 5.625000 1335.883767 1123.07981
## 5.666667 1256.093829 1039.05620
## 5.708333 2018.794026 1797.60390
## 5.750000 3027.441088 2802.17495
## 5.791667 2094.056019 1864.78630
## 5.833333 1023.880434  790.67588
## 5.875000  529.352867  292.27877
## 5.916667   -6.443438 -247.32492
## 5.958333 -306.909961 -551.53959
## 
## $upper
## Time Series:
## Start = c(4, 1) 
## End = c(5, 24) 
## Frequency = 24 
##                 80%       95%
## 4.000000  105.24136  143.2312
## 4.041667   57.56998  102.9749
## 4.083333   64.57956  114.4205
## 4.125000   73.39289  129.8858
## 4.166667   82.72951  144.5055
## 4.208333  275.57656  341.8409
## 4.250000  893.36557  964.1666
## 4.291667 2167.54290 2242.5548
## 4.333333 3524.04530 3602.9782
## 4.375000 2474.43610 2557.1482
## 4.416667 1490.16190 1576.4855
## 4.458333 1218.26755 1308.0466
## 4.500000 1284.79947 1377.9115
## 4.541667 1638.26010 1734.5902
## 4.583333 2039.60329 2139.0457
## 4.625000 1956.44222 2058.9031
## 4.666667 1889.89297 1995.2861
## 4.708333 2667.89760 2776.1432
## 4.750000 3684.01991 3795.0449
## 4.791667 2771.60629 2885.3428
## 4.833333 1711.72099 1828.1058
## 4.875000 1230.06050 1349.0347
## 4.916667  706.39177  827.9002
## 4.958333  417.17718  541.1681
## 5.000000  263.36410  397.9507
## 5.041667  214.13850  355.3636
## 5.083333  222.64040  369.3262
## 5.125000  231.31637  384.4732
## 5.166667  241.97552  401.0228
## 5.208333  434.43394  598.9688
## 5.250000 1057.76428 1227.7850
## 5.291667 2330.37647 2505.6834
## 5.333333 3687.16712 3867.5693
## 5.375000 2644.92580 2830.3079
## 5.416667 1664.10213 1854.3325
## 5.458333 1388.45792 1583.4097
## 5.500000 1462.51191 1662.0768
## 5.541667 1820.48343 2024.5574
## 5.583333 2225.24084 2433.7253
## 5.625000 2139.87624 2352.6802
## 5.666667 2076.08153 2293.1192
## 5.708333 2854.47022 3075.6603
## 5.750000 3878.51681 4103.7829
## 5.791667 2960.25764 3189.5274
## 5.833333 1904.94821 2138.1528
## 5.875000 1425.04011 1662.1142
## 5.916667  903.62848 1144.5100
## 5.958333  617.32276  861.9524

# Lowest value across all lower prediction-interval bounds.
min(predicciones$lower)

## [1] -551.5396

# Highest value across all upper prediction-interval bounds.
max(predicciones$upper)

## [1] 4103.783

# Build the forecast plot, keep the plot object, and display it.
p_pre <- autoplot(object = predicciones)
print(p_pre)

# Check the forecast against the real data: rebuild the hourly trip counts
# for the window 2020-02-24 .. 2020-02-29.
viajes_diarios <- data %>%
  # Paste date and time and parse them as "day month year hour minute second".
  mutate(fecha_h = dmy_hms(paste(Fecha_Retiro, Hora_Retiro))) %>%
  # fecha_h is a date-time, so the bounds are written as date-times too.
  # Comparing a POSIXct column against a Date relies on cross-class dispatch
  # that can warn ("Incompatible methods") and compare raw numbers instead.
  # Both bounds are midnight UTC, the time zone dmy_hms() uses by default.
  filter(
    fecha_h >= as.POSIXct("2020-02-24", tz = "UTC"),
    fecha_h <= as.POSIXct("2020-02-29", tz = "UTC")
  ) %>%
  # One row per hour, with the number of trips that started in that hour.
  group_by(horas = floor_date(fecha_h, unit = "hour")) %>%
  summarise(conteo = n())

# The hourly table has gaps: hours without any trip are simply absent, and
# they must later be filled in with 0.

# Build the complete hour-by-hour grid between the first and the last
# observed hour.
h_compl <- data.frame(
  horas = seq(
    from = floor_date(min(viajes_diarios[["horas"]]), unit = "hour"),
    to = floor_date(max(viajes_diarios[["horas"]]), unit = "hour"),
    by = "hour"
  )
)


# Left-join the complete hourly grid with the observed counts and put 0 in
# the hours that had no trips.
# - The join key is spelled out, so the match cannot silently change if the
#   two tables ever come to share another column name.
# - h_red is added with mutate() rather than group_by(): the column is still
#   there, but the result is not left grouped with one group per row.
# - as_tibble() keeps the result a tibble.
v_horas <- h_compl %>%
  as_tibble() %>%
  mutate(h_red = floor_date(horas, unit = "hour")) %>%
  left_join(viajes_diarios, by = "horas") %>%
  mutate(conteo = ifelse(is.na(conteo), 0, conteo))


# Line chart of the observed counts, drawn on the same vertical range as the
# forecast (lowest lower bound to highest upper bound) so that both plots can
# be compared directly. The limits are read from `predicciones` rather than
# typed in by hand, so they stay correct if the model is refitted.
ggplot(
  data = v_horas,
  aes(x = horas, y = conteo)
) +
  geom_line() +
  ylim(
    min(predicciones[["lower"]]),
    max(predicciones[["upper"]])
  ) +
  labs(title = "Realidad")

Modelos y algoritmos.

Regresión

Regresión lineal

library(tidyverse)
# Load the built-in Orange data set and print it in full.
data(Orange)
print(Orange)

## Grouped Data: circumference ~ age | Tree
##    Tree  age circumference
## 1     1  118            30
## 2     1  484            58
## 3     1  664            87
## 4     1 1004           115
## 5     1 1231           120
## 6     1 1372           142
## 7     1 1582           145
## 8     2  118            33
## 9     2  484            69
## 10    2  664           111
## 11    2 1004           156
## 12    2 1231           172
## 13    2 1372           203
## 14    2 1582           203
## 15    3  118            30
## 16    3  484            51
## 17    3  664            75
## 18    3 1004           108
## 19    3 1231           115
## 20    3 1372           139
## 21    3 1582           140
## 22    4  118            32
## 23    4  484            62
## 24    4  664           112
## 25    4 1004           167
## 26    4 1231           179
## 27    4 1372           209
## 28    4 1582           214
## 29    5  118            30
## 30    5  484            49
## 31    5  664            81
## 32    5 1004           125
## 33    5 1231           142
## 34    5 1372           174
## 35    5 1582           177

# Scatter plot of circumference against age, one colour per tree, with a
# hand-picked reference line (intercept 0, slope 0.1) drawn in red.
ggplot(Orange, aes(x = age, y = circumference, color = Tree)) +
  geom_point() +
  geom_abline(slope = 0.1, intercept = 0, col = "red", size = 2)

# Linear regression model: circumference explained by age.
lm(formula = circumference ~ age, data = Orange)

## 
## Call:
## lm(formula = circumference ~ age, data = Orange)
## 
## Coefficients:
## (Intercept)          age  
##     17.3997       0.1068

# Same scatter plot, now with the fitted regression line: the intercept and
# slope are the coefficients printed by lm() above.
ggplot(Orange, aes(x = age, y = circumference, color = Tree)) +
  geom_point() +
  geom_abline(slope = 0.1068, intercept = 17.3997, col = "red", size = 2)

# Add a vertical line at the age of interest (800).
ggplot(Orange, aes(x = age, y = circumference, color = Tree)) +
  geom_point() +
  geom_abline(slope = 0.1068, intercept = 17.3997, col = "red", size = 2) +
  geom_vline(xintercept = 800, col = "blue")

# With the fitted equation known, plug in age = 800 to obtain the expected
# circumference: slope * age + intercept.

dias <- 800
medida <- 0.1068 * dias + 17.3997
medida

## [1] 102.8397

Clasificación

Árboles de decisión

library(titanic)
# Load the Titanic training set and preview its first six rows.
data(titanic_train)
head(titanic_train, n = 6L)

##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q

library(rpart) #para los datos
library(rattle)
library(rpart.plot)


# Fit a classification tree for survival with sex and age as predictors.
arbol <- rpart(Survived ~ Sex + Age, data = titanic_train, method = "class")

# Draw the fitted tree.
fancyRpartPlot(arbol)

# Store the class the tree predicts for the data it was fitted on.
pred_arbol <- predict(arbol, type = "class")

# View() opens a data-viewer window, which only makes sense in an
# interactive session; skip it when the script runs non-interactively
# (for example while the document is being rendered).
if (interactive()) {
  View(pred_arbol)
}

# Combine the original data and the predictions in a single table.
titanic_pred <- cbind(titanic_train, pred_arbol)


# Predict the class for a new passenger: a 4-year-old male.
predict(
  arbol,
  newdata = data.frame(Sex = "male", Age = 4),
  type = "class"
)

## 1 
## 1 
## Levels: 0 1

Conclusión

Una comunidad de R en español para todos.

library(leaflet)
# Map with one coloured circle marker per location, centred on the first
# marker at zoom 3 and drawn on Esri satellite imagery.
# The widget is created exactly once: piping a half-built map into another
# leaflet() call starts a fresh widget and silently throws away the view and
# the tile layer configured before it.
# No default addTiles() layer is added, because it would be painted on top of
# the imagery and hide it.
# No data set is attached: every marker is given literal coordinates.
# NOTE(review): the coordinates appear to be in Mexico, El Salvador, Peru,
# Colombia and Ecuador; confirm against the intended list of countries.
m <- leaflet() %>%
  setView(lng = -99.1269, lat = 19.4978, zoom = 3) %>%
  addProviderTiles("Esri.WorldImagery") %>%
  addCircleMarkers(lng = -99.1269, lat = 19.4978, color = "red") %>%
  addCircleMarkers(lng = -89.19, lat = 13.69, color = "blue") %>%
  addCircleMarkers(lng = -75.015152, lat = -9.1899677, color = "green") %>%
  addCircleMarkers(lng = -74.297333, lat = 4.570868, color = "yellow") %>%
  addCircleMarkers(lng = -78.183406, lat = -1.831239, color = "grey")

m

Grupo de estudio de R