Clase 1 Análisis Exploratorio de Datos (EDA)

Anotaciones

Atajos del teclado

# Comentarios
# ctrl + shift + c   Comentario multilínea
# ctrl + enter Ejecutar código seleccionado
# alt + \
# ctrl + l Limpiar la consola
# ctrl + s Guardar
# alt 126 ~
# alt +91 +93 []
# alt +123  125 {}
# alt +38 &
# alt +124 |

Anotaciones R markdown

# titulo 1
## titulo 2
### titulo 3 etc

# {r echo=FALSE} solo va a imprimir el resultado sin el código

# {r message=FALSE} para instalar bibliotecas no se muestra el mensaje

# {r eval=FALSE} solo el código se va a mostrar

# {r include=FALSE} se ejecuta el código pero no se muestra nada (codigo ni resultado)

#warning=FALSE eliminar alertas

#índice

# output:
#   html_document:
#     toc: true
#     toc_float: true

Bibliografía extra

#otro tipo de puntos
#https://flowingdata.com/2016/09/08/beeswarm-plot-in-r-to-show-distributions/

#simbolos pch
#http://www.sthda.com/english/wiki/r-plot-pch-symbols-the-different-point-shapes-available-in-r

#gráficos de pastel
#https://r-coder.com/grafico-sectores-r/

#graficos densidad
#https://r-charts.com/es/distribucion/grafico-densidad-grupo-ggplot2/

#grafico de boxplot
#https://r-coder.com/boxplot-en-r/

#más graficos
#https://www.rpubs.com/rdelgado/429190

#pingüinos
#https://cran.r-project.org/web/packages/palmerpenguins/readme/README.html

0.-Importar los datos

# Source CSV: Titanic passenger records hosted on GitHub.
myUrl <- "https://raw.githubusercontent.com/armandovl/datasets_uno/main/atitanic.csv"
# Read the remote file through a URL connection into a data frame.
datos <- read.csv(file = url(myUrl))
# Preview the first rows (head() shows six by default).
head(x = datos)

##   PassengerId                                 Name Age Pclass    Sex Family
## 1         891                  Dooley, Mr. Patrick  32      3   male      0
## 2         890                Behr, Mr. Karl Howell  26      1   male      0
## 3         888         Graham, Miss. Margaret Edith  19      1 female      0
## 4         887                Montvila, Rev. Juozas  27      2   male      0
## 5         886 Rice, Mrs. William (Margaret Norton)  39      3 female      5
## 6         885               Sutehall, Mr. Henry Jr  25      3   male      0
##   Embarked Survived
## 1        Q    muere
## 2        C     vive
## 3        S     vive
## 4        S    muere
## 5        Q    muere
## 6        S    muere

1.-Manejo del dataframe

head(datos, n = 8) # first 8 rows

##   PassengerId                                 Name Age Pclass    Sex Family
## 1         891                  Dooley, Mr. Patrick  32      3   male      0
## 2         890                Behr, Mr. Karl Howell  26      1   male      0
## 3         888         Graham, Miss. Margaret Edith  19      1 female      0
## 4         887                Montvila, Rev. Juozas  27      2   male      0
## 5         886 Rice, Mrs. William (Margaret Norton)  39      3 female      5
## 6         885               Sutehall, Mr. Henry Jr  25      3   male      0
## 7         884        Banfield, Mr. Frederick James  28      2   male      0
## 8         883         Dahlberg, Miss. Gerda Ulrika  22      3 female      0
##   Embarked Survived
## 1        Q    muere
## 2        C     vive
## 3        S     vive
## 4        S    muere
## 5        Q    muere
## 6        S    muere
## 7        S    muere
## 8        S    muere

tail(datos, n = 8) # last 8 rows

##     PassengerId                                                Name Age Pclass
## 707           9   Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  27      3
## 708           8                      Palsson, Master. Gosta Leonard   2      3
## 709           7                             McCarthy, Mr. Timothy J  54      1
## 710           5                            Allen, Mr. William Henry  35      3
## 711           4        Futrelle, Mrs. Jacques Heath (Lily May Peel)  35      1
## 712           3                              Heikkinen, Miss. Laina  26      3
## 713           2 Cumings, Mrs. John Bradley (Florence Briggs Thayer)  38      1
## 714           1                             Braund, Mr. Owen Harris  22      3
##        Sex Family Embarked Survived
## 707 female      2        S     vive
## 708   male      4        S    muere
## 709   male      0        S    muere
## 710   male      0        S    muere
## 711 female      1        S     vive
## 712 female      0        S     vive
## 713 female      1        C     vive
## 714   male      1        S    muere

dim(datos) # number of rows and number of columns

## [1] 714   8

names(datos) # column names

## [1] "PassengerId" "Name"        "Age"         "Pclass"      "Sex"        
## [6] "Family"      "Embarked"    "Survived"

str(datos) # structure of the data frame: type and first values of each column

## 'data.frame':    714 obs. of  8 variables:
##  $ PassengerId: int  891 890 888 887 886 885 884 883 882 881 ...
##  $ Name       : chr  "Dooley, Mr. Patrick" "Behr, Mr. Karl Howell" "Graham, Miss. Margaret Edith" "Montvila, Rev. Juozas" ...
##  $ Age        : num  32 26 19 27 39 25 28 22 33 25 ...
##  $ Pclass     : int  3 1 1 2 3 3 2 3 3 2 ...
##  $ Sex        : chr  "male" "male" "female" "male" ...
##  $ Family     : int  0 0 0 0 5 0 0 0 0 1 ...
##  $ Embarked   : chr  "Q" "C" "S" "S" ...
##  $ Survived   : chr  "muere" "vive" "vive" "muere" ...

# Rename only the first three columns, addressing them by position.
colnames(datos)[1:3] <- c("Identificador", "El nombre", "La edad")

# Inspect the structure to confirm the new names.
str(datos)

## 'data.frame':    714 obs. of  8 variables:
##  $ Identificador: int  891 890 888 887 886 885 884 883 882 881 ...
##  $ El nombre    : chr  "Dooley, Mr. Patrick" "Behr, Mr. Karl Howell" "Graham, Miss. Margaret Edith" "Montvila, Rev. Juozas" ...
##  $ La edad      : num  32 26 19 27 39 25 28 22 33 25 ...
##  $ Pclass       : int  3 1 1 2 3 3 2 3 3 2 ...
##  $ Sex          : chr  "male" "male" "female" "male" ...
##  $ Family       : int  0 0 0 0 5 0 0 0 0 1 ...
##  $ Embarked     : chr  "Q" "C" "S" "S" ...
##  $ Survived     : chr  "muere" "vive" "vive" "muere" ...

# Replace every column name at once: one name per column, in column order.
names(datos) <- c(
  "Id", "Nombre", "Edad", "Clase",
  "Sexo", "Familia", "Embarcacion", "Sobrevivientes"
)

str(datos)

## 'data.frame':    714 obs. of  8 variables:
##  $ Id            : int  891 890 888 887 886 885 884 883 882 881 ...
##  $ Nombre        : chr  "Dooley, Mr. Patrick" "Behr, Mr. Karl Howell" "Graham, Miss. Margaret Edith" "Montvila, Rev. Juozas" ...
##  $ Edad          : num  32 26 19 27 39 25 28 22 33 25 ...
##  $ Clase         : int  3 1 1 2 3 3 2 3 3 2 ...
##  $ Sexo          : chr  "male" "male" "female" "male" ...
##  $ Familia       : int  0 0 0 0 5 0 0 0 0 1 ...
##  $ Embarcacion   : chr  "Q" "C" "S" "S" ...
##  $ Sobrevivientes: chr  "muere" "vive" "vive" "muere" ...

# Drop a single column by assigning NULL to it.
# Positional alternative: datos <- datos[, -1]
datos[["Id"]] <- NULL
str(datos)

## 'data.frame':    714 obs. of  7 variables:
##  $ Nombre        : chr  "Dooley, Mr. Patrick" "Behr, Mr. Karl Howell" "Graham, Miss. Margaret Edith" "Montvila, Rev. Juozas" ...
##  $ Edad          : num  32 26 19 27 39 25 28 22 33 25 ...
##  $ Clase         : int  3 1 1 2 3 3 2 3 3 2 ...
##  $ Sexo          : chr  "male" "male" "female" "male" ...
##  $ Familia       : int  0 0 0 0 5 0 0 0 0 1 ...
##  $ Embarcacion   : chr  "Q" "C" "S" "S" ...
##  $ Sobrevivientes: chr  "muere" "vive" "vive" "muere" ...

# Drop several columns (name, family, port of embarkation).
# The columns are selected by name rather than by position, so the step stays
# correct if the column order changes or if it is re-run on a data frame that
# has already been reduced.
datos <- datos[, setdiff(names(datos), c("Nombre", "Familia", "Embarcacion")), drop = FALSE]
str(datos)

## 'data.frame':    714 obs. of  4 variables:
##  $ Edad          : num  32 26 19 27 39 25 28 22 33 25 ...
##  $ Clase         : int  3 1 1 2 3 3 2 3 3 2 ...
##  $ Sexo          : chr  "male" "male" "female" "male" ...
##  $ Sobrevivientes: chr  "muere" "vive" "vive" "muere" ...

# Descriptive statistics for every column of the data frame
summary(datos)

##       Edad           Clase           Sexo           Sobrevivientes    
##  Min.   : 0.42   Min.   :1.000   Length:714         Length:714        
##  1st Qu.:20.12   1st Qu.:1.000   Class :character   Class :character  
##  Median :28.00   Median :2.000   Mode  :character   Mode  :character  
##  Mean   :29.70   Mean   :2.237                                        
##  3rd Qu.:38.00   3rd Qu.:3.000                                        
##  Max.   :80.00   Max.   :3.000

# Treat passenger class as a categorical variable (factor) instead of a number.
datos[["Clase"]] <- factor(datos[["Clase"]])

# Descriptive statistics again, now that Clase is a factor.
summary(datos)

##       Edad       Clase       Sexo           Sobrevivientes    
##  Min.   : 0.42   1:186   Length:714         Length:714        
##  1st Qu.:20.12   2:173   Class :character   Class :character  
##  Median :28.00   3:355   Mode  :character   Mode  :character  
##  Mean   :29.70                                                
##  3rd Qu.:38.00                                                
##  Max.   :80.00

1.1- Algunos filtros

# New data frame with only age and sex.
# At this point the columns are Edad, Clase, Sexo, Sobrevivientes, so age and
# sex are at positions 1 and 3 (positions 3 and 4 would select Sexo and
# Sobrevivientes instead).
Datos8 <- datos[, c(1, 3)]
#View(Datos8)

# Columns can also be selected by name, which also allows reordering them.
Datos9 <- datos[, c("Edad", "Sexo")]
#View(Datos9)

# Drop columns by negative position (removes Sexo and Edad).
Datos10 <- datos[, c(-3, -1)]
#View(Datos10)

# Drop the same columns by name. This starts again from the full data frame:
# on the reduced copy above those columns no longer exist, so assigning NULL
# to them would do nothing.
Datos10 <- datos
Datos10$Edad <- NULL
Datos10$Sexo <- NULL

# Select rows 5 to 9.
Datos11 <- datos[5:9, ]
#View(Datos11)

# Filter rows by condition.
Datos12 <- datos[datos$Edad >= 40, ]                        # age 40 or more
Datos13 <- datos[datos$Edad >= 40 & datos$Sexo == "male", ] # age 40 or more AND male
Datos14 <- datos[datos$Edad >= 50 | datos$Sexo == "male", ] # age 50 or more OR male
#View(Datos14)

2.- Estadísticos descriptivos de una variable

2.1-Medidas de tendencia central

# Central tendency of age: number of observations, mean and median.
total <- length(datos$Edad)
media <- mean(datos$Edad)
mediana <- median(datos$Edad)

# Build a single line of text with the three values and print it.
print(
  paste(
    "total:", total,
    "promedio:", round(media, 2),
    "mediana:", mediana
  )
)

## [1] "total: 714 promedio: 29.7 mediana: 28"

2.2-Medidas de dispersión

# Dispersion of age: variance, standard deviation and coefficient of
# variation (standard deviation as a percentage of the mean).
vari <- var(datos$Edad)
desv <- sd(datos$Edad)
coefiVar <- (desv / mean(datos$Edad)) * 100

# Report the three values at full precision.
sprintf(
  "varianza: %s, desviación típica %s, coeficiente de var: %s ",
  vari, desv, coefiVar
)

## [1] "varianza: 211.019124746308, desviación típica 14.526497332334, coeficiente de var: 48.9122185546567 "

# Same report with every value rounded to two decimals.
sprintf(
  "varianza: %s, desviación típica %s, coeficiente de var: %s ",
  round(vari, digits = 2),
  round(desv, digits = 2),
  round(coefiVar, digits = 2)
)

## [1] "varianza: 211.02, desviación típica 14.53, coeficiente de var: 48.91 "

2.3-Medidas de posición

# First and third quartiles of age.
percentiles <- quantile(datos$Edad, probs = c(0.25, 0.75))
print(round(percentiles, digits = 1))

##  25%  75% 
## 20.1 38.0

2.4-Medidas de forma

# install.packages("moments") para asimetría y curtosis
# library(moments) # skewness() y kurtosis() están en este mismo paquete

# asimetria<-skewness(datos$Edad)
# curtosis<-kurtosis(datos$Edad)-3

3.-Métodos gráficos de exploración de una variable cuantitativa

3.1-Histogramas

# Basic histogram of age; the number of bins follows Sturges' rule.
hist(datos$Edad, col = 5, breaks = "Sturges")

# Different bar colour and a different rule for the bins (Scott).
hist(datos$Edad, col = "green3", breaks = "Scott")

# Add a title and axis labels.
hist(datos$Edad,
  col = 7, breaks = "Scott",
  main = "título",
  xlab = "etiqueta en X",
  ylab = "etiqueta en Y"
)

# Change the axis limits.
hist(datos$Edad,
  col = "lightcyan", breaks = "Scott",
  main = "título",
  xlab = "etiqueta en X",
  ylab = "etiqueta en Y",
  xlim = c(0, 100),
  ylim = c(0, 150)
)

# Ask for a given number of breaks.
# The call no longer ends with a dangling comma, which passed an empty
# argument on to hist().
hist(datos$Edad,
  col = "lightcyan", breaks = 50,
  main = "título",
  xlab = "etiqueta en X",
  ylab = "etiqueta en Y",
  xlim = c(0, 90),
  ylim = c(0, 70)
)

3.2-Gráfico de cajas y bigotes

# Horizontal boxplot of a single variable.
boxplot(datos$Edad, horizontal = TRUE, main = "Boxplot \n una variable ", col = "lightSalmon")

# Histogram above, boxplot below: 2 rows x 1 column of panels.
# par() returns the previous settings, which are restored afterwards so the
# two-panel layout does not leak into the plots that follow.
parAnterior <- par(mfrow = c(2, 1)) # rows, columns
hist(datos$Edad,
  main = "Histograma y Boxplot para una variable",
  #probability = TRUE, # would plot densities instead of counts
  col = "lightSalmon",
  breaks = 50
)
boxplot(datos$Edad, horizontal = TRUE, main = " ", col = "lightSalmon")
par(parAnterior)

# Overlay the individual observations on the boxplot.
boxplot(datos$Edad, horizontal = TRUE)
stripchart(datos$Edad, method = "jitter", pch = 19, add = TRUE, col = "blue")

# Print a marker to the console before the next section.
print("nueva Seccion")

## [1] "nueva Seccion"

4.-Datos cualitativos

# Pie chart of the number of passengers in each class, labelled with the
# factor levels.
tablaClase <- table(datos$Clase)
pie(tablaClase)

# Same chart with descriptive labels, given in the order of the table entries.
labels <- c("primera", "segunda", "tercera")
pie(tablaClase, labels = labels)

### 4.1 Tablas de frecuencia

table(datos$Clase) # number of passengers per class

## 
##   1   2   3 
## 186 173 355

prop.table(table(datos$Clase)) # same counts expressed as proportions

## 
##         1         2         3 
## 0.2605042 0.2422969 0.4971989

# Relative frequency of each class, shown as a percentage.
frecRelativas <- prop.table(table(datos$Clase))
round(frecRelativas, digits = 3) * 100

## 
##    1    2    3 
## 26.1 24.2 49.7

4.2 Diagrama de pareto

# Install the package once if it is not available yet:
#install.packages("qcc")
library(qcc)

# Pareto chart of the class counts. The table expression is passed inline
# because it appears in the printed analysis header.
pareto.chart(table(datos$Clase))

##    
## Pareto chart analysis for table(datos$Clase)
##     Frequency Cum.Freq. Percentage Cum.Percent.
##   3 355.00000 355.00000   49.71989     49.71989
##   1 186.00000 541.00000   26.05042     75.77031
##   2 173.00000 714.00000   24.22969    100.00000

5.- Análisis de datos cualitativos agrupados

5.1-Tablas de frecuencia por grupos (frecuencias absolutas)

# Two-way table of absolute frequencies: class in rows, sex in columns
table(datos$Clase,datos$Sexo)

##    
##     female male
##   1     85  101
##   2     74   99
##   3    102  253

# Mosaic plot of class by sex (plot() on a two-way table draws a mosaic).
paraAreas <- table(datos$Clase, datos$Sexo)
plot(paraAreas)

# Add colour. The mosaic expects one colour per level of the last table
# dimension (here Sexo), not one per data row: a per-row vector would be cut
# to its first entries, making the colours depend on the first passengers.
plot(paraAreas, col = seq_len(ncol(paraAreas)) + 1L)

# Same idea with a custom palette; only as many colours as there are levels
# of Sexo are passed.
colores <- c("#00AFBB", "#E7B800", "#FC4E07")
plot(paraAreas, col = colores[seq_len(ncol(paraAreas))])

# Bar charts of class by sex. The two-way table is computed once and reused.
paraBarras <- table(datos$Clase, datos$Sexo)

# Default: stacked bars, one bar per sex.
barplot(paraBarras)

# One colour per class plus a legend built from the table row names.
barplot(
  paraBarras,
  col = rainbow(3),
  legend.text = rownames(paraBarras)
)

# beside = TRUE draws the classes next to each other instead of stacked.
barplot(
  paraBarras,
  col = rainbow(3),
  legend.text = rownames(paraBarras),
  beside = TRUE
)

# Show the table behind the charts.
paraBarras

##    
##     female male
##   1     85  101
##   2     74   99
##   3    102  253

# Grouped bar chart with title, axis labels and a value printed above each bar.
paraBarras <- table(datos$Clase, datos$Sexo)

# barplot() returns the bar positions; they are kept to place the text later.
grafico1 <- barplot(
  paraBarras,
  beside = TRUE,                      # classes side by side
  col = rainbow(3),                   # one colour per class
  legend.text = rownames(paraBarras), # legend from the row names
  ylim = c(0, 300),                   # room above the tallest bar
  xlab = "Género",
  ylab = "Pasajeros",
  main = "título del Gráfico"
)

# A second, manually specified legend.
legend("topleft", legend = c("Class1", "Class2", "Class3"), fill = rainbow(3))

# Write each count 10 units above the top of its bar.
text(x = grafico1, y = paraBarras + 10, labels = round(paraBarras, 2))

5.2-Tablas de frecuencia por grupos (frecuencias relativas globales)

# Two-way table of global relative frequencies (all cells add up to 100 %).
freRelGlo <- prop.table(table(datos$Clase, datos$Sexo))
round(freRelGlo, digits = 3) * 100

##    
##     female male
##   1   11.9 14.1
##   2   10.4 13.9
##   3   14.3 35.4

# Grouped bar chart of the global relative frequencies, in percent.
freRelGlo <- prop.table(table(datos$Clase, datos$Sexo))
enPorcentaje <- freRelGlo * 100

# barplot() returns the bar positions; they are kept to place the text later.
grafico2 <- barplot(
  enPorcentaje,
  col = colores,                        # palette defined earlier in the script
  legend.text = rownames(enPorcentaje), # legend from the row names
  beside = TRUE,                        # classes side by side
  ylim = c(0, 80),
  xlab = "Género",
  ylab = "Pasajeros (%)",               # the bars show percentages, not counts
  main = "Frecuencias relativas globales"
)

# A second, manually specified legend.
legend("topleft", legend = c("Class1%", "Class2%", "Class3%"), fill = colores)

# Write each percentage 10 units above its bar (positions are only meaningful
# with beside = TRUE).
text(grafico2, enPorcentaje + 10, labels = round(enPorcentaje, 2))

5.3-Tablas de frecuencia por grupos (frecuencias relativas marginales)

# Relative frequencies by row: within each class the sexes add up to 100 %.
freRelGlo <- prop.table(table(datos$Clase, datos$Sexo), margin = 1)
round(freRelGlo, digits = 3) * 100

##    
##     female male
##   1   45.7 54.3
##   2   42.8 57.2
##   3   28.7 71.3

# Grouped bar chart of the row-wise relative frequencies, in percent.
freRelGlo <- prop.table(table(datos$Clase, datos$Sexo), 1)
enPorcentaje <- freRelGlo * 100

# barplot() returns the bar positions; they are kept to place the text later.
grafico2 <- barplot(
  enPorcentaje,
  col = colores,                        # palette defined earlier in the script
  legend.text = rownames(enPorcentaje), # legend from the row names
  beside = TRUE,                        # classes side by side
  ylim = c(0, 120),
  xlab = "Género",
  ylab = "Pasajeros (%)",               # the bars show percentages, not counts
  main = "Frecuencias relativas marginales filas"
)

# A second, manually specified legend.
legend("topleft", legend = c("Class1%", "Class2%", "Class3%"), fill = colores)

# Write each percentage 10 units above its bar (positions are only meaningful
# with beside = TRUE).
text(grafico2, enPorcentaje + 10, labels = round(enPorcentaje, 2))

# Relative frequencies by column: within each sex the classes add up to 100 %.
freRelGlo <- prop.table(table(datos$Clase, datos$Sexo), margin = 2)
round(freRelGlo, digits = 3) * 100

##    
##     female male
##   1   32.6 22.3
##   2   28.4 21.9
##   3   39.1 55.8

# Grouped bar chart of the column-wise relative frequencies, in percent.
freRelGlo <- prop.table(table(datos$Clase, datos$Sexo), 2)
enPorcentaje <- freRelGlo * 100

# barplot() returns the bar positions; they are kept to place the text later.
grafico2 <- barplot(
  enPorcentaje,
  col = colores,                        # palette defined earlier in the script
  legend.text = rownames(enPorcentaje), # legend from the row names
  beside = TRUE,                        # classes side by side
  ylim = c(0, 120),
  xlab = "Género",
  ylab = "Pasajeros (%)",               # the bars show percentages, not counts
  main = "Frecuencias relativas marginales columnas"
)

# A second, manually specified legend.
legend("topleft", legend = c("Class1%", "Class2%", "Class3%"), fill = colores)

# Write each percentage 10 units above its bar (positions are only meaningful
# with beside = TRUE).
text(grafico2, enPorcentaje + 10, labels = round(enPorcentaje, 2))

# Three-way table of absolute frequencies: class x sex, one layer per outcome
table(datos$Clase,datos$Sexo,datos$Sobrevivientes)

## , ,  = muere
## 
##    
##     female male
##   1      3   61
##   2      6   84
##   3     55  215
## 
## , ,  = vive
## 
##    
##     female male
##   1     82   40
##   2     68   15
##   3     47   38

# Structure of the data frame
str(datos)

## 'data.frame':    714 obs. of  4 variables:
##  $ Edad          : num  32 26 19 27 39 25 28 22 33 25 ...
##  $ Clase         : Factor w/ 3 levels "1","2","3": 3 1 1 2 3 3 2 3 3 2 ...
##  $ Sexo          : chr  "male" "male" "female" "male" ...
##  $ Sobrevivientes: chr  "muere" "vive" "vive" "muere" ...

# Split the passengers by outcome.
datosFiltrado <- datos[datos$Sobrevivientes == "vive", ]
datosFiltrado2 <- datos[datos$Sobrevivientes == "muere", ]

# Two panels side by side. par() returns the previous settings, which are
# restored at the end so the layout does not leak into later plots.
parAnterior <- par(mfrow = c(1, 2))

########### First plot: passengers who survived ###########
paraBarras <- table(datosFiltrado$Clase, datosFiltrado$Sexo)
grafico1 <- barplot(
  paraBarras,
  col = colores,   # palette defined earlier in the script
  #legend.text = rownames(paraBarras), # automatic legend, replaced below
  beside = TRUE,   # classes side by side
  ylim = c(0, 300), # same limit in both panels so they are comparable
  xlab = "Género",
  ylab = "Pasajeros",
  main = "pasajeros x clase vivieron "
)

# Manually specified legend.
legend("topleft", legend = c("Class1", "Class2", "Class3"), fill = colores)

# Write each count 10 units above its bar.
text(grafico1, paraBarras + 10, labels = round(paraBarras, 2))


########### Second plot: passengers who died ###########
paraBarras <- table(datosFiltrado2$Clase, datosFiltrado2$Sexo)
grafico1 <- barplot(
  paraBarras,
  col = colores,
  #legend.text = rownames(paraBarras), # automatic legend, replaced below
  beside = TRUE,
  ylim = c(0, 300),
  xlab = "Género",
  ylab = "Pasajeros",
  main = "pasajeros x clase murieron"
)

# Manually specified legend.
legend("topleft", legend = c("Class1", "Class2", "Class3"), fill = colores)

# Write each count 10 units above its bar.
text(grafico1, paraBarras + 10, labels = round(paraBarras, 2))

# Back to the layout that was active before this block.
par(parAnterior)

# Three variables, global relative frequencies: all cells of both layers
# together add up to 100 %.
freRelGlo <- prop.table(
  table(datos$Clase, datos$Sexo, datos$Sobrevivientes)
)
round(freRelGlo, digits = 3) * 100

## , ,  = muere
## 
##    
##     female male
##   1    0.4  8.5
##   2    0.8 11.8
##   3    7.7 30.1
## 
## , ,  = vive
## 
##    
##     female male
##   1   11.5  5.6
##   2    9.5  2.1
##   3    6.6  5.3

# Three variables, relative frequencies by row (class).
# For each class the four cells add up to 100 %, e.g. for class 1:
# female died + male died + female survived + male survived = 100
freRelGlo <- prop.table(
  table(datos$Clase, datos$Sexo, datos$Sobrevivientes),
  margin = 1
)
round(freRelGlo, digits = 3) * 100

## , ,  = muere
## 
##    
##     female male
##   1    1.6 32.8
##   2    3.5 48.6
##   3   15.5 60.6
## 
## , ,  = vive
## 
##    
##     female male
##   1   44.1 21.5
##   2   39.3  8.7
##   3   13.2 10.7

# Three variables, relative frequencies by column (sex).
# For each sex the six cells add up to 100 %, e.g. for women:
# class 1, 2 and 3 survived + class 1, 2 and 3 died = 100
freRelGlo <- prop.table(
  table(datos$Clase, datos$Sexo, datos$Sobrevivientes),
  margin = 2
)
round(freRelGlo, digits = 3) * 100

## , ,  = muere
## 
##    
##     female male
##   1    1.1 13.5
##   2    2.3 18.5
##   3   21.1 47.5
## 
## , ,  = vive
## 
##    
##     female male
##   1   31.4  8.8
##   2   26.1  3.3
##   3   18.0  8.4

6.-Análisis de datos cuantitativos agrupados

6.1-Descriptivos por grupo

# tapply(numeric variable, grouping variable, function):
# summary of age within each passenger class.
tapply(X = datos$Edad, INDEX = datos$Clase, FUN = summary)

## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.92   27.00   37.00   38.23   49.00   80.00 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.67   23.00   29.00   29.88   36.00   70.00 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.42   18.00   24.00   25.14   32.00   74.00

# Mean age within each passenger class.
tapply(X = datos$Edad, INDEX = datos$Clase, FUN = mean)

##        1        2        3 
## 38.23344 29.87763 25.14062

# Median age within each passenger class.
tapply(X = datos$Edad, INDEX = datos$Clase, FUN = median)

##  1  2  3 
## 37 29 24

# Standard deviation of age within each passenger class.
tapply(X = datos$Edad, INDEX = datos$Clase, FUN = sd)

##        1        2        3 
## 14.80286 14.00108 12.49540

# Load ggplot2 (this attaches the package; it does not install it)
library(ggplot2)

6.2-Histogramas por grupo

# Density curve of age, one line per passenger class.
ggplot(datos, aes(x = Edad, colour = factor(Clase))) +
  geom_density()

# Filled density curves.
# The transparency is a fixed value, so it is set on the geom instead of
# being mapped inside aes(), where it would be treated as a data variable.
ggplot(datos, aes(x = Edad,
                  colour = factor(Clase), # outline colour by class
                  fill = factor(Clase)    # fill colour by class
                  )
       ) +
  geom_density(alpha = 0.4)

# Histogram of age by class (stacked bars), with the same fixed transparency.
ggplot(datos, aes(x = Edad,
                  colour = factor(Clase), # outline colour by class
                  fill = factor(Clase)    # fill colour by class
                  )
       ) +
  geom_histogram(alpha = 0.4)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of age by class with the bars of each class side by side.
# position and alpha are settings of the geom, not aesthetics: inside aes()
# they are not applied as intended, so they are passed to geom_histogram().
# Use position = "identity" to overlay the histograms instead.
ggplot(datos, aes(x = Edad,
                  colour = factor(Clase), # outline colour by class
                  fill = factor(Clase)    # fill colour by class
                  )
       ) +
  geom_histogram(alpha = 0.3, position = "dodge")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density of age by sex, one facet per sex laid out as columns.
# Written with ggplot() because qplot() is deprecated in recent ggplot2
# releases.
ggplot(datos, aes(x = Edad, colour = Sexo)) +
  geom_density() +
  facet_grid(. ~ Sexo)

# Same plot with the facets stacked as rows.
ggplot(datos, aes(x = Edad, colour = Sexo)) +
  geom_density() +
  facet_grid(Sexo ~ .)

6.3-Boxplot por grupo

# Base-graphics boxplot of age for each class.
# Equivalent call without a formula: boxplot(datos$Edad ~ datos$Clase)
boxplot(formula = Edad ~ Clase, data = datos)

# Strip chart of the individual ages, one vertical strip per class.
stripchart(
  Edad ~ Clase,
  data = datos,
  vertical = TRUE,
  method = "jitter",
  pch = 19,
  col = c("red", "blue", "green3")
)

# Boxplot with the observations drawn on top of it.
boxplot(formula = Edad ~ Clase, data = datos)
stripchart(
  Edad ~ Clase,
  data = datos,
  vertical = TRUE,
  method = "stack", # alternatives: "overplot", "jitter"
  pch = 19,
  col = c("red", "blue", "green3"),
  add = TRUE        # draw on the existing plot instead of starting a new one
)

# Same idea with ggplot2: age by sex, one facet per outcome.
# The fixed transparency is set on each geom instead of being mapped inside
# aes(), and the object is not called `plot`, which is also the name of the
# base graphics function.
graficoCajas <- ggplot(datos, aes(x = Sexo, y = Edad)) +
  geom_boxplot(aes(group = Sexo, fill = Sexo), alpha = 0.3) +
  geom_jitter(aes(color = Sexo), alpha = 0.3) +
  facet_grid(. ~ Sobrevivientes)

graficoCajas

7.- Análisis de datos cuantitativos

7.1-Correlaciones

# Load the iris CSV (Spanish column names) from GitHub.
# NOTE: the object is called `iris`, the same name as the data set that ships
# with R; from here on `iris` refers to this downloaded version.
miurl <- "https://raw.githubusercontent.com/armandovl/datasets_uno/main/iris.csv"
iris <- read.csv(file = url(miurl))

library(corrplot)

## corrplot 0.92 loaded

Una correlación mide la relación lineal entre dos variables. Por ejemplo, la relación entre la experiencia laboral y el salario.

Esta se mide con un coeficiente que va de -1 a 1.

* r = 1, la relación es positiva perfecta
* 0 < r < 1, la relación es positiva
* r = 0, no hay relación lineal
* -1 < r < 0, la relación es negativa
* r = -1, la relación es negativa perfecta

# Correlation matrix of the first four columns, rounded to three decimals.
round(cor(iris[, 1:4]), digits = 3)

##             long_sepalo anch_sepalo long_petalo anch_petalo
## long_sepalo       1.000      -0.109       0.872       0.818
## anch_sepalo      -0.109       1.000      -0.421      -0.357
## long_petalo       0.872      -0.421       1.000       0.963
## anch_petalo       0.818      -0.357       0.963       1.000

library(Hmisc) # provides rcorr()

rcorr(as.matrix(iris[,1:4])) # correlations together with their p-values

##             long_sepalo anch_sepalo long_petalo anch_petalo
## long_sepalo        1.00       -0.11        0.87        0.82
## anch_sepalo       -0.11        1.00       -0.42       -0.36
## long_petalo        0.87       -0.42        1.00        0.96
## anch_petalo        0.82       -0.36        0.96        1.00
## 
## n= 150 
## 
## 
## P
##             long_sepalo anch_sepalo long_petalo anch_petalo
## long_sepalo             0.1828      0.0000      0.0000     
## anch_sepalo 0.1828                  0.0000      0.0000     
## long_petalo 0.0000      0.0000                  0.0000     
## anch_petalo 0.0000      0.0000      0.0000

# Correlation matrix of the first four columns, rounded to one decimal.
# It holds the coefficients only, not p-values.
correlacion <- round(cor(iris[, 1:4]), 1)

# Draw the upper triangle of the matrix as numbers.
corrplot(correlacion, method = "number", type = "upper")

library(PerformanceAnalytics)

# Correlation chart of the same columns, without histograms.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
chart.Correlation(iris[, 1:4], histogram = FALSE, pch = 19)

### 7.2- Gráfico de pares

# Scatterplot matrix of the four numeric columns.
pairs(iris[, 1:4])

# Different point symbol.
pairs(iris[, 1:4], pch = 10)

# Another symbol, with the lower panels left empty.
pairs(iris[, 1:4], pch = 9, lower.panel = NULL)

# Colour the points by the `clase` column.
pairs(iris[, 1:4], pch = 19, lower.panel = NULL, col = factor(iris$clase))

# Same plot with a custom palette, indexed by the factor.
colores <- c("#00AFBB", "#E7B800", "#FC4E07")
pairs(iris[, 1:4], pch = 19, lower.panel = NULL, col = colores[factor(iris$clase)])

7.3- Gráfico de dispersión Scatterplot

# Scatterplot of petal length against petal width, coloured by `clase`.
plot(long_petalo ~ anch_petalo, data = iris, col = factor(iris$clase), pch = 19)

# Load the penguins CSV from GitHub and inspect its structure.
myurl <- "https://raw.githubusercontent.com/armandovl/datasets_uno/main/penguins1.csv"
ping <- read.csv(file = url(myurl))

str(ping)

## 'data.frame':    344 obs. of  7 variables:
##  $ species          : chr  "Adelie" "Adelie" "Adelie" "Adelie" ...
##  $ sex              : chr  "male" "female" "female" "" ...
##  $ island           : chr  "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
##  $ bill_length_mm   : num  39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num  18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int  181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int  3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...

# Descriptive statistics for every column
summary(ping)

##    species              sex               island          bill_length_mm 
##  Length:344         Length:344         Length:344         Min.   :32.10  
##  Class :character   Class :character   Class :character   1st Qu.:39.23  
##  Mode  :character   Mode  :character   Mode  :character   Median :44.45  
##                                                           Mean   :43.92  
##                                                           3rd Qu.:48.50  
##                                                           Max.   :59.60  
##                                                           NA's   :2      
##  bill_depth_mm   flipper_length_mm  body_mass_g  
##  Min.   :13.10   Min.   :172.0     Min.   :2700  
##  1st Qu.:15.60   1st Qu.:190.0     1st Qu.:3550  
##  Median :17.30   Median :197.0     Median :4050  
##  Mean   :17.15   Mean   :200.9     Mean   :4202  
##  3rd Qu.:18.70   3rd Qu.:213.0     3rd Qu.:4750  
##  Max.   :21.50   Max.   :231.0     Max.   :6300  
##  NA's   :2       NA's   :2         NA's   :2

# First 10 rows
head(ping,10)

##    species    sex    island bill_length_mm bill_depth_mm flipper_length_mm
## 1   Adelie   male Torgersen           39.1          18.7               181
## 2   Adelie female Torgersen           39.5          17.4               186
## 3   Adelie female Torgersen           40.3          18.0               195
## 4   Adelie        Torgersen             NA            NA                NA
## 5   Adelie female Torgersen           36.7          19.3               193
## 6   Adelie   male Torgersen           39.3          20.6               190
## 7   Adelie female Torgersen           38.9          17.8               181
## 8   Adelie   male Torgersen           39.2          19.6               195
## 9   Adelie        Torgersen           34.1          18.1               193
## 10  Adelie        Torgersen           42.0          20.2               190
##    body_mass_g
## 1         3750
## 2         3800
## 3         3250
## 4           NA
## 5         3450
## 6         3650
## 7         3625
## 8         4675
## 9         3475
## 10        4250

library(mice) # used here to inspect missing values

# Table of missing-data patterns for the data frame
md.pattern(ping)

##     species sex island bill_length_mm bill_depth_mm flipper_length_mm
## 342       1   1      1              1             1                 1
## 2         1   1      1              0             0                 0
##           0   0      0              2             2                 2
##     body_mass_g  
## 342           1 0
## 2             0 4
##               2 8

# Three quantitative variables (x, y, point size) and two qualitative ones
# (colour, shape) in a single scatterplot.
# The legend titles are set with labs() instead of editing the label list
# stored inside the plot object, and the object is not called `plot`, which is
# also the name of the base graphics function.
graficoPinguinos <- ggplot(data = ping, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(aes(color = sex, size = body_mass_g, shape = island), alpha = 0.7) +
  xlab("Largo del pico") +
  ylab("Profundidad del pico") +
  ggtitle("Tres variables cuanti, dos cuali") +
  labs(colour = "Sexo", size = "Peso", shape = "Isla") +
  theme_minimal()

graficoPinguinos

## Warning: Removed 2 rows containing missing values (geom_point).

# Bill depth against bill length: points and a smoothed trend coloured by
# island, with one facet row per island and free axis scales.
ggplot(ping, aes(bill_length_mm, bill_depth_mm)) +
  geom_point(aes(colour = island), size = 1, alpha = 0.7) +
  geom_smooth(aes(colour = island)) +
  facet_grid(island ~ ., scales = "free") +
  labs(
    x = "Longitud pico",
    y = "Profundidad pico",
    title = "Dos cuanti, una cualitativa"
  ) +
  theme_minimal()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

# Remove the rows that contain missing values (NA).
ping <- na.omit(object = ping)

# Check the missing-data patterns again.
md.pattern(ping)

##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##     species sex island bill_length_mm bill_depth_mm flipper_length_mm
## 342       1   1      1              1             1                 1
##           0   0      0              0             0                 0
##     body_mass_g  
## 342           1 0
##               0 0

# Descriptive statistics after removing the rows with missing values
summary(ping)

##    species              sex               island          bill_length_mm 
##  Length:342         Length:342         Length:342         Min.   :32.10  
##  Class :character   Class :character   Class :character   1st Qu.:39.23  
##  Mode  :character   Mode  :character   Mode  :character   Median :44.45  
##                                                           Mean   :43.92  
##                                                           3rd Qu.:48.50  
##                                                           Max.   :59.60  
##  bill_depth_mm   flipper_length_mm  body_mass_g  
##  Min.   :13.10   Min.   :172.0     Min.   :2700  
##  1st Qu.:15.60   1st Qu.:190.0     1st Qu.:3550  
##  Median :17.30   Median :197.0     Median :4050  
##  Mean   :17.15   Mean   :200.9     Mean   :4202  
##  3rd Qu.:18.70   3rd Qu.:213.0     3rd Qu.:4750  
##  Max.   :21.50   Max.   :231.0     Max.   :6300

table(ping$sex) # 9 rows have an empty string as sex

## 
##        female   male 
##      9    165    168

# There are still 9 rows with an empty string as sex
sum(ping$sex=="")

## [1] 9

# Keep only the rows whose sex is not an empty string.
ping <- ping[ping$sex != "", ]

# Count the rows with an empty sex again; none should be left.
sum(ping$sex == "")

## [1] 0

# Structure of the cleaned data frame
str(ping)

## 'data.frame':    333 obs. of  7 variables:
##  $ species          : chr  "Adelie" "Adelie" "Adelie" "Adelie" ...
##  $ sex              : chr  "male" "female" "female" "female" ...
##  $ island           : chr  "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
##  $ bill_length_mm   : num  39.1 39.5 40.3 36.7 39.3 38.9 39.2 41.1 38.6 34.6 ...
##  $ bill_depth_mm    : num  18.7 17.4 18 19.3 20.6 17.8 19.6 17.6 21.2 21.1 ...
##  $ flipper_length_mm: int  181 186 195 193 190 181 195 182 191 198 ...
##  $ body_mass_g      : int  3750 3800 3250 3450 3650 3625 4675 3200 3800 4400 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2] 4 272
##   ..- attr(*, "names")= chr [1:2] "4" "272"

# Convert the categorical columns to factors.
# Variant that selects the columns by position:
# columnasFactor <- c(1:3, 5)
# mydata[, columnasFactor] <- lapply(mydata[, columnasFactor], factor)

# The vector of column names gets its own name rather than `names`, so it
# does not reuse the name of the base function names().
columnasFactor <- c("species", "island", "sex")
ping[, columnasFactor] <- lapply(ping[, columnasFactor], factor)
str(ping)

## 'data.frame':    333 obs. of  7 variables:
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 1 2 1 2 1 2 2 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num  39.1 39.5 40.3 36.7 39.3 38.9 39.2 41.1 38.6 34.6 ...
##  $ bill_depth_mm    : num  18.7 17.4 18 19.3 20.6 17.8 19.6 17.6 21.2 21.1 ...
##  $ flipper_length_mm: int  181 186 195 193 190 181 195 182 191 198 ...
##  $ body_mass_g      : int  3750 3800 3250 3450 3650 3625 4675 3200 3800 4400 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2] 4 272
##   ..- attr(*, "names")= chr [1:2] "4" "272"

# Counts per sex after the cleaning and the conversion to factor
table(ping$sex)

## 
## female   male 
##    165    168

# Bill depth against bill length: points and a smoothed trend coloured by
# island, faceted by island (rows) and sex (columns) with free axis scales.
ggplot(ping, aes(bill_length_mm, bill_depth_mm)) +
  geom_point(aes(colour = island), size = 1, alpha = 0.7) +
  geom_smooth(aes(colour = island)) +
  facet_grid(island ~ sex, scales = "free") +
  labs(
    x = "longitud pico",
    y = "profundidad del pico",
    title = "dos cuantitativas, dos cualitativas"
  ) +
  theme_minimal()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Próximos

Clase 2 dplyr
Clase 3 programación en R
econometría en R
ggplot2