Objetivo: Analizar datos de salarios usando funciones de agrupación d edatos usando libreria dplyr
Las librerías
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(stats) # Para tener la misma semilla cuando la generemos
Cargamos los datos de salarios
Los datos
# Cargar datos de salarios
# salarios <- read.csv("Va la ruta en donde estan los datos")
salarios <- read.csv("https://raw.githubusercontent.com/rpizarrog/Curso-Titulacion-Data-Science-/master/2019/Datos/Salaries.csv")
# salarios # Ya no los queremos ver
str(salarios)
## 'data.frame': 148654 obs. of 13 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ EmployeeName : Factor w/ 110810 levels "A Bernard Fatooh",..: 77636 34712 1560 17232 81101 23164 3271 22709 73975 47938 ...
## $ JobTitle : Factor w/ 2159 levels "Account Clerk",..: 836 298 298 2149 594 135 246 609 246 370 ...
## $ BasePay : num 167411 155966 212739 77916 134402 ...
## $ OvertimePay : num 0 245132 106088 56121 9737 ...
## $ OtherPay : num 400184 137811 16453 198307 182235 ...
## $ Benefits : num NA NA NA NA NA NA NA NA NA NA ...
## $ TotalPay : num 567595 538909 335280 332344 326373 ...
## $ TotalPayBenefits: num 567595 538909 335280 332344 326373 ...
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Notes : logi NA NA NA NA NA NA ...
## $ Agency : Factor w/ 1 level "San Francisco": 1 1 1 1 1 1 1 1 1 1 ...
## $ Status : Factor w/ 3 levels "","FT","PT": 1 1 1 1 1 1 1 1 1 1 ...
summary(salarios)
## Id EmployeeName
## Min. : 1 Kevin Lee : 13
## 1st Qu.: 37164 Richard Lee : 11
## Median : 74328 Steven Lee : 11
## Mean : 74328 William Wong: 11
## 3rd Qu.:111491 John Chan : 9
## Max. :148654 KEVIN LEE : 9
## (Other) :148590
## JobTitle BasePay
## Transit Operator : 7036 Min. : -166
## Special Nurse : 4389 1st Qu.: 33588
## Registered Nurse : 3736 Median : 65007
## Public Svc Aide-Public Works: 2518 Mean : 66325
## Police Officer 3 : 2421 3rd Qu.: 94691
## Custodian : 2418 Max. :319275
## (Other) :126136 NA's :609
## OvertimePay OtherPay Benefits
## Min. : -0.01 Min. : -7058.6 Min. : -33.89
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:11535.40
## Median : 0.00 Median : 811.3 Median :28628.62
## Mean : 5066.06 Mean : 3648.8 Mean :25007.89
## 3rd Qu.: 4658.18 3rd Qu.: 4236.1 3rd Qu.:35566.86
## Max. :245131.88 Max. :400184.2 Max. :96570.66
## NA's :4 NA's :4 NA's :36163
## TotalPay TotalPayBenefits Year Notes
## Min. : -618.1 Min. : -618.1 Min. :2011 Mode:logical
## 1st Qu.: 36169.0 1st Qu.: 44065.7 1st Qu.:2012 NA's:148654
## Median : 71426.6 Median : 92404.1 Median :2013
## Mean : 74768.3 Mean : 93692.6 Mean :2013
## 3rd Qu.:105839.1 3rd Qu.:132876.5 3rd Qu.:2014
## Max. :567595.4 Max. :567595.4 Max. :2014
##
## Agency Status
## San Francisco:148654 :110535
## FT: 22334
## PT: 15785
##
##
##
##
head(salarios) # Los primeros seis registros
## Id EmployeeName JobTitle
## 1 1 NATHANIEL FORD GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY
## 2 2 GARY JIMENEZ CAPTAIN III (POLICE DEPARTMENT)
## 3 3 ALBERT PARDINI CAPTAIN III (POLICE DEPARTMENT)
## 4 4 CHRISTOPHER CHONG WIRE ROPE CABLE MAINTENANCE MECHANIC
## 5 5 PATRICK GARDNER DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)
## 6 6 DAVID SULLIVAN ASSISTANT DEPUTY CHIEF II
## BasePay OvertimePay OtherPay Benefits TotalPay TotalPayBenefits Year
## 1 167411.2 0.00 400184.2 NA 567595.4 567595.4 2011
## 2 155966.0 245131.88 137811.4 NA 538909.3 538909.3 2011
## 3 212739.1 106088.18 16452.6 NA 335279.9 335279.9 2011
## 4 77916.0 56120.71 198306.9 NA 332343.6 332343.6 2011
## 5 134401.6 9737.00 182234.6 NA 326373.2 326373.2 2011
## 6 118602.0 8601.00 189082.7 NA 316285.7 316285.7 2011
## Notes Agency Status
## 1 NA San Francisco
## 2 NA San Francisco
## 3 NA San Francisco
## 4 NA San Francisco
## 5 NA San Francisco
## 6 NA San Francisco
Agregamos la columna paisOrigen a un conjunto de datos de salarios de empleados pero unicamente del año 2014
Las nacionalidades
set.seed(1000)
nacionalidades <- c("MEXICO", "ESTADOS UNIDOS", "INGLATERRA", "JAPON", "CHINA", "ARGENTINA",
"BRASIL", "CANADA", "CHILE",
"HOLANDA", "FINLANDIA", "EGIPTO", "SUECIA", "AUSTRALIA", "FRANCIA", "ITALIA", "ESPAÑA", "CUBA", "COLOMBIA",
"IRLADA", "NORUEGA", "COREA DEL NORTE", "SINGAPUR",
"INDIA")
nacionalidades
## [1] "MEXICO" "ESTADOS UNIDOS" "INGLATERRA"
## [4] "JAPON" "CHINA" "ARGENTINA"
## [7] "BRASIL" "CANADA" "CHILE"
## [10] "HOLANDA" "FINLANDIA" "EGIPTO"
## [13] "SUECIA" "AUSTRALIA" "FRANCIA"
## [16] "ITALIA" "ESPAÑA" "CUBA"
## [19] "COLOMBIA" "IRLADA" "NORUEGA"
## [22] "COREA DEL NORTE" "SINGAPUR" "INDIA"
# Mediante mutate generar una nueva columna a salarios
# llamada paisOrigen que sea un valor aleatorios de las
# nacionalidades
#
# Primero vamos a generar todas las nacionalidades
n <- nrow(salarios) # Número de observaciones total de salrios
paises <- rep(nacionalidades, n*2) # Genera un vector muy grande
head(paises) ; tail(paises)
## [1] "MEXICO" "ESTADOS UNIDOS" "INGLATERRA" "JAPON"
## [5] "CHINA" "ARGENTINA"
## [1] "COLOMBIA" "IRLADA" "NORUEGA" "COREA DEL NORTE"
## [5] "SINGAPUR" "INDIA"
Generamos un conjunto de datos solo del año 2014 de los empleados
salarios2014 <- filter(salarios, Year == 2014)
n <- nrow(salarios2014) # Número de observaciones de salaros2014
# Ahora mediante función mutate() agregamos la columna a salarios2014
paises <- sample(paises, n) # Ahora una muestra aletoria de todos los paises
salarios2014 <- mutate(salarios2014, paisOrigen = paises)
head(salarios2014) # Los primeros seis
## Id EmployeeName JobTitle BasePay
## 1 110532 David Shinn Deputy Chief 3 129150.0
## 2 110533 Amy P Hart Asst Med Examiner 318835.5
## 3 110534 William J Coaker Jr. Chief Investment Officer 257340.0
## 4 110535 Gregory P Suhr Chief of Police 307450.0
## 5 110536 Joanne M Hayes-White Chief, Fire Department 302068.0
## 6 110537 Ellen G Moffatt Asst Med Examiner 270222.0
## OvertimePay OtherPay Benefits TotalPay TotalPayBenefits Year Notes
## 1 0.00 342802.63 38780.04 471952.6 510732.7 2014 NA
## 2 10712.95 60563.54 89540.23 390112.0 479652.2 2014 NA
## 3 0.00 82313.70 96570.66 339653.7 436224.4 2014 NA
## 4 0.00 19266.72 91302.46 326716.8 418019.2 2014 NA
## 5 0.00 24165.44 91201.66 326233.4 417435.1 2014 NA
## 6 6009.22 67956.20 71580.48 344187.5 415767.9 2014 NA
## Agency Status paisOrigen
## 1 San Francisco PT INDIA
## 2 San Francisco FT INGLATERRA
## 3 San Francisco PT CHINA
## 4 San Francisco FT SINGAPUR
## 5 San Francisco FT EGIPTO
## 6 San Francisco FT MEXICO
tail(salarios2014) # Los últimos seis
## Id EmployeeName JobTitle BasePay OvertimePay
## 38118 148649 Leon Walker Custodian 0 0
## 38119 148650 Roy I Tillery Custodian 0 0
## 38120 148651 NaN NaN NaN NaN
## 38121 148652 NaN NaN NaN NaN
## 38122 148653 NaN NaN NaN NaN
## 38123 148654 Joe Lopez Counselor, Log Cabin Ranch 0 0
## OtherPay Benefits TotalPay TotalPayBenefits Year Notes Agency
## 38118 0.00 0 0.00 0.00 2014 NA San Francisco
## 38119 0.00 0 0.00 0.00 2014 NA San Francisco
## 38120 NaN NaN 0.00 0.00 2014 NA San Francisco
## 38121 NaN NaN 0.00 0.00 2014 NA San Francisco
## 38122 NaN NaN 0.00 0.00 2014 NA San Francisco
## 38123 -618.13 0 -618.13 -618.13 2014 NA San Francisco
## Status paisOrigen
## 38118 PT EGIPTO
## 38119 PT MEXICO
## 38120 CHILE
## 38121 ARGENTINA
## 38122 COLOMBIA
## 38123 PT NORUEGA
Análisis de datos haciendo agrupaciones mediante summarize()
Ordenando resumen
resumen <- arrange(resumen, desc(media))
class(resumen) # de que tipo de datos estamos hablando
## [1] "tbl_df" "tbl" "data.frame"
resumen <- as.data.frame(resumen)
class(resumen) # De que tipo de dato queda ????
## [1] "data.frame"
resumen
## paisOrigen media desvstd maximo minimo primerCuartil
## 1 CUBA 103454.77 65027.81 355299.8 59.03 56271.32
## 2 FINLANDIA 103055.94 68370.98 374598.0 41.37 42862.61
## 3 SUECIA 102990.60 65417.86 374083.5 0.00 53190.58
## 4 INGLATERRA 102592.22 67756.96 479652.2 0.00 46969.41
## 5 FRANCIA 101726.33 64910.91 370464.4 0.00 51240.92
## 6 CHINA 101605.28 67188.78 436224.4 0.00 44544.44
## 7 ITALIA 101426.35 67554.96 401070.9 0.00 46528.28
## 8 ESTADOS UNIDOS 101306.49 67972.94 379918.7 0.00 46778.53
## 9 EGIPTO 100768.73 65468.03 417435.1 0.00 47574.04
## 10 INDIA 100736.52 65916.01 510732.7 0.00 46761.54
## 11 BRASIL 100639.31 63867.36 324556.2 73.81 49163.36
## 12 SINGAPUR 100517.75 67599.48 418019.2 23.24 42871.45
## 13 NORUEGA 100373.09 66939.77 365561.1 -618.13 44295.52
## 14 ARGENTINA 100267.84 67014.80 365782.4 0.00 42109.39
## 15 COLOMBIA 99472.00 66242.28 353797.1 0.00 41235.53
## 16 CHILE 99403.76 66259.56 314018.2 0.00 42032.50
## 17 JAPON 99308.69 67268.64 393990.1 32.07 39874.10
## 18 HOLANDA 99259.74 66484.18 341838.5 12.89 40751.16
## 19 ESPAÑA 99221.62 65061.57 291705.6 22.56 40780.07
## 20 IRLADA 98966.92 64959.91 349527.8 15.35 42077.68
## 21 MEXICO 98256.60 66270.33 415767.9 0.00 38566.68
## 22 COREA DEL NORTE 98112.57 65849.06 311072.0 0.00 37075.96
## 23 CANADA 97084.02 66299.20 365548.3 0.00 37788.07
## 24 AUSTRALIA 95295.74 63695.92 322313.1 0.00 38541.38
## segundoCuartil tercerCuartil registros
## 1 103333.54 142567.7 1600
## 2 103769.64 150181.8 1527
## 3 103677.36 145612.1 1564
## 4 101674.46 147601.2 1609
## 5 101481.97 143609.9 1625
## 6 102579.69 144485.2 1629
## 7 100906.66 144549.8 1605
## 8 99048.28 145373.1 1637
## 9 101130.35 142550.0 1620
## 10 101980.83 140874.2 1570
## 11 101311.45 140651.0 1635
## 12 100415.73 143747.8 1521
## 13 100726.04 141782.7 1565
## 14 100548.75 143029.5 1619
## 15 101027.92 143061.8 1613
## 16 101723.58 141952.3 1553
## 17 98592.64 142743.5 1612
## 18 100590.63 145440.1 1581
## 19 101250.31 140421.6 1574
## 20 101329.24 142127.0 1609
## 21 99193.77 140299.4 1504
## 22 100818.48 139801.5 1621
## 23 97500.07 138979.0 1554
## 24 98578.74 136129.1 1576
Visualizando resumen
# Vamos a visualizar resumen
pie(head(resumen$media)) # Muy feo

# Vamos a poner los tres mejores y los tres peores
mejores <- head(resumen, 3) # Los tres primeros
peores <- tail(resumen, 3)
mejores
## paisOrigen media desvstd maximo minimo primerCuartil
## 1 CUBA 103454.8 65027.81 355299.8 59.03 56271.32
## 2 FINLANDIA 103055.9 68370.98 374598.0 41.37 42862.61
## 3 SUECIA 102990.6 65417.86 374083.5 0.00 53190.58
## segundoCuartil tercerCuartil registros
## 1 103333.5 142567.7 1600
## 2 103769.6 150181.8 1527
## 3 103677.4 145612.1 1564
peores
## paisOrigen media desvstd maximo minimo primerCuartil
## 22 COREA DEL NORTE 98112.57 65849.06 311072.0 0 37075.96
## 23 CANADA 97084.02 66299.20 365548.3 0 37788.07
## 24 AUSTRALIA 95295.74 63695.92 322313.1 0 38541.38
## segundoCuartil tercerCuartil registros
## 22 100818.48 139801.5 1621
## 23 97500.07 138979.0 1554
## 24 98578.74 136129.1 1576
barplot(mejores$media / 1000) # represent.. en miles

barplot(peores$media / 1000) # represent.. en miles
