library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(stats) # Sirve para generar una misma semilla de datos para generarlos aleatoriamente
salarios <- read.csv("C:/Users/Gerencia Banthai/Documents/Ciencia de los datos/Datos/Salaries.csv",
encoding = "UTF-8")
head(salarios,10) # Verificamos mostrando los primeros 10 registros
## Id EmployeeName JobTitle
## 1 1 NATHANIEL FORD GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY
## 2 2 GARY JIMENEZ CAPTAIN III (POLICE DEPARTMENT)
## 3 3 ALBERT PARDINI CAPTAIN III (POLICE DEPARTMENT)
## 4 4 CHRISTOPHER CHONG WIRE ROPE CABLE MAINTENANCE MECHANIC
## 5 5 PATRICK GARDNER DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)
## 6 6 DAVID SULLIVAN ASSISTANT DEPUTY CHIEF II
## 7 7 ALSON LEE BATTALION CHIEF, (FIRE DEPARTMENT)
## 8 8 DAVID KUSHNER DEPUTY DIRECTOR OF INVESTMENTS
## 9 9 MICHAEL MORRIS BATTALION CHIEF, (FIRE DEPARTMENT)
## 10 10 JOANNE HAYES-WHITE CHIEF OF DEPARTMENT, (FIRE DEPARTMENT)
## BasePay OvertimePay OtherPay Benefits TotalPay TotalPayBenefits Year
## 1 167411.18 0.00 400184.25 NA 567595.4 567595.4 2011
## 2 155966.02 245131.88 137811.38 NA 538909.3 538909.3 2011
## 3 212739.13 106088.18 16452.60 NA 335279.9 335279.9 2011
## 4 77916.00 56120.71 198306.90 NA 332343.6 332343.6 2011
## 5 134401.60 9737.00 182234.59 NA 326373.2 326373.2 2011
## 6 118602.00 8601.00 189082.74 NA 316285.7 316285.7 2011
## 7 92492.01 89062.90 134426.14 NA 315981.0 315981.0 2011
## 8 256576.96 0.00 51322.50 NA 307899.5 307899.5 2011
## 9 176932.64 86362.68 40132.23 NA 303427.5 303427.5 2011
## 10 285262.00 0.00 17115.73 NA 302377.7 302377.7 2011
## Notes Agency Status
## 1 NA San Francisco
## 2 NA San Francisco
## 3 NA San Francisco
## 4 NA San Francisco
## 5 NA San Francisco
## 6 NA San Francisco
## 7 NA San Francisco
## 8 NA San Francisco
## 9 NA San Francisco
## 10 NA San Francisco
str(salarios)# mostramos la variables que tiene salarios
## 'data.frame': 148654 obs. of 13 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ EmployeeName : Factor w/ 110810 levels "A Bernard Fatooh",..: 77636 34712 1560 17232 81101 23164 3271 22709 73975 47938 ...
## $ JobTitle : Factor w/ 2159 levels "Account Clerk",..: 836 298 298 2149 594 135 246 609 246 370 ...
## $ BasePay : num 167411 155966 212739 77916 134402 ...
## $ OvertimePay : num 0 245132 106088 56121 9737 ...
## $ OtherPay : num 400184 137811 16453 198307 182235 ...
## $ Benefits : num NA NA NA NA NA NA NA NA NA NA ...
## $ TotalPay : num 567595 538909 335280 332344 326373 ...
## $ TotalPayBenefits: num 567595 538909 335280 332344 326373 ...
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Notes : logi NA NA NA NA NA NA ...
## $ Agency : Factor w/ 1 level "San Francisco": 1 1 1 1 1 1 1 1 1 1 ...
## $ Status : Factor w/ 3 levels "","FT","PT": 1 1 1 1 1 1 1 1 1 1 ...
summary(salarios) #mostramos los valores estadisticos basicos
## Id EmployeeName
## Min. : 1 Kevin Lee : 13
## 1st Qu.: 37164 Richard Lee : 11
## Median : 74328 Steven Lee : 11
## Mean : 74328 William Wong: 11
## 3rd Qu.:111491 John Chan : 9
## Max. :148654 KEVIN LEE : 9
## (Other) :148590
## JobTitle BasePay
## Transit Operator : 7036 Min. : -166
## Special Nurse : 4389 1st Qu.: 33588
## Registered Nurse : 3736 Median : 65007
## Public Svc Aide-Public Works: 2518 Mean : 66325
## Police Officer 3 : 2421 3rd Qu.: 94691
## Custodian : 2418 Max. :319275
## (Other) :126136 NA's :609
## OvertimePay OtherPay Benefits
## Min. : -0.01 Min. : -7058.6 Min. : -33.89
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:11535.40
## Median : 0.00 Median : 811.3 Median :28628.62
## Mean : 5066.06 Mean : 3648.8 Mean :25007.89
## 3rd Qu.: 4658.18 3rd Qu.: 4236.1 3rd Qu.:35566.86
## Max. :245131.88 Max. :400184.2 Max. :96570.66
## NA's :4 NA's :4 NA's :36163
## TotalPay TotalPayBenefits Year Notes
## Min. : -618.1 Min. : -618.1 Min. :2011 Mode:logical
## 1st Qu.: 36169.0 1st Qu.: 44065.7 1st Qu.:2012 NA's:148654
## Median : 71426.6 Median : 92404.1 Median :2013
## Mean : 74768.3 Mean : 93692.6 Mean :2013
## 3rd Qu.:105839.1 3rd Qu.:132876.5 3rd Qu.:2014
## Max. :567595.4 Max. :567595.4 Max. :2014
##
## Agency Status
## San Francisco:148654 :110535
## FT: 22334
## PT: 15785
##
##
##
##
set.seed(1000)
nacionalidades <- c("MEXICO", "ESTADOS UNIDOS", "INGLATERRA", "JAPON", "CHINA", "ARGENTINA",
"BRASIL", "CANADA", "CHILE",
"HOLANDA", "FINLANDIA", "EGIPTO", "SUECIA", "AUSTRALIA", "FRANCIA", "ITALIA", "ESPAÑA", "CUBA", "COLOMBIA",
"IRLADA", "NORUEGA", "COREA DEL NORTE", "SINGAPUR",
"INDIA")
nacionalidades
## [1] "MEXICO" "ESTADOS UNIDOS" "INGLATERRA"
## [4] "JAPON" "CHINA" "ARGENTINA"
## [7] "BRASIL" "CANADA" "CHILE"
## [10] "HOLANDA" "FINLANDIA" "EGIPTO"
## [13] "SUECIA" "AUSTRALIA" "FRANCIA"
## [16] "ITALIA" "ESPAÑA" "CUBA"
## [19] "COLOMBIA" "IRLADA" "NORUEGA"
## [22] "COREA DEL NORTE" "SINGAPUR" "INDIA"
# Primero vamos a generar todas las nacionalidades
n <- nrow(salarios) # Número de observaciones total de salrios
paises <- rep(nacionalidades, n*2) # Genera un vector muy grande
head(paises)
## [1] "MEXICO" "ESTADOS UNIDOS" "INGLATERRA" "JAPON"
## [5] "CHINA" "ARGENTINA"
tail(paises)
## [1] "COLOMBIA" "IRLADA" "NORUEGA" "COREA DEL NORTE"
## [5] "SINGAPUR" "INDIA"
salarios2014 <- filter(salarios, Year == 2014)
n <- nrow(salarios2014) # Número de observaciones o registros de salarios2014
# Ahora mediante función mutate() agregamos la columna a salarios2014
paises <- sample(paises, n) # Ahora una muestra aletoria de todos los paises
salarios2014 <- mutate(salarios2014, paisOrigen = paises)
head(salarios2014,10) # Muestra los primeros 10 registros para verificar
## Id EmployeeName JobTitle BasePay
## 1 110532 David Shinn Deputy Chief 3 129150.0
## 2 110533 Amy P Hart Asst Med Examiner 318835.5
## 3 110534 William J Coaker Jr. Chief Investment Officer 257340.0
## 4 110535 Gregory P Suhr Chief of Police 307450.0
## 5 110536 Joanne M Hayes-White Chief, Fire Department 302068.0
## 6 110537 Ellen G Moffatt Asst Med Examiner 270222.0
## 7 110538 John L Martin Dept Head V 311298.5
## 8 110539 Harlan L Kelly-Jr Executive Contract Employee 310161.0
## 9 110540 Samson Lai Battalion Chief, Fire Suppress 179464.1
## 10 110541 David L Franklin Asst Chf of Dept (Fire Dept) 201566.9
## OvertimePay OtherPay Benefits TotalPay TotalPayBenefits Year Notes
## 1 0.00 342802.63 38780.04 471952.6 510732.7 2014 NA
## 2 10712.95 60563.54 89540.23 390112.0 479652.2 2014 NA
## 3 0.00 82313.70 96570.66 339653.7 436224.4 2014 NA
## 4 0.00 19266.72 91302.46 326716.8 418019.2 2014 NA
## 5 0.00 24165.44 91201.66 326233.4 417435.1 2014 NA
## 6 6009.22 67956.20 71580.48 344187.5 415767.9 2014 NA
## 7 0.00 0.00 89772.32 311298.5 401070.9 2014 NA
## 8 0.00 0.00 88823.51 310161.0 398984.5 2014 NA
## 9 128685.99 27334.83 59876.90 335485.0 395361.9 2014 NA
## 10 97907.38 29916.28 64599.59 329390.5 393990.1 2014 NA
## Agency Status paisOrigen
## 1 San Francisco PT ARGENTINA
## 2 San Francisco FT CANADA
## 3 San Francisco PT INGLATERRA
## 4 San Francisco FT HOLANDA
## 5 San Francisco FT COREA DEL NORTE
## 6 San Francisco FT AUSTRALIA
## 7 San Francisco FT CHILE
## 8 San Francisco FT ESTADOS UNIDOS
## 9 San Francisco FT EGIPTO
## 10 San Francisco FT ESTADOS UNIDOS
tail(salarios2014,8) # Los últimos 8 registros
## Id EmployeeName JobTitle BasePay
## 38116 148647 NaN NaN NaN
## 38117 148648 Joann Anderson Communications Dispatcher 2 0
## 38118 148649 Leon Walker Custodian 0
## 38119 148650 Roy I Tillery Custodian 0
## 38120 148651 NaN NaN NaN
## 38121 148652 NaN NaN NaN
## 38122 148653 NaN NaN NaN
## 38123 148654 Joe Lopez Counselor, Log Cabin Ranch 0
## OvertimePay OtherPay Benefits TotalPay TotalPayBenefits Year Notes
## 38116 NaN NaN NaN 0.00 0.00 2014 NA
## 38117 0 0.00 0 0.00 0.00 2014 NA
## 38118 0 0.00 0 0.00 0.00 2014 NA
## 38119 0 0.00 0 0.00 0.00 2014 NA
## 38120 NaN NaN NaN 0.00 0.00 2014 NA
## 38121 NaN NaN NaN 0.00 0.00 2014 NA
## 38122 NaN NaN NaN 0.00 0.00 2014 NA
## 38123 0 -618.13 0 -618.13 -618.13 2014 NA
## Agency Status paisOrigen
## 38116 San Francisco HOLANDA
## 38117 San Francisco PT CHILE
## 38118 San Francisco PT CUBA
## 38119 San Francisco PT IRLADA
## 38120 San Francisco CHINA
## 38121 San Francisco CANADA
## 38122 San Francisco FINLANDIA
## 38123 San Francisco PT FINLANDIA
mexicanos <- filter(salarios2014, paisOrigen == "MEXICO")
# mexicanos
meanMx <- mean(mexicanos$TotalPayBenefits)
desvstdMex <- sd(mexicanos$TotalPayBenefits)
meanMx
## [1] 99910.91
desvstdMex
## [1] 66113.36
str(mexicanos)
## 'data.frame': 1561 obs. of 14 variables:
## $ Id : int 110561 110595 110597 110615 110618 110625 110627 110655 110674 110676 ...
## $ EmployeeName : Factor w/ 110810 levels "A Bernard Fatooh",..: 9751 37179 29951 94920 44495 49147 3862 57015 50358 58458 ...
## $ JobTitle : Factor w/ 2159 levels "Account Clerk",..: 467 297 772 59 1415 432 1267 297 738 59 ...
## $ BasePay : num 267914 87324 246928 213558 100149 ...
## $ OvertimePay : num 0 0 0 3773 15716 ...
## $ OtherPay : num 0 199112 0 18589 152038 ...
## $ Benefits : num 79800 24960 63924 66845 33847 ...
## $ TotalPay : num 267914 286436 246928 235920 267903 ...
## $ TotalPayBenefits: num 347714 311397 310852 302765 301750 ...
## $ Year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ Notes : logi NA NA NA NA NA NA ...
## $ Agency : Factor w/ 1 level "San Francisco": 1 1 1 1 1 1 1 1 1 1 ...
## $ Status : Factor w/ 3 levels "","FT","PT": 2 3 2 3 3 2 2 2 2 3 ...
## $ paisOrigen : chr "MEXICO" "MEXICO" "MEXICO" "MEXICO" ...
# Agrupamos por pais de Origen
agrupados <- group_by(salarios2014, paisOrigen)
resumen <- summarise(agrupados, media = mean(TotalPayBenefits), desvstd = sd(TotalPayBenefits) )
resumen
## # A tibble: 24 x 3
## paisOrigen media desvstd
## <chr> <dbl> <dbl>
## 1 ARGENTINA 101437. 67311.
## 2 AUSTRALIA 102286. 67674.
## 3 BRASIL 98838. 63844.
## 4 CANADA 98885. 67057.
## 5 CHILE 98759. 66323.
## 6 CHINA 104758. 68022.
## 7 COLOMBIA 100889. 68021.
## 8 COREA DEL NORTE 98892. 64196.
## 9 CUBA 99644. 67685.
## 10 EGIPTO 100941. 66593.
## # ... with 14 more rows
resumen <- summarise(agrupados, media = mean(TotalPayBenefits), desvstd = sd(TotalPayBenefits), maximo = max(TotalPayBenefits), minimo = min(TotalPayBenefits),
primerCuartil = quantile(TotalPayBenefits, 0.25),
segundoCuartil = quantile(TotalPayBenefits, 0.50),
tercerCuartil = quantile(TotalPayBenefits, 0.75),
registros = n() )
resumen
## # A tibble: 24 x 9
## paisOrigen media desvstd maximo minimo primerCuartil segundoCuartil
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARGENTINA 1.01e5 67311. 5.11e5 0 47009. 101381.
## 2 AUSTRALIA 1.02e5 67674. 4.16e5 0 46735. 102797.
## 3 BRASIL 9.88e4 63844. 3.54e5 54.4 45142. 100519.
## 4 CANADA 9.89e4 67057. 4.80e5 0 37951. 100917.
## 5 CHILE 9.88e4 66323. 4.01e5 0 41588. 97533.
## 6 CHINA 1.05e5 68022. 3.29e5 0 48360. 103392.
## 7 COLOMBIA 1.01e5 68021. 3.65e5 40.5 42279. 101368.
## 8 COREA DEL~ 9.89e4 64196. 4.17e5 13.8 46266. 99774.
## 9 CUBA 9.96e4 67685. 3.74e5 0 40213. 99363.
## 10 EGIPTO 1.01e5 66593. 3.95e5 0 43734. 101549.
## # ... with 14 more rows, and 2 more variables: tercerCuartil <dbl>,
## # registros <int>
resumen <- arrange(resumen, desc(media))
class(resumen) # de que tipo de datos es nuestra variable resumen
## [1] "tbl_df" "tbl" "data.frame"
resumen <- as.data.frame(resumen)
class(resumen) # Ya quedo como tipo DataFrame
## [1] "data.frame"
resumen
## paisOrigen media desvstd maximo minimo primerCuartil
## 1 CHINA 104757.59 68022.13 329147.3 0.00 48359.66
## 2 NORUEGA 103374.01 65204.12 303476.2 0.00 56507.21
## 3 SINGAPUR 102561.50 65653.61 355299.9 0.00 48760.29
## 4 AUSTRALIA 102286.11 67673.70 415767.9 0.00 46735.38
## 5 INGLATERRA 101916.13 65730.20 436224.4 22.56 49425.95
## 6 ARGENTINA 101437.08 67311.32 510732.7 0.00 47009.44
## 7 ITALIA 101280.12 64360.54 292573.1 0.00 52387.23
## 8 EGIPTO 100941.31 66593.22 395361.9 0.00 43734.49
## 9 COLOMBIA 100889.33 68021.03 364814.5 40.46 42279.16
## 10 HOLANDA 100709.89 65745.05 418019.2 0.00 48528.23
## 11 FRANCIA 99964.57 66598.23 342814.3 75.85 41566.33
## 12 FINLANDIA 99921.13 67341.68 374598.0 -618.13 41670.33
## 13 MEXICO 99910.91 66113.36 347713.9 23.72 44503.18
## 14 IRLADA 99802.97 65080.17 314018.2 0.00 46165.07
## 15 CUBA 99644.20 67684.97 374083.5 0.00 40212.87
## 16 ESPAÑA 99219.06 65067.76 321216.7 46.36 45607.40
## 17 INDIA 98929.65 65683.19 379918.7 0.00 45005.15
## 18 COREA DEL NORTE 98892.47 64195.68 417435.1 13.77 46266.37
## 19 CANADA 98884.96 67056.75 479652.2 0.00 37950.89
## 20 BRASIL 98838.41 63843.66 353797.1 54.39 45142.11
## 21 SUECIA 98766.07 66789.62 347713.9 25.57 37987.83
## 22 CHILE 98759.42 66322.89 401070.9 0.00 41588.08
## 23 ESTADOS UNIDOS 97505.85 66338.76 398984.5 18.39 36414.26
## 24 JAPON 97134.45 67000.10 297504.6 0.00 33305.20
## segundoCuartil tercerCuartil registros
## 1 103392.28 152998.1 1561
## 2 103842.15 148389.0 1541
## 3 104669.07 144383.5 1590
## 4 102796.53 147011.8 1563
## 5 103446.04 147322.4 1573
## 6 101380.58 142663.4 1613
## 7 102321.42 141401.0 1496
## 8 101548.65 141134.4 1631
## 9 101367.54 143953.6 1604
## 10 101219.43 143386.8 1728
## 11 102354.60 141919.3 1529
## 12 101270.92 142831.0 1549
## 13 101034.81 139867.4 1561
## 14 100640.58 139592.6 1561
## 15 99363.47 142223.9 1541
## 16 100228.06 140906.0 1613
## 17 100323.83 140325.0 1649
## 18 99773.62 135564.9 1641
## 19 100917.44 140563.0 1629
## 20 100519.17 141853.7 1586
## 21 99352.40 141973.0 1632
## 22 97533.10 140824.1 1541
## 23 99051.49 139926.5 1615
## 24 96823.19 139692.7 1576
# Vamos a visualizar resumen
pie(head(resumen$media)) # Grafica de pastel
mejores <- head(resumen, 3) # Los tres primeros
peores <- tail(resumen, 3)
mejores
## paisOrigen media desvstd maximo minimo primerCuartil
## 1 CHINA 104757.6 68022.13 329147.3 0 48359.66
## 2 NORUEGA 103374.0 65204.12 303476.2 0 56507.21
## 3 SINGAPUR 102561.5 65653.61 355299.9 0 48760.29
## segundoCuartil tercerCuartil registros
## 1 103392.3 152998.1 1561
## 2 103842.1 148389.0 1541
## 3 104669.1 144383.5 1590
peores
## paisOrigen media desvstd maximo minimo primerCuartil
## 22 CHILE 98759.42 66322.89 401070.9 0.00 41588.08
## 23 ESTADOS UNIDOS 97505.85 66338.76 398984.5 18.39 36414.26
## 24 JAPON 97134.45 67000.10 297504.6 0.00 33305.20
## segundoCuartil tercerCuartil registros
## 22 97533.10 140824.1 1541
## 23 99051.49 139926.5 1615
## 24 96823.19 139692.7 1576
barplot(mejores$media / 1000) # Grafica de barras representando los mejores
barplot(peores$media / 1000) # Grafica de barras representando los peores
print("Por medio de la practica observamos como nos puede ayudar la libreria dplyr para sacar los analis estadisticos basicos como media desviacion, quartiles, maximos y minimos de manera sensilla.
Ademas agregamos columnas y construimos un vector de paises de manera aleatoria para tomar una muestra y asignarla a la columna o variable creada, con unas cunatas instrucciones determinamos los mejores paises respecto a sueldos (CHINA, NORUEGA Y SINGAPUR), asi como los peores(CHILE, ESTADOS UNIDOS Y JAPON).
Tambien realizamos graficas simples para entender mejor la informacion")
## [1] "Por medio de la practica observamos como nos puede ayudar la libreria dplyr para sacar los analis estadisticos basicos como media desviacion, quartiles, maximos y minimos de manera sensilla.\n Ademas agregamos columnas y construimos un vector de paises de manera aleatoria para tomar una muestra y asignarla a la columna o variable creada, con unas cunatas instrucciones determinamos los mejores paises respecto a sueldos (CHINA, NORUEGA Y SINGAPUR), asi como los peores(CHILE, ESTADOS UNIDOS Y JAPON).\n Tambien realizamos graficas simples para entender mejor la informacion"