library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Attach stats with library(): it stops with an error if the package cannot be
# loaded, whereas require() only returns FALSE and lets the script carry on.
library(stats)
# Load the salaries data set into a data frame.
salarios <- read.csv("C:/Users/Usuario/Documents/Ciencia de los datos/datos/Salaries.csv")
# The full `salarios` table is intentionally not printed here.
## Filter data
# Show every record whose EmployeeName is exactly "Kevin Lee".
salarios %>% filter(EmployeeName == "Kevin Lee")
## Id EmployeeName JobTitle BasePay
## 1 39716 Kevin Lee Police Officer 3 115866.90
## 2 40571 Kevin Lee Police Officer 3 117171.41
## 3 42511 Kevin Lee Electrical Transit System Mech 79220.54
## 4 49271 Kevin Lee Deputy Court Clerk III 84512.32
## 5 49440 Kevin Lee Deputy Court Clerk III 84512.34
## 6 50124 Kevin Lee Personnel Analyst 83382.00
## 7 52234 Kevin Lee Senior Parking Control Officer 66774.03
## 8 53932 Kevin Lee Transit Operator 57397.32
## 9 55120 Kevin Lee IS Administrator 2 68940.44
## 10 112386 Kevin Lee Sergeant 3 137982.14
## 11 123142 Kevin Lee IT Operations Support Admin IV 91606.03
## 12 124166 Kevin Lee Personnel Analyst 88353.01
## 13 124659 Kevin Lee Transit Operator 67230.30
## OvertimePay OtherPay Benefits TotalPay TotalPayBenefits Year Notes
## 1 23523.30 9313.54 34906.20 148703.74 183609.94 2012 NA
## 2 19606.88 4244.90 34610.10 141023.19 175633.29 2012 NA
## 3 40247.39 3353.49 38781.04 122821.42 161602.46 2012 NA
## 4 0.00 1469.99 36080.30 85982.31 122062.61 2012 NA
## 5 0.00 975.44 35902.02 85487.78 121389.80 2012 NA
## 6 0.00 0.00 35210.59 83382.00 118592.59 2012 NA
## 7 9599.08 1092.62 31902.38 77465.73 109368.11 2012 NA
## 8 9989.83 1800.48 34577.54 69187.63 103765.17 2012 NA
## 9 0.00 0.00 30650.48 68940.44 99590.92 2012 NA
## 10 18537.19 16039.62 43039.12 172558.95 215598.07 2014 NA
## 11 0.00 1013.00 34109.61 92619.03 126728.64 2014 NA
## 12 0.00 0.00 33172.33 88353.01 121525.34 2014 NA
## 13 10961.25 4050.85 36777.83 82242.40 119020.23 2014 NA
## Agency Status
## 1 San Francisco
## 2 San Francisco
## 3 San Francisco
## 4 San Francisco
## 5 San Francisco
## 6 San Francisco
## 7 San Francisco
## 8 San Francisco
## 9 San Francisco
## 10 San Francisco FT
## 11 San Francisco FT
## 12 San Francisco FT
## 13 San Francisco FT
# Same name filter, restricted to records from the year 2014.
salarios %>% filter(EmployeeName == "Kevin Lee", Year == 2014)
## Id EmployeeName JobTitle BasePay OvertimePay
## 1 112386 Kevin Lee Sergeant 3 137982.14 18537.19
## 2 123142 Kevin Lee IT Operations Support Admin IV 91606.03 0.00
## 3 124166 Kevin Lee Personnel Analyst 88353.01 0.00
## 4 124659 Kevin Lee Transit Operator 67230.30 10961.25
## OtherPay Benefits TotalPay TotalPayBenefits Year Notes Agency
## 1 16039.62 43039.12 172558.95 215598.1 2014 NA San Francisco
## 2 1013.00 34109.61 92619.03 126728.6 2014 NA San Francisco
## 3 0.00 33172.33 88353.01 121525.3 2014 NA San Francisco
## 4 4050.85 36777.83 82242.40 119020.2 2014 NA San Francisco
## Status
## 1 FT
## 2 FT
## 3 FT
## 4 FT
# Records from 2012 whose TotalPayBenefits lies in [150000, 170000].
# Comma-separated conditions in filter() are combined with a logical AND.
cuantos <- salarios %>%
  filter(
    TotalPayBenefits >= 150000,
    TotalPayBenefits <= 170000,
    Year == 2012
  )
# Show the column types and a preview of the filtered subset.
str(cuantos)
## 'data.frame': 2913 obs. of 13 variables:
## $ Id : int 41218 41219 41220 41221 41222 41223 41224 41225 41226 41227 ...
## $ EmployeeName : Factor w/ 110810 levels "A Bernard Fatooh",..: 16010 20663 88354 60640 94542 73444 45099 108649 89546 64494 ...
## $ JobTitle : Factor w/ 2159 levels "Account Clerk",..: 1565 2087 234 806 615 1414 1413 1565 1415 625 ...
## $ BasePay : num 112611 102662 97609 110847 88374 ...
## $ OvertimePay : num 5648 19465 22952 3927 33428 ...
## $ OtherPay : num 6037 6254 8148 18492 10512 ...
## $ Benefits : num 45699 41608 41273 36714 37660 ...
## $ TotalPay : num 124296 128382 128709 133266 132314 ...
## $ TotalPayBenefits: num 169995 169990 169982 169980 169973 ...
## $ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ Notes : logi NA NA NA NA NA NA ...
## $ Agency : Factor w/ 1 level "San Francisco": 1 1 1 1 1 1 1 1 1 1 ...
## $ Status : Factor w/ 3 levels "","FT","PT": 1 1 1 1 1 1 1 1 1 1 ...
# Row count of the current `cuantos` subset.
nrow(cuantos)
## [1] 2913
# 75th percentile of TotalPayBenefits over the whole data set.
# na.rm = TRUE keeps quantile() from stopping with an error if the column
# contains missing values; filter() below already drops rows whose comparison
# evaluates to NA, so the two steps stay consistent.
cuartil75 <- quantile(salarios$TotalPayBenefits, probs = 0.75, na.rm = TRUE)
# Records at or above the 75th percentile (reuses the name `cuantos`).
cuantos <- filter(salarios, TotalPayBenefits >= cuartil75)
nrow(cuantos)
## [1] 37164
# Keep only the identifier, name, total compensation and year columns.
registros <- salarios %>%
  select(Id, EmployeeName, TotalPayBenefits, Year)
# registros
# 2012 records with TotalPayBenefits between 150000 and 170000 (both inclusive).
cuales <- registros %>%
  filter(between(TotalPayBenefits, 150000, 170000), Year == 2012)
# cuales
# Sort by employee name; arrange() orders ascending by default.
datosordenados <- cuales %>%
  arrange(EmployeeName)
# datosordenados
# First rows of the sorted result.
head(datosordenados)
## Id EmployeeName TotalPayBenefits Year
## 1 42773 Aaron Ballonado 159732.7 2012
## 2 41480 Aaron Cowhig 168330.4 2012
## 3 42067 Aaron Fischer 164280.9 2012
## 4 41965 Aaron Smith 164960.6 2012
## 5 42777 Aaron Yoo 159711.9 2012
## 6 42739 Abraham Abarca 159988.9 2012
# Last six rows of the name-sorted result (six is tail()'s default).
tail(datosordenados, n = 6L)
## Id EmployeeName TotalPayBenefits Year
## 2908 41326 Yvonne Fuentes-Pattishall 169171.5 2012
## 2909 43221 Zahid Khan 156872.5 2012
## 2910 42623 Zara Grace Janer 160766.9 2012
## 2911 44083 Zhong Qiu 150289.4 2012
## 2912 41647 Zoila Maguina 167054.6 2012
## 2913 42196 Zula Jones 163422.7 2012
# Fix the RNG seed so everyone running the script draws the same sample.
set.seed(1000)
# Simulate a binary gender code (0 or 1) for each row of `cuales`.
# rep(0:1, 5000) builds a balanced pool of 10000 values (5000 zeros, 5000 ones).
generos <- rep(0:1, 5000)
# Draw one value per row, without replacement (sample()'s default).
# The size comes from nrow(cuales) rather than a hard-coded 2913, so the new
# column always matches the filtered data; the pool must hold at least that
# many values.
generos <- sample(generos, nrow(cuales))
cuales <- mutate(cuales, Genero = generos)
# cuales
# Preview the first 50 rows including the new Genero column.
head(cuales, 50)
## Id EmployeeName TotalPayBenefits Year Genero
## 1 41218 Cheryl De Lemos 169995.4 2012 1
## 2 41219 Daniel Godfrey 169990.1 2012 1
## 3 41220 Robert Harvey 169982.0 2012 0
## 4 41221 Lawrence McDonnell 169980.4 2012 0
## 5 41222 Sergio Chavez 169973.1 2012 1
## 6 41223 Michael Grande 169970.4 2012 0
## 7 41224 Jeffrey Chow 169963.2 2012 1
## 8 41225 Winilyn Hidalgo 169960.5 2012 1
## 9 41226 Rodney Lee 169960.3 2012 1
## 10 41227 Lucille Palma 169940.8 2012 0
## 11 41228 Jonathan Rapp 169939.2 2012 1
## 12 41229 Ajay Saxena 169936.3 2012 0
## 13 41230 Albert Tom 169933.8 2012 0
## 14 41231 Edith Hammond 169930.6 2012 1
## 15 41232 Hector Tam 169925.7 2012 1
## 16 41233 Eugene Ling 169898.1 2012 1
## 17 41234 Xing Wang 169897.1 2012 1
## 18 41235 Crystal McDonald 169893.1 2012 0
## 19 41236 Patrick Cox 169890.4 2012 1
## 20 41237 Balraj Singh Rai 169887.5 2012 1
## 21 41238 Jennifer Northridge 169879.5 2012 1
## 22 41239 Roger Fong 169876.2 2012 0
## 23 41240 Edgar Tabo 169876.1 2012 1
## 24 41241 Kin Lee 169864.1 2012 0
## 25 41242 Kevin Adkins 169849.0 2012 1
## 26 41243 Curtis Caldwell 169837.2 2012 0
## 27 41244 Dianna Yanez 169831.5 2012 1
## 28 41245 Samuel Yu 169820.4 2012 1
## 29 41246 Eugene Shu 169811.9 2012 1
## 30 41247 Dale Winniford 169799.7 2012 1
## 31 41248 Clifford Burkhart 169798.2 2012 0
## 32 41249 Suzanne Miller 169794.0 2012 0
## 33 41250 Catheryn Williams 169791.3 2012 0
## 34 41251 Stephen Gritsch 169741.6 2012 0
## 35 41252 Maria Cecilia Martin 169740.3 2012 1
## 36 41253 Russell Roby 169736.7 2012 1
## 37 41254 Lawrence Soe 169732.1 2012 1
## 38 41255 Stephon Degand 169723.0 2012 0
## 39 41256 Michelle Tong 169704.5 2012 1
## 40 41257 James Kazarian 169704.2 2012 0
## 41 41258 Violeta Del Mundo 169699.7 2012 0
## 42 41259 Fred Lew 169693.3 2012 1
## 43 41260 Donald Bannett 169688.4 2012 1
## 44 41261 Miles Young 169673.6 2012 0
## 45 41262 Victor Le 169660.8 2012 0
## 46 41263 Eric Louie 169660.2 2012 1
## 47 41264 Lawrence Ng 169654.4 2012 1
## 48 41265 Eduard Ochoa 169639.7 2012 0
## 49 41266 Geoffrey Clayton 169635.6 2012 1
## 50 41267 Rona Sandler 169633.7 2012 1
# Fix the RNG seed so everyone running the script draws the same sample.
set.seed(1000)
# Simulate a marital-status code for each row of `cuales`, using codes 1 to 5.
# rep(1:5, 5000) builds a balanced pool of 25000 values (5000 of each code).
edosciviles <- rep(1:5, 5000)
# Draw one value per row, without replacement (sample()'s default).
# The size comes from nrow(cuales) rather than a hard-coded 2913, so the new
# column always matches the filtered data; the pool must hold at least that
# many values.
edosciviles <- sample(edosciviles, nrow(cuales))
cuales <- mutate(cuales, EdoCivil = edosciviles)
# cuales
# head(cuales, 50)
# Frequency table: number of rows for each value of the Genero column.
table(cuales[["Genero"]])
##
## 0 1
## 1468 1445
# Frequency table: number of rows for each value of the EdoCivil column.
table(cuales[["EdoCivil"]])
##
## 1 2 3 4 5
## 593 593 564 593 570
# Bar chart of the Genero frequencies.
barplot(height = table(cuales[["Genero"]]))
#### Bar chart of the EdoCivil (marital status) frequencies.
barplot(height = table(cuales[["EdoCivil"]]))