2023/01/31 (updated: 2023-12-04)
library(tidyverse)
resume <- read.csv("resume.csv")
str(resume)
## 'data.frame': 4870 obs. of 4 variables: ## $ firstname: chr "Allison" "Kristen" "Lakisha" "Latonya" ... ## $ sex : chr "female" "female" "female" "female" ... ## $ race : chr "white" "white" "black" "black" ... ## $ call : int 0 0 0 0 0 0 0 0 0 0 ...
head() mostra as primeiras linhas, tail() mostra as últimas linhas e glimpse() é uma substituta do tidyverse para a função str() do R básico.
head(resume)
## firstname sex race call ## 1 Allison female white 0 ## 2 Kristen female white 0 ## 3 Lakisha female black 0 ## 4 Latonya female black 0 ## 5 Carrie female white 0 ## 6 Jay male white 0
tail(resume)
## firstname sex race call ## 4865 Lakisha female black 0 ## 4866 Tamika female black 0 ## 4867 Ebony female black 0 ## 4868 Jay male white 0 ## 4869 Latonya female black 0 ## 4870 Laurie female white 0
glimpse(resume)
## Rows: 4,870 ## Columns: 4 ## $ firstname <chr> "Allison", "Kristen", "Lakisha", "Latonya", "Carrie", "Jay",… ## $ sex <chr> "female", "female", "female", "female", "female", "male", "f… ## $ race <chr> "white", "white", "black", "black", "white", "white", "white… ## $ call <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
summary()summary(resume)
## firstname sex race call ## Length:4870 Length:4870 Length:4870 Min. :0.00000 ## Class :character Class :character Class :character 1st Qu.:0.00000 ## Mode :character Mode :character Mode :character Median :0.00000 ## Mean :0.08049 ## 3rd Qu.:0.00000 ## Max. :1.00000
# Count the rows of `resume` for every (race, call) combination.
race.call.summary <- resume %>%
  group_by(race, call) %>%
  count()
# Print the counts.
race.call.summary
## # A tibble: 4 × 3 ## # Groups: race, call [4] ## race call n ## <chr> <int> <int> ## 1 black 0 2278 ## 2 black 1 157 ## 3 white 0 2200 ## 4 white 1 235
# Spread the values of `call` into columns, filling them with the counts `n`.
race.call.tab <- race.call.summary %>%
  pivot_wider(names_from = call, values_from = n)
# Print the wide table.
race.call.tab
## # A tibble: 2 × 3 ## # Groups: race [2] ## race `0` `1` ## <chr> <int> <int> ## 1 black 2278 157 ## 2 white 2200 235
rename() permite dar nomes mais informativos às colunas.
race.call.tab.names <- race.call.tab %>% rename(no_callback = "0", callback = "1")
race.call.tab.names
## # A tibble: 2 × 3 ## # Groups: race [2] ## race no_callback callback ## <chr> <int> <int> ## 1 black 2278 157 ## 2 white 2200 235
# Add each row's total (no_callback + callback) and the share of that total
# that is `callback`, then print the updated table.
race.call.tab.names <- mutate(
  race.call.tab.names,
  total_resumes = callback + no_callback,
  callback_prop = callback / total_resumes
)
race.call.tab.names
## # A tibble: 2 × 5 ## # Groups: race [2] ## race no_callback callback total_resumes callback_prop ## <chr> <int> <int> <int> <dbl> ## 1 black 2278 157 2435 0.0645 ## 2 white 2200 235 2435 0.0965
# Overall callback rate: the sum of `call` divided by the number of rows.
overall_callback <- resume %>%
  summarize(total_callback_rate = sum(call) / n())
# Print the rate.
overall_callback
## total_callback_rate ## 1 0.08049281
# Same rate computed with mean(), which equals sum(call) / n().
overall_callback <- resume %>%
  summarize(total_callback_rate = mean(call))
# Print the rate.
overall_callback
## total_callback_rate ## 1 0.08049281
group_by().
callback_by_race <- resume %>% group_by(race) %>% summarize(callback_rate = mean(call))
callback_by_race
## # A tibble: 2 × 2 ## race callback_rate ## <chr> <dbl> ## 1 black 0.0645 ## 2 white 0.0965
group_by(), utilizada acima e na Unidade 1, nos permite obter informações de um subconjunto dos dados. Esse tipo de trabalho é muito comum em análise de dados.
class(TRUE)
## [1] "logical"
as.integer(TRUE)
## [1] 1
as.integer(FALSE)
## [1] 0
# Logical vector: in arithmetic TRUE counts as 1 and FALSE as 0.
x <- c(TRUE, FALSE, TRUE)
mean(x) # proportion of TRUEs
## [1] 0.6666667
sum(x) #número de TRUEs
## [1] 2
TRUE & TRUE
## [1] TRUE
TRUE & FALSE
## [1] FALSE
FALSE & FALSE
## [1] FALSE
TRUE | FALSE
## [1] TRUE
FALSE & FALSE
## [1] FALSE
TRUE | TRUE
## [1] TRUE
TRUE & FALSE & TRUE
## [1] FALSE
(TRUE | FALSE) & FALSE
## [1] FALSE
(TRUE | FALSE) & TRUE
## [1] TRUE
TRUE | (FALSE & FALSE)
## [1] TRUE
# Two logical vectors used to show the element-wise `|` and `&` operators.
TF1 <- c(TRUE, FALSE, FALSE)
TF2 <- c(TRUE, FALSE, TRUE)
TF1 | TF2
## [1] TRUE FALSE TRUE
TF1 & TF2
## [1] TRUE FALSE FALSE
4 > 3
## [1] TRUE
"Hello" == "hello"
## [1] FALSE
"Hello" == "Hello"
## [1] TRUE
"Hello" != "hello"
## [1] TRUE
# Numeric vector used for the relational-operator examples.
x <- c(3, 2, 1, -2, -1)
x >= 2
## [1] TRUE TRUE FALSE FALSE FALSE
x != 1
## [1] TRUE TRUE FALSE TRUE TRUE
(x > 0) & (x <= 2)
## [1] FALSE TRUE TRUE FALSE FALSE
(x > 2) | (x <= -1)
## [1] TRUE FALSE FALSE TRUE TRUE
# Flag the elements of `x` that lie in (0, 2], then count them.
x.int <- (x > 0) & (x <= 2)
sum(x.int)
## [1] 2
mean(x.int)
## [1] 0.4
filter() do pacote tidyverse usa operadores relacionais. Podemos usar essa função para calcular a taxa de retorno com ofertas de emprego apenas para nomes associados a afro-americanos.
resume %>% filter(race == "black") %>% summarize(mean(call))
## mean(call) ## 1 0.06447639
# Keep only the rows with race == "black" and average `call` over them.
resumeB <- filter(resume, race == "black")
mean(resumeB$call)
## [1] 0.06447639
summarize(resumeB, mean(call))
## mean(call) ## 1 0.06447639
# Keep only the rows where race is "black" and sex is "female".
resumeBf <- filter(resume, race == "black" & sex == "female")
head(resumeBf)
## firstname sex race call ## 1 Lakisha female black 0 ## 2 Latonya female black 0 ## 3 Kenya female black 0 ## 4 Latonya female black 0 ## 5 Aisha female black 0 ## 6 Aisha female black 0
# Mean of `call` for race "black" / sex "female", extracted as a plain number.
Bf_callback <- filter(resume, race == "black" & sex == "female") %>%
  summarize(callback_rate = mean(call)) %>%
  pull()
Bf_callback
## [1] 0.06627784
# Mean of `call` for race "black" / sex "male", extracted as a plain number.
Bm_callback <- filter(resume, race == "black" & sex == "male") %>%
  summarize(callback_rate = mean(call)) %>%
  pull()
Bm_callback
## [1] 0.0582878
# Mean of `call` for race "white" / sex "female", extracted as a plain number.
Wf_callback <- filter(resume, race == "white" & sex == "female") %>%
  summarize(callback_rate = mean(call)) %>%
  pull()
Wf_callback
## [1] 0.09892473
# Mean of `call` for race "white" / sex "male", extracted as a plain number.
Wm_callback <- filter(resume, race == "white" & sex == "male") %>%
  summarize(callback_rate = mean(call)) %>%
  pull()
Wm_callback
## [1] 0.08869565
Wf_callback - Bf_callback
## [1] 0.03264689
Wm_callback - Bm_callback
## [1] 0.03040786
# Mean of `call` by race and sex, reshaped so each race becomes a column,
# plus the white-minus-black difference.
racial_gaps_by_sex <- resume %>%
  group_by(race, sex) %>%
  summarize(callback = mean(call)) %>%
  pivot_wider(names_from = race, values_from = callback) %>%
  mutate(race_gap = white - black)
resume %>% group_by(race,sex) %>% summarize(callback = mean(call))
## # A tibble: 4 × 3 ## # Groups: race [2] ## race sex callback ## <chr> <chr> <dbl> ## 1 black female 0.0663 ## 2 black male 0.0583 ## 3 white female 0.0989 ## 4 white male 0.0887
resume %>% group_by(race,sex) %>% summarize(callback = mean(call)) %>% pivot_wider(names_from = race, values_from = callback)
## # A tibble: 2 × 3 ## sex black white ## <chr> <dbl> <dbl> ## 1 female 0.0663 0.0989 ## 2 male 0.0583 0.0887
resume %>% group_by(race,sex) %>% summarize(callback = mean(call)) %>% pivot_wider(names_from = race, values_from = callback) %>% mutate(race_gap = white - black)
## # A tibble: 2 × 4 ## sex black white race_gap ## <chr> <dbl> <dbl> <dbl> ## 1 female 0.0663 0.0989 0.0326 ## 2 male 0.0583 0.0887 0.0304
resume <- resume %>% mutate(BlackFemale = if_else(race == "black" & sex == "female", 1, 0))
nrow(resumeBf)
## [1] 1886
# Check that the indicator sums to the number of rows in `resumeBf`.
# pull() extracts the sum as a plain vector, so two scalars are compared and
# the result is a single TRUE/FALSE instead of a one-cell logical matrix.
nrow(resumeBf) == resume %>% summarize(bf = sum(BlackFemale)) %>% pull(bf)
## [1] TRUE
# Build `type` step by step: start from an empty string and overwrite it for
# each race/sex combination in turn (the verbose alternative to case_when()).
resume_fact <- resume %>%
  mutate(
    type = "",
    type = if_else(race == "black" & sex == "female", "BlackFemale", type),
    type = if_else(race == "black" & sex == "male", "BlackMale", type),
    type = if_else(race == "white" & sex == "female", "WhiteFemale", type),
    type = if_else(race == "white" & sex == "male", "WhiteMale", type)
  )
case_when() nos permite criar a mesma variável com um código mais claro.
resume <- resume %>%
mutate(type = case_when(race == "black" & sex == "female" ~ "BlackFemale",
race == "black" & sex == "male" ~ "BlackMale",
race == "white" & sex == "female" ~ "WhiteFemale",
race == "white" & sex == "male" ~ "WhiteMale"))
class(resume_fact$type)
## [1] "character"
summary(resume_fact$type)
## Length Class Mode ## 4870 character character
as.factor() diz para o R transformar uma variável em um fator.
resume <- resume %>% mutate(type = as.factor(type))
class(resume$type)
## [1] "factor"
group_by() não trabalha apenas com fatores; é comum que usuários do tidyverse não se preocupem em distinguir objetos da classe factor dos da classe character, mas para o R faz diferença.
levels(resume$type)
## [1] "BlackFemale" "BlackMale" "WhiteFemale" "WhiteMale"
summary(resume$type)
## BlackFemale BlackMale WhiteFemale WhiteMale ## 1886 549 1860 575
slice(resume, 1)
## firstname sex race call BlackFemale type ## 1 Allison female white 0 0 WhiteFemale
\[SATE=\frac{1}{n}\sum_{i=1}^n\left[Y_i(1)-Y_i(0)\right]\] onde \(n\) é o tamanho da amostra.
…
…
…
…
social <- read.csv("social.csv")
head(social)
## sex yearofbirth primary2004 messages primary2006 hhsize ## 1 male 1941 0 Civic Duty 0 2 ## 2 female 1947 0 Civic Duty 0 2 ## 3 male 1951 0 Hawthorne 1 3 ## 4 female 1950 0 Hawthorne 1 3 ## 5 female 1982 0 Hawthorne 1 3 ## 6 male 1981 0 Control 0 3
str(social)
## 'data.frame': 305866 obs. of 6 variables: ## $ sex : chr "male" "female" "male" "female" ... ## $ yearofbirth: int 1941 1947 1951 1950 1982 1981 1959 1956 1968 1967 ... ## $ primary2004: int 0 0 0 0 0 0 0 0 0 0 ... ## $ messages : chr "Civic Duty" "Civic Duty" "Hawthorne" "Hawthorne" ... ## $ primary2006: int 0 0 1 1 1 0 1 1 0 0 ... ## $ hhsize : int 2 2 3 3 3 3 3 3 2 2 ...
summary(social)
## sex yearofbirth primary2004 messages ## Length:305866 Min. :1900 Min. :0.0000 Length:305866 ## Class :character 1st Qu.:1947 1st Qu.:0.0000 Class :character ## Mode :character Median :1956 Median :0.0000 Mode :character ## Mean :1956 Mean :0.4014 ## 3rd Qu.:1965 3rd Qu.:1.0000 ## Max. :1986 Max. :1.0000 ## primary2006 hhsize ## Min. :0.0000 Min. :1.000 ## 1st Qu.:0.0000 1st Qu.:2.000 ## Median :0.0000 Median :2.000 ## Mean :0.3122 Mean :2.184 ## 3rd Qu.:1.0000 3rd Qu.:2.000 ## Max. :1.0000 Max. :8.000
turnout_by_message <- social %>% group_by(messages) %>% summarize(turnout = mean(primary2006))
# Turn the per-message turnout into one wide row, subtract the Control column
# from each of the other message columns, and keep only the differences.
turnout_diffs <- turnout_by_message %>%
  pivot_wider(names_from = messages, values_from = turnout) %>%
  mutate(
    diff_Civic_Duty = `Civic Duty` - Control,
    diff_Hawthorne = Hawthorne - Control,
    diff_Neighbors = Neighbors - Control
  ) %>%
  select(starts_with("diff_"))
# Per-message averages of age in 2006 (2006 - yearofbirth), `primary2004`
# and `hhsize`.
social %>%
  group_by(messages) %>%
  summarize(
    age_avg = mean(2006 - yearofbirth),
    primary2004_avg = mean(primary2004),
    hhsize_avg = mean(hhsize)
  )
## # A tibble: 4 × 4 ## messages age_avg primary2004_avg hhsize_avg ## <chr> <dbl> <dbl> <dbl> ## 1 Civic Duty 49.7 0.399 2.19 ## 2 Control 49.8 0.400 2.18 ## 3 Hawthorne 49.7 0.403 2.18 ## 4 Neighbors 49.9 0.407 2.19
minwage <- read.csv("minwage.csv")
glimpse(minwage)
## Rows: 358 ## Columns: 8 ## $ chain <chr> "wendys", "wendys", "burgerking", "burgerking", "kfc", "kfc… ## $ location <chr> "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA",… ## $ wageBefore <dbl> 5.00, 5.50, 5.00, 5.00, 5.25, 5.00, 5.00, 5.00, 5.00, 5.50,… ## $ wageAfter <dbl> 5.25, 4.75, 4.75, 5.00, 5.00, 5.00, 4.75, 5.00, 4.50, 4.75,… ## $ fullBefore <dbl> 20.0, 6.0, 50.0, 10.0, 2.0, 2.0, 2.5, 40.0, 8.0, 10.5, 6.0,… ## $ fullAfter <dbl> 0.0, 28.0, 15.0, 26.0, 3.0, 2.0, 1.0, 9.0, 7.0, 18.0, 5.0, … ## $ partBefore <dbl> 20.0, 26.0, 35.0, 17.0, 8.0, 10.0, 20.0, 30.0, 27.0, 30.0, … ## $ partAfter <dbl> 36, 3, 18, 9, 12, 9, 25, 32, 39, 10, 20, 4, 13, 20, 15, 19,…
summary(minwage)
## chain location wageBefore wageAfter ## Length:358 Length:358 Min. :4.250 Min. :4.250 ## Class :character Class :character 1st Qu.:4.250 1st Qu.:5.050 ## Mode :character Mode :character Median :4.500 Median :5.050 ## Mean :4.618 Mean :4.994 ## 3rd Qu.:4.987 3rd Qu.:5.050 ## Max. :5.750 Max. :6.250 ## fullBefore fullAfter partBefore partAfter ## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00 ## 1st Qu.: 2.125 1st Qu.: 2.000 1st Qu.:11.00 1st Qu.:11.00 ## Median : 6.000 Median : 6.000 Median :16.25 Median :17.00 ## Mean : 8.475 Mean : 8.362 Mean :18.75 Mean :18.69 ## 3rd Qu.:12.000 3rd Qu.:12.000 3rd Qu.:25.00 3rd Qu.:25.00 ## Max. :60.000 Max. :40.000 Max. :60.00 Max. :60.00
# Summarise `minwage` after converting `chain` and `location` to factors, so
# summary() reports per-level counts for those two columns.
minwage %>%
  mutate(across(c(chain, location), as.factor)) %>%
  summary()
## chain location wageBefore wageAfter ## burgerking:149 centralNJ: 45 Min. :4.250 Min. :4.250 ## kfc : 75 northNJ :146 1st Qu.:4.250 1st Qu.:5.050 ## roys : 88 PA : 67 Median :4.500 Median :5.050 ## wendys : 46 shoreNJ : 33 Mean :4.618 Mean :4.994 ## southNJ : 67 3rd Qu.:4.987 3rd Qu.:5.050 ## Max. :5.750 Max. :6.250 ## fullBefore fullAfter partBefore partAfter ## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00 ## 1st Qu.: 2.125 1st Qu.: 2.000 1st Qu.:11.00 1st Qu.:11.00 ## Median : 6.000 Median : 6.000 Median :16.25 Median :17.00 ## Mean : 8.475 Mean : 8.362 Mean :18.75 Mean :18.69 ## 3rd Qu.:12.000 3rd Qu.:12.000 3rd Qu.:25.00 3rd Qu.:25.00 ## Max. :60.000 Max. :40.000 Max. :60.00 Max. :60.00
# Add `state`: "NJ" for every row whose `location` is not "PA", "PA" otherwise.
minwage <- minwage %>%
  mutate(state = if_else(location != "PA", "NJ", "PA"))
# Wage threshold used in the comparisons below.
new_wage <- 5.05
# Per state, the share of rows whose wage before / after is at least
# `new_wage` (the mean of a logical vector is the proportion of TRUEs).
state_props <- minwage %>%
  group_by(state) %>%
  summarize(
    prop_before = mean(wageBefore >= new_wage),
    prop_after = mean(wageAfter >= new_wage)
  )
# Add the "after" total (full + part) and the full share of that total.
minwage <- mutate(
  minwage,
  totalAfter = partAfter + fullAfter,
  fullPropAfter = fullAfter / totalAfter
)
full_prop_by_state <- minwage %>% group_by(state) %>% summarize(fullPropAfter = mean(fullPropAfter))
full_prop_by_state %>% pivot_wider(names_from = state, values_from = fullPropAfter) %>% mutate(diff = NJ - PA)
## # A tibble: 1 × 3 ## NJ PA diff ## <dbl> <dbl> <dbl> ## 1 0.320 0.272 0.0481
…
# Within each state, the share of rows belonging to each chain, pivoted so
# that every state becomes a column of proportions.
chains_by_state <- minwage %>%
  group_by(state) %>%
  count(chain) %>%
  mutate(prop = n / sum(n)) %>%
  pivot_wider(id_cols = -n, names_from = state, values_from = prop)
# Mean `fullPropAfter` per state and chain, with one column per state and the
# NJ-minus-PA difference.
full_prop_by_state_chain <- minwage %>%
  group_by(state, chain) %>%
  summarize(fullPropAfter = mean(fullPropAfter)) %>%
  pivot_wider(names_from = "state", values_from = fullPropAfter) %>%
  mutate(diff = NJ - PA)
…
# Same state-by-chain comparison of mean `fullPropAfter`, after dropping the
# rows whose `location` is "shoreNJ" or "centralNJ".
prop_by_state_chain_location_subset <- minwage %>%
  filter(!(location %in% c("shoreNJ", "centralNJ"))) %>%
  group_by(state, chain) %>%
  summarize(fullPropAfter = mean(fullPropAfter)) %>%
  pivot_wider(names_from = state, values_from = fullPropAfter) %>%
  mutate(diff = NJ - PA)
# Add the "before" total (full + part) and the full share of that total.
minwage <- mutate(
  minwage,
  totalBefore = partBefore + fullBefore,
  fullPropBefore = fullBefore / totalBefore
)
# For the NJ rows only: mean `fullPropAfter` minus mean `fullPropBefore`.
filter(minwage, state == "NJ") %>%
  summarize(diff = mean(fullPropAfter) - mean(fullPropBefore))
## diff ## 1 0.02387474
…
minwage %>% group_by(state) %>% summarize(diff = mean(fullPropAfter) - mean(fullPropBefore))
## # A tibble: 2 × 2 ## state diff ## <chr> <dbl> ## 1 NJ 0.0239 ## 2 PA -0.0377
minwage %>% group_by(state) %>% summarize(diff = mean(fullPropAfter) - mean(fullPropBefore)) %>% pivot_wider(names_from = state, values_from = diff) %>% mutate(diff_in_diff = NJ - PA)
## # A tibble: 1 × 3 ## NJ PA diff_in_diff ## <dbl> <dbl> <dbl> ## 1 0.0239 -0.0377 0.0616
median() calcula a mediana de um conjunto de observações.
x1 <- seq(1,5, by=1)
median(x1)
## [1] 3
# Even number of observations: the median is the mean of the two middle values.
x2 <- seq(1, 10, by = 1)
median(x2)
## [1] 5.5
# The integers 0 to 1000, reused in the quantile examples.
x3 <- seq(0, 1000, by = 1)
median(x3)
## [1] 500
quantile() calcula os quartis de um conjunto de observações. Para outros quantis é preciso definir o argumento probs.
#Quartis
quantile(x3)
## 0% 25% 50% 75% 100% ## 0 250 500 750 1000
# Deciles
quantile(x3, probs = seq(0, 1, 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% ## 0 100 200 300 400 500 600 700 800 900 1000
# Small sample with one large value, used to compare mean and median.
x <- c(1, 3, 4, 10, 82)
x
## [1] 1 3 4 10 82
mean(x)
## [1] 20
median(x)
## [1] 4
mean(minwage$wageBefore)
## [1] 4.617709
median(minwage$wageBefore)
## [1] 4.5
mean(minwage$wageAfter)
## [1] 4.993855
median(minwage$wageAfter)
## [1] 5.05
summary() retorna média, máximo, mínimo, mediana e demais quartis de variáveis numéricas.
minwage %>% filter(state == "NJ") %>% select(wageBefore, wageAfter) %>% summary()
## wageBefore wageAfter ## Min. :4.25 Min. :5.000 ## 1st Qu.:4.25 1st Qu.:5.050 ## Median :4.50 Median :5.050 ## Mean :4.61 Mean :5.081 ## 3rd Qu.:4.87 3rd Qu.:5.050 ## Max. :5.75 Max. :5.750
IQR() calcula o intervalo interquartil.
minwage %>%
filter(state == "NJ") %>%
select(wageBefore, wageAfter) %>%
summarize(wageBeforeIQR = IQR(wageBefore),
wageAfterIQR = IQR(wageAfter))
## wageBeforeIQR wageAfterIQR ## 1 0.62 0
# Probabilities 0, 0.1, ..., 1 and their text labels for the decile table.
deciles_prob <- seq(from = 0, to = 1, by = 0.1)
deciles_names <- as.character(deciles_prob)
# quantile() returns one value per probability, so the result has several
# rows. reframe() (dplyr >= 1.1.0) is the verb for multi-row results;
# summarize() is meant for one row per group and is deprecated for this use.
# reframe() keeps only the columns computed here, so no select() is needed.
minwage %>%
  filter(state == "NJ") %>%
  reframe(
    wageBeforeDecile = quantile(wageBefore, probs = deciles_prob),
    wageAfterDecile = quantile(wageAfter, probs = deciles_prob),
    decile = deciles_names
  )
## wageBeforeDecile wageAfterDecile decile ## 1 4.25 5.00 0 ## 2 4.25 5.05 0.1 ## 3 4.25 5.05 0.2 ## 4 4.25 5.05 0.3 ## 5 4.50 5.05 0.4 ## 6 4.50 5.05 0.5 ## 7 4.65 5.05 0.6 ## 8 4.75 5.05 0.7 ## 9 5.00 5.05 0.8 ## 10 5.00 5.15 0.9 ## 11 5.75 5.75 1
minwage %>% group_by(state) %>% summarize(diff = median(fullPropAfter) - median(fullPropBefore)) %>% pivot_wider(names_from = state, values_from = diff) %>% mutate(diff_in_diff = NJ - PA)
## # A tibble: 1 × 3 ## NJ PA diff_in_diff ## <dbl> <dbl> <dbl> ## 1 0.0250 -0.0120 0.0370
# Root mean square of `z` (divides by n; compare with sd(), which uses n - 1).
z <- c(-2, -1, 0, 1, 2)
sqrt(sum(z^2) / length(z))
## [1] 1.414214
mean(z)
## [1] 0
sqrt(sum((z-mean(z))^2)/(length(z)-1))
## [1] 1.581139
sd(z)
## [1] 1.581139
sd(z)^2
## [1] 2.5
var(z)
## [1] 2.5
summarize_at() do tidyverse permite calcular várias estatísticas de uma forma compacta. A função summarize_at() precisa ser informada das variáveis, o que é feito com a função vars(), e das funções que devem ser aplicadas às variáveis, que são passadas no argumento .funs e devem estar em uma lista, que é uma classe do R.
minwage %>% group_by(state) %>% summarize_at(vars(fullPropBefore, fullPropAfter), .funs = list(sd,var))
# Same per-state summary, with the functions named in the list (`stdv` for
# sd, `variance` for var).
minwage %>%
  group_by(state) %>%
  summarize_at(
    vars(fullPropBefore, fullPropAfter),
    .funs = list(stdv = sd, variance = var)
  )