data <- read_excel("mydatasal.xlsx")
#
head(data)
## # A tibble: 6 × 13
## age workclass degree marital_status occupation relationship race gender
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 39 State-gov Bache… Never-married Adm-cleri… Not-in-fami… White Male
## 2 50 Self-emp-not… Bache… Married-civ-s… Exec-mana… Husband White Male
## 3 38 Private HS-gr… Divorced Handlers-… Not-in-fami… White Male
## 4 53 Private 11th Married-civ-s… Handlers-… Husband Black Male
## 5 28 Private Bache… Married-civ-s… Prof-spec… Wife Black Female
## 6 37 Private Maste… Married-civ-s… Exec-mana… Wife White Female
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## # country <chr>, salary <chr>
data_small <- data %>%
select(occupation, relationship, race, gender, hoursperweek, country, salary) %>%
filter(country %in% c("United-States", "Canada"))
wide_hours_by_gender <- data_small %>%
group_by(occupation, gender) %>%
summarise(avg_hours = mean(hoursperweek, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = gender, values_from = avg_hours)
kable(wide_hours_by_gender)
occupation | Female | Male |
---|---|---|
? | 29.92980 | 33.35027 |
Adm-clerical | 36.78301 | 39.34810 |
Armed-Forces | NA | 40.66667 |
Craft-repair | 39.75510 | 42.52112 |
Exec-managerial | 41.41061 | 46.32423 |
Farming-fishing | 37.38983 | 48.22141 |
Handlers-cleaners | 35.53472 | 37.97708 |
Machine-op-inspct | 38.87982 | 41.51958 |
Other-service | 33.00254 | 35.21270 |
Priv-house-serv | 30.25301 | 39.85714 |
Prof-specialty | 39.51367 | 44.27804 |
Protective-serv | 38.35211 | 43.60149 |
Sales | 34.13737 | 44.25170 |
Tech-support | 37.34375 | 40.79362 |
Transport-moving | 37.12941 | 45.25177 |
###2
wide_salary_by_country <- data_small %>%
count(country, salary) %>%
pivot_wider(names_from = salary, values_from = n, values_fill = 0)
kable(wide_salary_by_country)
country | <=50K | >50K |
---|---|---|
Canada | 82 | 39 |
United-States | 21999 | 7171 |
long_hours_by_gender <- wide_hours_by_gender %>%
pivot_longer(cols = -occupation,
names_to = "gender",
values_to = "avg_hours")
kable(long_hours_by_gender)
occupation | gender | avg_hours |
---|---|---|
? | Female | 29.92980 |
? | Male | 33.35027 |
Adm-clerical | Female | 36.78301 |
Adm-clerical | Male | 39.34810 |
Armed-Forces | Female | NA |
Armed-Forces | Male | 40.66667 |
Craft-repair | Female | 39.75510 |
Craft-repair | Male | 42.52112 |
Exec-managerial | Female | 41.41061 |
Exec-managerial | Male | 46.32423 |
Farming-fishing | Female | 37.38983 |
Farming-fishing | Male | 48.22141 |
Handlers-cleaners | Female | 35.53472 |
Handlers-cleaners | Male | 37.97708 |
Machine-op-inspct | Female | 38.87982 |
Machine-op-inspct | Male | 41.51958 |
Other-service | Female | 33.00254 |
Other-service | Male | 35.21270 |
Priv-house-serv | Female | 30.25301 |
Priv-house-serv | Male | 39.85714 |
Prof-specialty | Female | 39.51367 |
Prof-specialty | Male | 44.27804 |
Protective-serv | Female | 38.35211 |
Protective-serv | Male | 43.60149 |
Sales | Female | 34.13737 |
Sales | Male | 44.25170 |
Tech-support | Female | 37.34375 |
Tech-support | Male | 40.79362 |
Transport-moving | Female | 37.12941 |
Transport-moving | Male | 45.25177 |
data_small_unique <- data_small %>% distinct(occupation, gender, .keep_all = TRUE)
data_wide_unique <- data_small_unique %>%
group_by(occupation, gender) %>%
summarise(avg_hours = mean(hoursperweek, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = gender, values_from = avg_hours)
kable(data_wide_unique)
occupation | Female | Male |
---|---|---|
? | 32 | 40 |
Adm-clerical | 30 | 40 |
Armed-Forces | NA | 40 |
Craft-repair | 40 | 40 |
Exec-managerial | 40 | 13 |
Farming-fishing | 45 | 35 |
Handlers-cleaners | 40 | 40 |
Machine-op-inspct | 25 | 40 |
Other-service | 20 | 15 |
Priv-house-serv | 30 | 25 |
Prof-specialty | 50 | 60 |
Protective-serv | 40 | 52 |
Sales | 40 | 50 |
Tech-support | 40 | 40 |
Transport-moving | 30 | 40 |
data_long_unique <- data_wide_unique %>%
pivot_longer(cols = -occupation,
names_to = "gender",
values_to = "avg_hours")
kable(data_long_unique)
occupation | gender | avg_hours |
---|---|---|
? | Female | 32 |
? | Male | 40 |
Adm-clerical | Female | 30 |
Adm-clerical | Male | 40 |
Armed-Forces | Female | NA |
Armed-Forces | Male | 40 |
Craft-repair | Female | 40 |
Craft-repair | Male | 40 |
Exec-managerial | Female | 40 |
Exec-managerial | Male | 13 |
Farming-fishing | Female | 45 |
Farming-fishing | Male | 35 |
Handlers-cleaners | Female | 40 |
Handlers-cleaners | Male | 40 |
Machine-op-inspct | Female | 25 |
Machine-op-inspct | Male | 40 |
Other-service | Female | 20 |
Other-service | Male | 15 |
Priv-house-serv | Female | 30 |
Priv-house-serv | Male | 25 |
Prof-specialty | Female | 50 |
Prof-specialty | Male | 60 |
Protective-serv | Female | 40 |
Protective-serv | Male | 52 |
Sales | Female | 40 |
Sales | Male | 50 |
Tech-support | Female | 40 |
Tech-support | Male | 40 |
Transport-moving | Female | 30 |
Transport-moving | Male | 40 |
# Make a column table3 "hoursperweek/salary"
data_with_pair <- data_small %>%
mutate(pair = paste0(hoursperweek, "/", salary))
head(data_with_pair %>% select(hoursperweek, salary, pair))
## # A tibble: 6 × 3
## hoursperweek salary pair
## <dbl> <chr> <chr>
## 1 40 <=50K 40/<=50K
## 2 13 <=50K 13/<=50K
## 3 40 <=50K 40/<=50K
## 4 40 <=50K 40/<=50K
## 5 40 <=50K 40/<=50K
## 6 45 >50K 45/>50K
# Separate pair
separated <- data_with_pair %>%
separate(pair, into = c("hoursperweek_num", "salary_str"), sep = "/", convert = TRUE)
head(separated %>% select(hoursperweek, salary, hoursperweek_num, salary_str))
## # A tibble: 6 × 4
## hoursperweek salary hoursperweek_num salary_str
## <dbl> <chr> <int> <chr>
## 1 40 <=50K 40 <=50K
## 2 13 <=50K 13 <=50K
## 3 40 <=50K 40 <=50K
## 4 40 <=50K 40 <=50K
## 5 40 <=50K 40 <=50K
## 6 45 >50K 45 >50K
# Unite relationship and race into a column
united_underscore <- data_small %>%
unite(rel_race, relationship, race, sep = "_")
head(united_underscore %>% select(rel_race, gender, hoursperweek, country, salary))
## # A tibble: 6 × 5
## rel_race gender hoursperweek country salary
## <chr> <chr> <dbl> <chr> <chr>
## 1 Not-in-family_White Male 40 United-States <=50K
## 2 Husband_White Male 13 United-States <=50K
## 3 Not-in-family_White Male 40 United-States <=50K
## 4 Husband_Black Male 40 United-States <=50K
## 5 Wife_White Female 40 United-States <=50K
## 6 Husband_White Male 45 United-States >50K
united_nosep <- data_small %>%
unite(relrace, relationship, race, sep = "", remove = FALSE) # keep originals
head(united_nosep %>% select(relationship, race, relrace) )
## # A tibble: 6 × 3
## relationship race relrace
## <chr> <chr> <chr>
## 1 Not-in-family White Not-in-familyWhite
## 2 Husband White HusbandWhite
## 3 Not-in-family White Not-in-familyWhite
## 4 Husband Black HusbandBlack
## 5 Wife White WifeWhite
## 6 Husband White HusbandWhite
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
stocks
## # A tibble: 7 × 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2015 1 1.88
## 2 2015 2 0.59
## 3 2015 3 0.35
## 4 2015 4 NA
## 5 2016 2 0.92
## 6 2016 3 0.17
## 7 2016 4 2.66
treatment <- tribble(
~person, ~treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
treatment
## # A tibble: 4 × 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 <NA> 2 10
## 3 <NA> 3 9
## 4 Katherine Burke 1 4
treatment %>%
fill(person)
## # A tibble: 4 × 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 Derrick Whitmore 2 10
## 3 Derrick Whitmore 3 9
## 4 Katherine Burke 1 4