Week 8: Apply it to your data 7

Import your data

data <- read_excel("mydatasal.xlsx")

#
head(data)

## # A tibble: 6 × 13
##     age workclass     degree marital_status occupation relationship race  gender
##   <dbl> <chr>         <chr>  <chr>          <chr>      <chr>        <chr> <chr> 
## 1    39 State-gov     Bache… Never-married  Adm-cleri… Not-in-fami… White Male  
## 2    50 Self-emp-not… Bache… Married-civ-s… Exec-mana… Husband      White Male  
## 3    38 Private       HS-gr… Divorced       Handlers-… Not-in-fami… White Male  
## 4    53 Private       11th   Married-civ-s… Handlers-… Husband      Black Male  
## 5    28 Private       Bache… Married-civ-s… Prof-spec… Wife         Black Female
## 6    37 Private       Maste… Married-civ-s… Exec-mana… Wife         White Female
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## #   country <chr>, salary <chr>

data_small <- data %>%
  select(occupation, relationship, race, gender, hoursperweek, country, salary) %>%
  filter(country %in% c("United-States", "Canada"))

Pivoting

long to wide form

wide_hours_by_gender <- data_small %>%
group_by(occupation, gender) %>%
summarise(avg_hours = mean(hoursperweek, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = gender, values_from = avg_hours)

kable(wide_hours_by_gender)

occupation	Female	Male
?	29.92980	33.35027
Adm-clerical	36.78301	39.34810
Armed-Forces	NA	40.66667
Craft-repair	39.75510	42.52112
Exec-managerial	41.41061	46.32423
Farming-fishing	37.38983	48.22141
Handlers-cleaners	35.53472	37.97708
Machine-op-inspct	38.87982	41.51958
Other-service	33.00254	35.21270
Priv-house-serv	30.25301	39.85714
Prof-specialty	39.51367	44.27804
Protective-serv	38.35211	43.60149
Sales	34.13737	44.25170
Tech-support	37.34375	40.79362
Transport-moving	37.12941	45.25177

###2

wide_salary_by_country <- data_small %>%
count(country, salary) %>%
pivot_wider(names_from = salary, values_from = n, values_fill = 0)

kable(wide_salary_by_country)

country	<=50K	>50K
Canada	82	39
United-States	21999	7171

wide to long form

long_hours_by_gender <- wide_hours_by_gender %>%
pivot_longer(cols = -occupation,
names_to = "gender",
values_to = "avg_hours")

kable(long_hours_by_gender)

occupation	gender	avg_hours
?	Female	29.92980
?	Male	33.35027
Adm-clerical	Female	36.78301
Adm-clerical	Male	39.34810
Armed-Forces	Female	NA
Armed-Forces	Male	40.66667
Craft-repair	Female	39.75510
Craft-repair	Male	42.52112
Exec-managerial	Female	41.41061
Exec-managerial	Male	46.32423
Farming-fishing	Female	37.38983
Farming-fishing	Male	48.22141
Handlers-cleaners	Female	35.53472
Handlers-cleaners	Male	37.97708
Machine-op-inspct	Female	38.87982
Machine-op-inspct	Male	41.51958
Other-service	Female	33.00254
Other-service	Male	35.21270
Priv-house-serv	Female	30.25301
Priv-house-serv	Male	39.85714
Prof-specialty	Female	39.51367
Prof-specialty	Male	44.27804
Protective-serv	Female	38.35211
Protective-serv	Male	43.60149
Sales	Female	34.13737
Sales	Male	44.25170
Tech-support	Female	37.34375
Tech-support	Male	40.79362
Transport-moving	Female	37.12941
Transport-moving	Male	45.25177

data_small_unique <- data_small %>% distinct(occupation, gender, .keep_all = TRUE)

data_wide_unique <- data_small_unique %>%
group_by(occupation, gender) %>%
summarise(avg_hours = mean(hoursperweek, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = gender, values_from = avg_hours)

kable(data_wide_unique)

occupation	Female	Male
?	32	40
Adm-clerical	30	40
Armed-Forces	NA	40
Craft-repair	40	40
Exec-managerial	40	13
Farming-fishing	45	35
Handlers-cleaners	40	40
Machine-op-inspct	25	40
Other-service	20	15
Priv-house-serv	30	25
Prof-specialty	50	60
Protective-serv	40	52
Sales	40	50
Tech-support	40	40
Transport-moving	30	40

data_long_unique <- data_wide_unique %>%
pivot_longer(cols = -occupation,
names_to = "gender",
values_to = "avg_hours")

kable(data_long_unique)

occupation	gender	avg_hours
?	Female	32
?	Male	40
Adm-clerical	Female	30
Adm-clerical	Male	40
Armed-Forces	Female	NA
Armed-Forces	Male	40
Craft-repair	Female	40
Craft-repair	Male	40
Exec-managerial	Female	40
Exec-managerial	Male	13
Farming-fishing	Female	45
Farming-fishing	Male	35
Handlers-cleaners	Female	40
Handlers-cleaners	Male	40
Machine-op-inspct	Female	25
Machine-op-inspct	Male	40
Other-service	Female	20
Other-service	Male	15
Priv-house-serv	Female	30
Priv-house-serv	Male	25
Prof-specialty	Female	50
Prof-specialty	Male	60
Protective-serv	Female	40
Protective-serv	Male	52
Sales	Female	40
Sales	Male	50
Tech-support	Female	40
Tech-support	Male	40
Transport-moving	Female	30
Transport-moving	Male	40

Separating and Uniting

Separate a column

# Make a  column table3 "hoursperweek/salary"

data_with_pair <- data_small %>%
mutate(pair = paste0(hoursperweek, "/", salary))

head(data_with_pair %>% select(hoursperweek, salary, pair))

## # A tibble: 6 × 3
##   hoursperweek salary pair    
##          <dbl> <chr>  <chr>   
## 1           40 <=50K  40/<=50K
## 2           13 <=50K  13/<=50K
## 3           40 <=50K  40/<=50K
## 4           40 <=50K  40/<=50K
## 5           40 <=50K  40/<=50K
## 6           45 >50K   45/>50K

# Separate pair 

separated <- data_with_pair %>%
separate(pair, into = c("hoursperweek_num", "salary_str"), sep = "/", convert = TRUE)

head(separated %>% select(hoursperweek, salary, hoursperweek_num, salary_str))

## # A tibble: 6 × 4
##   hoursperweek salary hoursperweek_num salary_str
##          <dbl> <chr>             <int> <chr>     
## 1           40 <=50K                40 <=50K     
## 2           13 <=50K                13 <=50K     
## 3           40 <=50K                40 <=50K     
## 4           40 <=50K                40 <=50K     
## 5           40 <=50K                40 <=50K     
## 6           45 >50K                 45 >50K

Unite two columns

# Unite relationship and race into a column

united_underscore <- data_small %>%
unite(rel_race, relationship, race, sep = "_")

head(united_underscore %>% select(rel_race, gender, hoursperweek, country, salary))

## # A tibble: 6 × 5
##   rel_race            gender hoursperweek country       salary
##   <chr>               <chr>         <dbl> <chr>         <chr> 
## 1 Not-in-family_White Male             40 United-States <=50K 
## 2 Husband_White       Male             13 United-States <=50K 
## 3 Not-in-family_White Male             40 United-States <=50K 
## 4 Husband_Black       Male             40 United-States <=50K 
## 5 Wife_White          Female           40 United-States <=50K 
## 6 Husband_White       Male             45 United-States >50K

united_nosep <- data_small %>%
unite(relrace, relationship, race, sep = "", remove = FALSE)  # keep originals

head(united_nosep %>% select(relationship, race, relrace) )

## # A tibble: 6 × 3
##   relationship  race  relrace           
##   <chr>         <chr> <chr>             
## 1 Not-in-family White Not-in-familyWhite
## 2 Husband       White HusbandWhite      
## 3 Not-in-family White Not-in-familyWhite
## 4 Husband       Black HusbandBlack      
## 5 Wife          White WifeWhite         
## 6 Husband       White HusbandWhite

Missing Values

stocks <- tibble(
year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr    = c(   1,    2,    3,    4,    2,    3,    4),
return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

stocks

## # A tibble: 7 × 3
##    year   qtr return
##   <dbl> <dbl>  <dbl>
## 1  2015     1   1.88
## 2  2015     2   0.59
## 3  2015     3   0.35
## 4  2015     4  NA   
## 5  2016     2   0.92
## 6  2016     3   0.17
## 7  2016     4   2.66

treatment <- tribble(
~person,           ~treatment, ~response,
"Derrick Whitmore", 1,           7,
NA,                 2,          10,
NA,                 3,           9,
"Katherine Burke",  1,           4
)

treatment

## # A tibble: 4 × 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 <NA>                     2       10
## 3 <NA>                     3        9
## 4 Katherine Burke          1        4

treatment %>%
fill(person)

## # A tibble: 4 × 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 Derrick Whitmore         2       10
## 3 Derrick Whitmore         3        9
## 4 Katherine Burke          1        4

Week 8: Apply it to your data 7

Connor Sweeney

2025-10-15

Import your data

Pivoting

long to wide form

wide to long form

Separating and Uniting

Separate a column

Unite two columns

Missing Values