Group By

income_data <- read.csv("./censusincome.csv")
group_by_workclass <- income_data |>
  group_by(workclass) |>
  summarize(avg_hours_per_week = mean(hours_per_work,na.rm = TRUE))
head(group_by_workclass)
## # A tibble: 6 × 2
##   workclass       avg_hours_per_week
##   <chr>                        <dbl>
## 1 " ?"                          31.9
## 2 " Federal-gov"                41.4
## 3 " Local-gov"                  41.0
## 4 " Never-worked"               28.4
## 5 " Private"                    40.3
## 6 " Self-emp-inc"               48.8
  1. Grouping by work class
cleaned_income_data <- income_data |>
  filter(workclass != " ?")

#creating the the grouping by work class
group_by_workclass <- cleaned_income_data |>
  group_by(workclass) |>
  summarise(
    avg_hours_per_week = mean(hours_per_work, na.rm = TRUE),
    number_of_employees = n()
    )
group_by_workclass
## # A tibble: 8 × 3
##   workclass           avg_hours_per_week number_of_employees
##   <chr>                            <dbl>               <int>
## 1 " Federal-gov"                    41.4                 960
## 2 " Local-gov"                      41.0                2093
## 3 " Never-worked"                   28.4                   7
## 4 " Private"                        40.3               22696
## 5 " Self-emp-inc"                   48.8                1116
## 6 " Self-emp-not-inc"               44.4                2541
## 7 " State-gov"                      39.0                1298
## 8 " Without-pay"                    32.7                  14
#creating a line plot for the working class against average hours per week
ggplot(group_by_workclass, aes(x=workclass, y=avg_hours_per_week, group=1)) +
  geom_line(color = "blue", size=0.9) +
  geom_point(color = "blue", size=2) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  labs(title = "Average Hours Worked Per Workclass", x="Workclass", y="Average Hours Per Week")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

  1. Grouping by Education
#creating the group by education
group_by_education <- cleaned_income_data |>
  group_by(education) |>
  summarise(
    avg_hours_per_week = mean(hours_per_work),
    number_of_employees = n()
    ) 
group_by_education
## # A tibble: 16 × 3
##    education       avg_hours_per_week number_of_employees
##    <chr>                        <dbl>               <int>
##  1 " 10th"                       37.5                 833
##  2 " 11th"                       34.2                1057
##  3 " 12th"                       35.9                 393
##  4 " 1st-4th"                    38.5                 156
##  5 " 5th-6th"                    39.1                 303
##  6 " 7th-8th"                    40.2                 574
##  7 " 9th"                        38.7                 463
##  8 " Assoc-acdm"                 41.2                1020
##  9 " Assoc-voc"                  42.0                1321
## 10 " Bachelors"                  43.0                5182
## 11 " Doctorate"                  47.6                 398
## 12 " HS-grad"                    41.0                9969
## 13 " Masters"                    44.2                1675
## 14 " Preschool"                  36.9                  46
## 15 " Prof-school"                48.0                 558
## 16 " Some-college"               39.4                6777
#creating the group by education
group_by_marital <- cleaned_income_data |>
  group_by(marital_status) |>
  summarise(
    avg_hours_per_week = mean(hours_per_work),
    number_of_employees = n()
    ) 
group_by_marital
## # A tibble: 7 × 3
##   marital_status           avg_hours_per_week number_of_employees
##   <chr>                                 <dbl>               <int>
## 1 " Divorced"                            41.5                4259
## 2 " Married-AF-spouse"                   44.2                  21
## 3 " Married-civ-spouse"                  43.8               14340
## 4 " Married-spouse-absent"               40.0                 389
## 5 " Never-married"                       37.3                9917
## 6 " Separated"                           39.7                 959
## 7 " Widowed"                             34.6                 840

Assigning Probability Tags

#first we compute our sample size i.e. total number of people
sample_size <- nrow(cleaned_income_data)
sample_size
## [1] 30725
#We add a column for probability for all the 3 groupings that we had:
group_by_workclass <- group_by_workclass |>
  mutate(probability = number_of_employees / sample_size )
group_by_workclass
## # A tibble: 8 × 4
##   workclass           avg_hours_per_week number_of_employees probability
##   <chr>                            <dbl>               <int>       <dbl>
## 1 " Federal-gov"                    41.4                 960    0.0312  
## 2 " Local-gov"                      41.0                2093    0.0681  
## 3 " Never-worked"                   28.4                   7    0.000228
## 4 " Private"                        40.3               22696    0.739   
## 5 " Self-emp-inc"                   48.8                1116    0.0363  
## 6 " Self-emp-not-inc"               44.4                2541    0.0827  
## 7 " State-gov"                      39.0                1298    0.0422  
## 8 " Without-pay"                    32.7                  14    0.000456
#we do the same thing for grouping by education
group_by_education <- group_by_education |>
  mutate(probability = number_of_employees / sample_size )
group_by_education
## # A tibble: 16 × 4
##    education       avg_hours_per_week number_of_employees probability
##    <chr>                        <dbl>               <int>       <dbl>
##  1 " 10th"                       37.5                 833     0.0271 
##  2 " 11th"                       34.2                1057     0.0344 
##  3 " 12th"                       35.9                 393     0.0128 
##  4 " 1st-4th"                    38.5                 156     0.00508
##  5 " 5th-6th"                    39.1                 303     0.00986
##  6 " 7th-8th"                    40.2                 574     0.0187 
##  7 " 9th"                        38.7                 463     0.0151 
##  8 " Assoc-acdm"                 41.2                1020     0.0332 
##  9 " Assoc-voc"                  42.0                1321     0.0430 
## 10 " Bachelors"                  43.0                5182     0.169  
## 11 " Doctorate"                  47.6                 398     0.0130 
## 12 " HS-grad"                    41.0                9969     0.324  
## 13 " Masters"                    44.2                1675     0.0545 
## 14 " Preschool"                  36.9                  46     0.00150
## 15 " Prof-school"                48.0                 558     0.0182 
## 16 " Some-college"               39.4                6777     0.221
#finally we do it for the group by marital status
#We add a column for probability for all the 3 groupings that we had:
group_by_marital <- group_by_marital |>
  mutate(probability = number_of_employees / sample_size )
group_by_marital
## # A tibble: 7 × 4
##   marital_status           avg_hours_per_week number_of_employees probability
##   <chr>                                 <dbl>               <int>       <dbl>
## 1 " Divorced"                            41.5                4259    0.139   
## 2 " Married-AF-spouse"                   44.2                  21    0.000683
## 3 " Married-civ-spouse"                  43.8               14340    0.467   
## 4 " Married-spouse-absent"               40.0                 389    0.0127  
## 5 " Never-married"                       37.3                9917    0.323   
## 6 " Separated"                           39.7                 959    0.0312  
## 7 " Widowed"                             34.6                 840    0.0273
#finding the categories with the minimum probabilities for each grouping

#for the workclass grouping
lowest_prob_workclass <- group_by_workclass |>
  filter(probability == min(probability))
lowest_prob_workclass
## # A tibble: 1 × 4
##   workclass       avg_hours_per_week number_of_employees probability
##   <chr>                        <dbl>               <int>       <dbl>
## 1 " Never-worked"               28.4                   7    0.000228
#for the education grouping
lowest_prob_education <- group_by_education |>
  filter(probability == min(probability))
lowest_prob_education
## # A tibble: 1 × 4
##   education    avg_hours_per_week number_of_employees probability
##   <chr>                     <dbl>               <int>       <dbl>
## 1 " Preschool"               36.9                  46     0.00150
#for marital status grouping
lowest_prob_marital <- group_by_marital |>
  filter(probability == min(probability))
lowest_prob_marital
## # A tibble: 1 × 4
##   marital_status       avg_hours_per_week number_of_employees probability
##   <chr>                             <dbl>               <int>       <dbl>
## 1 " Married-AF-spouse"               44.2                  21    0.000683

The insights that we can gather from above include the following:

Combinations of Categories

group_by_workclass_income <- cleaned_income_data |>
  group_by(workclass, income) |>
  summarise(number_of_employees = n())
## `summarise()` has grouped output by 'workclass'. You can override using the
## `.groups` argument.
group_by_workclass_income
## # A tibble: 14 × 3
## # Groups:   workclass [8]
##    workclass           income   number_of_employees
##    <chr>               <chr>                  <int>
##  1 " Federal-gov"      " <=50K"                 589
##  2 " Federal-gov"      " >50K"                  371
##  3 " Local-gov"        " <=50K"                1476
##  4 " Local-gov"        " >50K"                  617
##  5 " Never-worked"     " <=50K"                   7
##  6 " Private"          " <=50K"               17733
##  7 " Private"          " >50K"                 4963
##  8 " Self-emp-inc"     " <=50K"                 494
##  9 " Self-emp-inc"     " >50K"                  622
## 10 " Self-emp-not-inc" " <=50K"                1817
## 11 " Self-emp-not-inc" " >50K"                  724
## 12 " State-gov"        " <=50K"                 945
## 13 " State-gov"        " >50K"                  353
## 14 " Without-pay"      " <=50K"                  14

We can see from the above results that our groupings for the workclass category have now been subdivided even further i.e. into \(\le 50k\) and \(> 50k\) categories. To better visualize this we can plot the results above using bar plot as follows:

ggplot(group_by_workclass_income, aes(x=workclass, y=number_of_employees,fill = income)) +
         geom_bar(stat = "identity", position="dodge")+
         theme(axis.text.x = element_text(angle = 45, hjust=1)) +
         labs( title = "Number of Employees by Workclass and Income", x="workclass", y="Number of employees") +
         scale_fill_manual(values = c(" <=50K"= "red", " >50K" = "blue"))

Conclusion

Groupings help us create categories of our data set and visualize our data set better. For the Census Income data luckily we had enough categorical columns so we did not have to create bins from continuous columns. However, we able to use the groups and probability concepts to derive key insights from our data set. Of course, even more groupings and combinations could be created for deeper analysis.