# Load the salary workbook into a tibble and display its first rows.
data <- read_excel(path = "mydatasal.xlsx")
print(data)
## # A tibble: 32,562 × 13
##      age workclass    degree marital_status occupation relationship race  gender
##    <dbl> <chr>        <chr>  <chr>          <chr>      <chr>        <chr> <chr> 
##  1    39 State-gov    Bache… Never-married  Adm-cleri… Not-in-fami… White Male  
##  2    50 Self-emp-no… Bache… Married-civ-s… Exec-mana… Husband      White Male  
##  3    38 Private      HS-gr… Divorced       Handlers-… Not-in-fami… White Male  
##  4    53 Private      11th   Married-civ-s… Handlers-… Husband      Black Male  
##  5    28 Private      Bache… Married-civ-s… Prof-spec… Wife         Black Female
##  6    37 Private      Maste… Married-civ-s… Exec-mana… Wife         White Female
##  7    49 Private      9th    Married-spous… Other-ser… Not-in-fami… Black Female
##  8    52 Self-emp-no… HS-gr… Married-civ-s… Exec-mana… Husband      White Male  
##  9    31 Private      Maste… Never-married  Prof-spec… Not-in-fami… White Female
## 10    42 Private      Bache… Married-civ-s… Exec-mana… Husband      White Male  
## # ℹ 32,552 more rows
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## #   country <chr>, salary <chr>
## Variation

Visualizing distributions

# Bar chart: one bar per distinct age value, bar height = number of rows.
ggplot(data, aes(x = age)) +
  geom_bar()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_count()`).

# Tabulate the same per-age counts that the bar chart displays.
count(data, age)
## # A tibble: 74 × 2
##      age     n
##    <dbl> <int>
##  1    17   395
##  2    18   550
##  3    19   712
##  4    20   753
##  5    21   720
##  6    22   765
##  7    23   877
##  8    24   798
##  9    25   841
## 10    26   785
## # ℹ 64 more rows
# Histogram of hoursperweek using ggplot2's default binning
# (no bins/binwidth supplied, so stat_bin() falls back to 30 bins).
ggplot(data, aes(x = hoursperweek)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Overlaid frequency polygons of hoursperweek, one coloured line per
# relationship category (default binning).
ggplot(data = data) +
  geom_freqpoly(mapping = aes(x = hoursperweek, colour = relationship))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Typical values

# Histogram of hoursperweek with narrow (0.75-wide) bins so that the most
# common individual values stand out.
ggplot(data = data) +
  geom_histogram(mapping = aes(x = hoursperweek), binwidth = 0.75)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Unusual values

Missing values

Covariation

A categorical and continuous variable

Two categorical variables

# Frequency polygons of hoursperweek, one coloured line per gender.
# binwidth is in the units of the x variable; the previous value of 500 was
# wider than the whole range of weekly hours, so every row landed in a single
# bin and no distribution was visible. A width of 5 matches the salary plot
# further down and resolves the shape of the distribution.
ggplot(data = data, mapping = aes(x = hoursperweek)) +
  geom_freqpoly(mapping = aes(colour = gender), binwidth = 5)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Two continuous variables

# Scatterplot of two continuous variables: age against hoursperweek.
# The previous x aesthetic was gender (a character column), which collapses
# the points into two vertical strips and is not a continuous/continuous
# comparison. age is a <dbl> column, as is hoursperweek.
# alpha = 1 / 100 keeps heavy overplotting readable: only regions where many
# rows coincide become visibly dark.
ggplot(data = data) +
  geom_point(mapping = aes(x = age, y = hoursperweek), alpha = 1 / 100)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Patterns and models

# Horizontal boxplots: distribution of hoursperweek for each race category.
ggplot(data, aes(x = hoursperweek, y = race)) +
  geom_boxplot()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

# Frequency polygons of hoursperweek split by salary, using 5-wide bins,
# with an explicit plot title and legend title.
ggplot(data = data) +
  geom_freqpoly(
    mapping = aes(x = hoursperweek, color = salary),
    binwidth = 5
  ) +
  labs(
    title = "Hours per Week Distribution by Salary",
    color = "Salary"
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).