# Load the salary workbook into `data`, then print it for a quick preview of
# the columns and first rows.
data <- read_excel(path = "mydatasal.xlsx")
print(data)
## # A tibble: 32,562 × 13
## age workclass degree marital_status occupation relationship race gender
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 39 State-gov Bache… Never-married Adm-cleri… Not-in-fami… White Male
## 2 50 Self-emp-no… Bache… Married-civ-s… Exec-mana… Husband White Male
## 3 38 Private HS-gr… Divorced Handlers-… Not-in-fami… White Male
## 4 53 Private 11th Married-civ-s… Handlers-… Husband Black Male
## 5 28 Private Bache… Married-civ-s… Prof-spec… Wife Black Female
## 6 37 Private Maste… Married-civ-s… Exec-mana… Wife White Female
## 7 49 Private 9th Married-spous… Other-ser… Not-in-fami… Black Female
## 8 52 Self-emp-no… HS-gr… Married-civ-s… Exec-mana… Husband White Male
## 9 31 Private Maste… Never-married Prof-spec… Not-in-fami… White Female
## 10 42 Private Bache… Married-civ-s… Exec-mana… Husband White Male
## # ℹ 32,552 more rows
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## # country <chr>, salary <chr>
## Variation
Visualizing distributions
# Bar chart with one bar per distinct age; bar height is the row count.
ggplot(data, aes(x = age)) +
  geom_bar()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_count()`).

# Tabulate the bar heights shown in the age bar chart: rows per age value.
count(data, age)
## # A tibble: 74 × 2
## age n
## <dbl> <int>
## 1 17 395
## 2 18 550
## 3 19 712
## 4 20 753
## 5 21 720
## 6 22 765
## 7 23 877
## 8 24 798
## 9 25 841
## 10 26 785
## # ℹ 64 more rows
# Histogram of weekly working hours using ggplot2's default binning.
ggplot(data, aes(x = hoursperweek)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Overlaid frequency polygons of weekly hours, one coloured line per
# relationship category, using the default binning.
ggplot(data) +
  geom_freqpoly(aes(x = hoursperweek, colour = relationship))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Typical values
# Fine-grained histogram of weekly hours for spotting typical values.
# One bin per hour: a fractional width such as 0.75 does not line up with
# whole-number values, so it produces periodic empty bins that look like gaps
# in the data.
# NOTE(review): assumes hoursperweek holds whole hours - confirm against the
# data before relying on this.
ggplot(data = data, mapping = aes(x = hoursperweek)) +
  geom_histogram(binwidth = 1)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Unusual values
Missing Values
Covariation
A categorical and continuous variable
Two categorical variables
# Frequency polygons of weekly hours, one coloured line per gender.
# A week has only 168 hours, so a bin 500 hours wide would swallow every
# observation and flatten the distribution; 5-hour bins keep its shape visible.
ggplot(data = data, mapping = aes(x = hoursperweek)) +
  geom_freqpoly(mapping = aes(colour = gender), binwidth = 5)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Two continuous variables
# Scatterplot of two numeric columns: age against weekly hours.
# x is the numeric `age` column rather than the character `gender` column; a
# discrete x would stack every point into a couple of vertical strips.
# The very low alpha makes dense regions stand out despite heavy overplotting.
ggplot(data = data) +
  geom_point(mapping = aes(x = age, y = hoursperweek), alpha = 1 / 100)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Patterns and models
# Horizontal boxplots of weekly hours, one box per race category.
ggplot(data, aes(x = hoursperweek, y = race)) +
  geom_boxplot()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

# Frequency polygons of weekly hours in 5-hour bins, one coloured line per
# salary group, with a descriptive title and legend label.
ggplot(data) +
  geom_freqpoly(
    mapping = aes(x = hoursperweek, colour = salary),
    binwidth = 5
  ) +
  labs(
    color = "Salary",
    title = "Hours per Week Distribution by Salary"
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
