Load Libraries

library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)

Import data

# Load the data set from adult.csv.zip, treating empty strings, "NA" and "?"
# as missing values.
df <- read_csv(
  file = "adult.csv.zip",
  na = c("", "NA", "?")
)

Describe file format

# Print a compact, one-level summary of the columns and their types.
str(object = df, max.level = 1)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   32561 obs. of  15 variables:
 $ age           : int  90 82 66 54 41 34 38 74 68 41 ...
 $ workclass     : chr  "?" "Private" "?" "Private" ...
 $ fnlwgt        : int  77053 132870 186061 140359 264663 216864 150601 88638 422013 70037 ...
 $ education     : chr  "HS-grad" "HS-grad" "Some-college" "7th-8th" ...
 $ education.num : int  9 9 10 4 10 9 6 16 9 10 ...
 $ marital.status: chr  "Widowed" "Widowed" "Widowed" "Divorced" ...
 $ occupation    : chr  "?" "Exec-managerial" "?" "Machine-op-inspct" ...
 $ relationship  : chr  "Not-in-family" "Not-in-family" "Unmarried" "Unmarried" ...
 $ race          : chr  "White" "White" "Black" "White" ...
 $ sex           : chr  "Female" "Female" "Female" "Female" ...
 $ capital.gain  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ capital.loss  : int  4356 4356 4356 3900 3900 3770 3770 3683 3683 3004 ...
 $ hours.per.week: int  40 18 40 40 40 45 40 20 40 60 ...
 $ native.country: chr  "United-States" "United-States" "United-States" "United-States" ...
 $ income        : chr  "<=50K" "<=50K" "<=50K" "<=50K" ...

Convert categorical variables

# Convert the categorical (character) columns to factors.
# `age` is deliberately NOT converted: it is a continuous integer measurement,
# and turning it into a factor would create one level per distinct age and
# prevent numeric summaries or histograms of it.
categorical_cols <- c(
  "workclass", "education", "marital.status", "occupation", "relationship",
  "race", "sex", "native.country", "income"
)
df[categorical_cols] <- lapply(df[categorical_cols], as.factor)

Remove duplicate variables

# Drop education.num from the data frame; the education column is kept.
df <- select(df, -education.num)

VARIATION

Visualising the distribution of a categorical variable: use a bar chart

How you visualise the distribution of a variable will depend on whether the variable is categorical or continuous. To examine the distribution of a categorical variable, use a bar chart:

Visualising the distribution of a continuous variable, use a histogram:

# Histogram of hours.per.week in bins of width 5. The bar fill encodes the bin
# count on a green (low) to red (high) gradient; bar outlines are red.
# after_stat(count) replaces the `..count..` notation, which is deprecated
# since ggplot2 3.4.0.
ggplot(data = df) +
  geom_histogram(
    mapping = aes(x = hours.per.week, fill = after_stat(count)),
    binwidth = 5,
    colour = "red"
  ) +
  scale_fill_gradient("Count", low = "green", high = "red")

To make it easy to see the unusual values, we need to zoom in to small values of the y-axis with coord_cartesian():

# Same histogram, zoomed to the 90-100 hour range and to counts of 0-100 so the
# rare, very large values become visible. coord_cartesian() only changes the
# visible window; the bins are still computed from all rows.
# binwidth is set explicitly (matching the histogram above) instead of relying
# on the default of 30 bins, which ggplot2 reports with a message.
ggplot(data = df) +
  geom_histogram(mapping = aes(x = hours.per.week), binwidth = 5) +
  coord_cartesian(xlim = c(90, 100), ylim = c(0, 100))

COVARIATION

Variation describes the behavior within a variable, covariation describes the behavior between variables.

A categorical and continuous variable

A common way to display the distribution of a continuous variable broken down by a categorical variable is the boxplot.

Two categorical variables

To visualise the covariation between categorical variables, you’ll need to count the number of observations for each combination. One way to do that is to rely on the built-in geom_count():

# Plot education against race with geom_count(), which counts the observations
# at each combination. coord_flip() swaps the two axes; the theme() call
# rotates the x-axis tick labels by 90 degrees.
ggplot(df, aes(x = education, y = race)) +
  geom_count() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Another approach is to compute the counts with dplyr's count(), then visualise them with geom_tile() and the fill aesthetic:

# Count rows per (education, race) pair, then draw one tile per pair with the
# fill colour mapped to the count column `n`. Axes are flipped and the x-axis
# tick labels rotated by 90 degrees.
df %>%
  count(education, race) %>%
  ggplot(aes(x = education, y = race, fill = n)) +
  geom_tile() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

LS0tDQp0aXRsZTogIkV4cGxvcmF0b3J5IERhdGEgQW5hbHlzaXMiDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6IGRlZmF1bHQNCiAgcGRmX2RvY3VtZW50OiBkZWZhdWx0DQotLS0NCiMjI0xvYWQgTGlicmFyaWVzDQpgYGB7ciwgZWNobz1UUlVFLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkodGlkeXIpDQpsaWJyYXJ5KHJlYWRyKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KYGBgDQojIyNJbXBvcnQgZGF0YQ0KYGBge3IsIGVjaG89VFJVRSwgbWVzc2FnZT1UUlVFLCB3YXJuaW5nPVRSVUV9DQpkZiA8LSByZWFkX2NzdignYWR1bHQuY3N2LnppcCcsIG5hID0gYygiIiwgIk5BIiwgIj8iKSkNCmBgYA0KIyMjRGVzY3JpYmUgZmlsZSBmb3JtYXQNCmBgYHtyLCBlY2hvPVRSVUV9DQpzdHIoZGYsIG1heC5sZXZlbCA9IDEpDQpgYGANCiMjI0NvbnZlcnQgY2F0ZWdvcmljYWwgdmFyaWFibGVzDQpgYGB7ciwgZWNobz1UUlVFfQ0KZGYkYWdlIDwtIGFzLmZhY3RvcihkZiRhZ2UpDQpkZiR3b3JrY2xhc3MgPC0gYXMuZmFjdG9yKGRmJHdvcmtjbGFzcykNCmRmJGVkdWNhdGlvbiA8LSBhcy5mYWN0b3IoZGYkZWR1Y2F0aW9uKQ0KZGYkbWFyaXRhbC5zdGF0dXMgPC0gYXMuZmFjdG9yKGRmJG1hcml0YWwuc3RhdHVzKQ0KZGYkb2NjdXBhdGlvbiA8LSBhcy5mYWN0b3IoZGYkb2NjdXBhdGlvbikNCmRmJHJlbGF0aW9uc2hpcCA8LSBhcy5mYWN0b3IoZGYkcmVsYXRpb25zaGlwKQ0KZGYkcmFjZSA8LSBhcy5mYWN0b3IoZGYkcmFjZSkNCmRmJHNleCA8LSBhcy5mYWN0b3IoZGYkc2V4KQ0KZGYkbmF0aXZlLmNvdW50cnkgPC0gYXMuZmFjdG9yKGRmJG5hdGl2ZS5jb3VudHJ5KQ0KZGYkaW5jb21lIDwtIGFzLmZhY3RvcihkZiRpbmNvbWUpDQpgYGANCiMjI1JlbW92ZSBkdXBsaWNhdGUgdmFyaWFibGVzDQpgYGB7ciwgZWNobz1UUlVFfQ0KZGYgPC0gZGYgJT4lDQogIHNlbGVjdCgtZWR1Y2F0aW9uLm51bSkNCmBgYA0KIyNWQVJJQVRJT04gDQojIyNWaXN1YWxpc2luZyB0aGUgZGlzdHJpYnV0aW9uIG9mIGEgY2F0ZWdvcmljYWwgdmFyaWFibGUsIHVzZSBCYXItQ2hhcnQNCkhvdyB5b3UgdmlzdWFsaXNlIHRoZSBkaXN0cmlidXRpb24gb2YgYSB2YXJpYWJsZSB3aWxsIGRlcGVuZCBvbiB3aGV0aGVyIHRoZSB2YXJpYWJsZSBpcyBjYXRlZ29yaWNhbCBvciBjb250aW51b3VzLiBUbyBleGFtaW5lIHRoZSBkaXN0cmlidXRpb24gb2YgYSBjYXRlZ29yaWNhbCB2YXJpYWJsZSwgdXNlIGEgYmFyIGNoYXJ0Og0KYGBge3IsIGVjaG89RkFMU0V9DQpnZ3Bsb3QoZGYsIGFlcyh4ID1pbmNvbWUpKSArDQogIGdlb21fYmFyKGFlcyh5ID0gKC4uY291bnQuLikvc3VtKC4uY291bnQuLikpKSArDQogIGdlb21fdGV4dChhZXMoeSA9ICgoLi5jb3VudC4uKS9zdW0oLi5jb3VudC4uKSksIGxhYmVsID0gc2NhbGVzOjpwZXJjZW50KCguLmNvdW50Li4pL3N1bSguLmNvdW50Li4pKSksIHN0YXQgPSAiY291bnQiLCB2anVzdCA9IC0w
LjI1KSArDQogIGxhYnModGl0bGUgPSAiSW5jb21lIERpc3RyaWJ1dGlvbiIsIHkgPSAiUGVyY2VudCIsIHggPSAiSW5jb21lIikNCmBgYA0KYGBge3IsIGVjaG89RkFMU0V9DQogIGdncGxvdChkZiwgYWVzKHg9cmVvcmRlcihlZHVjYXRpb24sIC10YWJsZShlZHVjYXRpb24pW2VkdWNhdGlvbl0pKSkrDQogIGdlb21fYmFyKGFlcyh5ID0gKC4uY291bnQuLikvc3VtKC4uY291bnQuLikpKSArDQogIGdlb21fdGV4dChhZXMoeSA9ICgoLi5jb3VudC4uKS9zdW0oLi5jb3VudC4uKSksIGxhYmVsID0gc2NhbGVzOjpwZXJjZW50KCguLmNvdW50Li4pL3N1bSguLmNvdW50Li4pKSksIHN0YXQgPSAiY291bnQiLCB2anVzdCA9IC0wLjI1KSArDQogIGxhYnModGl0bGUgPSAiRWR1Y2F0aW9uIERpc3RyaWJ1dGlvbiIsIHkgPSAiUGVyY2VudCIsIHggPSAiRWR1Y2F0aW9uIikgKw0KICB0aGVtZShheGlzLnRleHQueD1lbGVtZW50X3RleHQoYW5nbGU9OTAsaGp1c3Q9MSx2anVzdD0wLjUpKQ0KYGBgDQojIyNWaXN1YWxpc2luZyB0aGUgZGlzdHJpYnV0aW9uIG9mIGEgY29udGludW91cyB2YXJpYWJsZSwgdXNlIGEgaGlzdG9ncmFtOg0KYGBge3J9DQpnZ3Bsb3QoZGF0YSA9IGRmKSArDQogIGdlb21faGlzdG9ncmFtKG1hcHBpbmcgPSBhZXMoeCA9IGhvdXJzLnBlci53ZWVrLCBmaWxsPS4uY291bnQuLiksIGJpbndpZHRoID0gNSwgY29sPSJyZWQiKSArDQogIHNjYWxlX2ZpbGxfZ3JhZGllbnQoIkNvdW50IiwgbG93ID0gImdyZWVuIiwgaGlnaCA9ICJyZWQiKQ0KYGBgDQpUbyBtYWtlIGl0IGVhc3kgdG8gc2VlIHRoZSB1bnVzdWFsIHZhbHVlcywgd2UgbmVlZCB0byB6b29tIGludG8gdG8gc21hbGwgdmFsdWVzIG9mIHRoZSB5LWF4aXMgd2l0aCBjb29yZF9jYXJ0ZXNpYW4oKToNCmBgYHtyfQ0KZ2dwbG90KGRhdGEgPSBkZikgKw0KICBnZW9tX2hpc3RvZ3JhbShtYXBwaW5nID0gYWVzKHggPSBob3Vycy5wZXIud2VlaykpICsNCiAgY29vcmRfY2FydGVzaWFuKHlsaW0gPSBjKDAsIDEwMCksIHhsaW0gPSBjKDkwLCAxMDApKQ0KYGBgDQojI0NPVkFSSUFUSU9ODQpWYXJpYXRpb24gZGVzY3JpYmVzIHRoZSBiZWhhdmlvciB3aXRoaW4gYSB2YXJpYWJsZSwgY292YXJpYXRpb24gZGVzY3JpYmVzIHRoZSBiZWhhdmlvciBiZXR3ZWVuIHZhcmlhYmxlcy4gIA0KDQojIyNBIGNhdGVnb3JpY2FsIGFuZCBjb250aW51b3VzIHZhcmlhYmxlDQp0byBkaXNwbGF5IHRoZSBkaXN0cmlidXRpb24gb2YgYSBjb250aW51b3VzIHZhcmlhYmxlIGJyb2tlbiBkb3duIGJ5IGEgY2F0ZWdvcmljYWwgdmFyaWFibGUgaXMgdGhlIGJveHBsb3QuDQpgYGB7ciwgZWNobz1GQUxTRX0NCmdncGxvdChkYXRhID0gZGYpICsNCiAgZ2VvbV9ib3hwbG90KG1hcHBpbmcgPSBhZXMoeCA9IHJlb3JkZXIoZWR1Y2F0aW9uLGhvdXJzLnBlci53ZWVrLEZVTiA9IG1lZGlhbikgLCB5ID0gaG91cnMucGVyLndlZWspKSArDQogIGxhYnMoeD0nRWR1Y2F0aW9uJywgeT0nSG91cnMgcGVyIHdlZWsn
LCB0aXRsZT0nQ292YXJpYXRpb24gYmV0d2VlbiBob3VycyBhbmQgZWR1Y2F0aW9uJykgKw0KICBjb29yZF9mbGlwKCkNCmBgYA0KIyMjVHdvIGNhdGVnb3JpY2FsIHZhcmlhYmxlcw0KVG8gdmlzdWFsaXNlIHRoZSBjb3ZhcmlhdGlvbiBiZXR3ZWVuIGNhdGVnb3JpY2FsIHZhcmlhYmxlcywgeW914oCZbGwgbmVlZCB0byBjb3VudCB0aGUgbnVtYmVyIG9mIG9ic2VydmF0aW9ucyBmb3IgZWFjaCBjb21iaW5hdGlvbi4gT25lIHdheSB0byBkbyB0aGF0IGlzIHRvIHJlbHkgb24gdGhlIGJ1aWx0LWluIGdlb21fY291bnQoKToNCmBgYHtyfQ0KZ2dwbG90KGRhdGEgPSBkZikgKw0KICBnZW9tX2NvdW50KG1hcHBpbmcgPSBhZXMoeCA9IGVkdWNhdGlvbiwgeSA9IHJhY2UpKSArDQogIHRoZW1lKGF4aXMudGV4dC54PWVsZW1lbnRfdGV4dChhbmdsZT05MCxoanVzdD0xLHZqdXN0PTAuNSkpICsNCiAgY29vcmRfZmxpcCgpDQpgYGANClRoZW4gdmlzdWFsaXNlIHdpdGggZ2VvbV90aWxlKCkgYW5kIHRoZSBmaWxsIGFlc3RoZXRpYzoNCmBgYHtyfQ0KZGYgJT4lIA0KICBjb3VudChlZHVjYXRpb24sIHJhY2UpICU+JSAgDQogIGdncGxvdChtYXBwaW5nID0gYWVzKHggPSBlZHVjYXRpb24sIHkgPSByYWNlKSkgKw0KICAgIGdlb21fdGlsZShtYXBwaW5nID0gYWVzKGZpbGwgPSBuKSkgKw0KICB0aGVtZShheGlzLnRleHQueD1lbGVtZW50X3RleHQoYW5nbGU9OTAsaGp1c3Q9MSx2anVzdD0wLjUpKSArDQogIGNvb3JkX2ZsaXAoKQ0KYGBgDQoNCg==