This fictional dataset features 150,000 rows and 6 columns for exploratory data analysis (EDA). Click here.

Setting up my environment

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

Import data and clean

# Read the raw CSV, convert column names to snake_case, treat single-space
# strings as missing, then drop incomplete and duplicated rows.
toys <- read_csv("toy_dataset.csv") %>%
  clean_names() %>%
  # Only character columns can hold the " " placeholder. Restricting na_if()
  # to them avoids comparing numeric columns against a string, which newer
  # dplyr releases reject with a type error.
  mutate(across(where(is.character), ~ na_if(.x, " "))) %>%
  drop_na() %>%
  distinct()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Number = col_double(),
##   City = col_character(),
##   Gender = col_character(),
##   Age = col_double(),
##   Income = col_double(),
##   Illness = col_character()
## )
# Inspect the dimensions and column types of the cleaned data.
str(toys)
## tibble [150,000 × 6] (S3: tbl_df/tbl/data.frame)
##  $ number : num [1:150000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ city   : chr [1:150000] "Dallas" "Dallas" "Dallas" "Dallas" ...
##  $ gender : chr [1:150000] "Male" "Male" "Male" "Male" ...
##  $ age    : num [1:150000] 41 54 42 40 46 36 32 39 51 30 ...
##  $ income : num [1:150000] 40367 45084 52483 40941 50289 ...
##  $ illness: chr [1:150000] "No" "No" "No" "No" ...
# Per-column summary statistics (min, quartiles, mean, max for numeric columns).
summary(toys)
##      number           city              gender               age       
##  Min.   :     1   Length:150000      Length:150000      Min.   :25.00  
##  1st Qu.: 37501   Class :character   Class :character   1st Qu.:35.00  
##  Median : 75000   Mode  :character   Mode  :character   Median :45.00  
##  Mean   : 75000                                         Mean   :44.95  
##  3rd Qu.:112500                                         3rd Qu.:55.00  
##  Max.   :150000                                         Max.   :65.00  
##      income         illness         
##  Min.   :  -654   Length:150000     
##  1st Qu.: 80868   Class :character  
##  Median : 93655   Mode  :character  
##  Mean   : 91253                     
##  3rd Qu.:104519                     
##  Max.   :177157
  • One individual reported an income below 0, which we will investigate.
# Locate every row with a negative income and display those rows, using the
# computed positions rather than a hard-coded row number.
indices <- which(toys$income < 0)
toys[indices, ]
## # A tibble: 1 x 6
##   number city   gender   age income illness
##    <dbl> <chr>  <chr>  <dbl>  <dbl> <chr>  
## 1    246 Dallas Female    40   -654 Yes
  • Since our data set is large and only one observation has an invalid income (< 0), we will simply remove it from our analysis.
# Drop every observation with a negative income by condition instead of by a
# hard-coded row position, so the step stays correct if the data changes.
# Rows with missing values were already removed when `toys` was built.
toys_a <- toys %>%
  filter(income >= 0)
summary(toys_a)
##      number           city              gender               age       
##  Min.   :     1   Length:149999      Length:149999      Min.   :25.00  
##  1st Qu.: 37502   Class :character   Class :character   1st Qu.:35.00  
##  Median : 75001   Mode  :character   Mode  :character   Median :45.00  
##  Mean   : 75001                                         Mean   :44.95  
##  3rd Qu.:112500                                         3rd Qu.:55.00  
##  Max.   :150000                                         Max.   :65.00  
##      income         illness         
##  Min.   :   584   Length:149999     
##  1st Qu.: 80868   Class :character  
##  Median : 93655   Mode  :character  
##  Mean   : 91253                     
##  3rd Qu.:104519                     
##  Max.   :177157

Data Exploration

# Tabulate the number of rows for each gender value.
table(toys_a$gender)
## 
## Female   Male 
##  66199  83800
# Pie chart of how the rows split across gender values.
toys_a$gender %>%
  as.character() %>%
  hchart(type = "pie") %>%
  hc_title(text = "Gender Distribution")
# Tabulate the number of rows for each illness value.
table(toys_a$illness)
## 
##     No    Yes 
## 137861  12138
# Pie chart of how the rows split across illness values.
toys_a$illness %>%
  as.character() %>%
  hchart(type = "pie") %>%
  hc_title(text = "Illness Pie Chart")
# Tabulate the number of rows for each city.
table(toys_a$city)
## 
##          Austin          Boston          Dallas     Los Angeles   Mountain View 
##           12292            8301           19706           32173           14219 
##   New York City       San Diego Washington D.C. 
##           50307            4881            8120
# Bar chart of the number of observations per city.
ggplot(toys_a, aes(x = city)) +
  geom_bar(color = "pink", fill = "lightblue") +
  labs(y = "Count", x = "City", title = "City Sample Breakdown")

# Mean income per city, sorted by city name; then preview the first rows.
toys_i <- toys_a %>%
  group_by(city) %>%
  summarise(avg_income = mean(income)) %>%
  arrange(city)
toys_i %>% head()
## # A tibble: 6 x 2
##   city          avg_income
##   <chr>              <dbl>
## 1 Austin            90278.
## 2 Boston            91555.
## 3 Dallas            45255.
## 4 Los Angeles       95264.
## 5 Mountain View    135078.
## 6 New York City     96857.
# Box plot of the income distribution within each city; the x-axis labels are
# rotated so the longer city names do not overlap.
ggplot(toys_a, aes(x = city, y = income)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  # A box plot shows medians and quartiles rather than means, so the title
  # describes the distribution instead of an average.
  labs(x = "City", y = "Income", title = "Income Distribution by City")

# Mean income for every age/gender combination. `.groups = "drop"` returns an
# ungrouped tibble, so later steps are not affected by leftover grouping and
# summarize() no longer emits its regrouping message.
toys_b <-
  toys_a %>%
  group_by(age, gender) %>%
  summarize(avg_income = mean(income), .groups = "drop")
# Line chart of mean income by age, with one manually coloured line per gender.
ggplot(
  data = toys_b,
  mapping = aes(x = age, y = avg_income, color = gender)
) +
  geom_line(size = 1) +
  scale_color_manual(values = c("#1b98e0", "#353436")) +
  labs(
    title = "Average Income throughout Lifetime",
    x = "Age",
    y = "Average Income"
  )

  • Here we can see that the average income stays relatively stable throughout an individual's lifetime, but there is a clear discrepancy in average income between genders.
# Income distribution: a density-scaled histogram with a kernel density curve
# on top and a dashed vertical line at the mean income.
ggplot(toys_a, aes(x = income)) +
  # The histogram is put on the density scale so it shares a y-axis with
  # geom_density(); raw counts would dwarf the density curve.
  geom_histogram(aes(y = after_stat(density)), bins = 50,
                 color = "white", fill = "lightblue") +
  # Fixed colours are set outside aes(); inside aes() the string "red" would
  # be treated as a data value and add a spurious legend.
  geom_density(color = "red", fill = "red", alpha = 0.2) +
  # A single precomputed intercept draws one line instead of one per row.
  geom_vline(xintercept = mean(toys_a$income, na.rm = TRUE),
             color = "black", linetype = "dashed", size = 1) +
  labs(y = "Density", x = "Income", title = "Distribution of Income")