Setting up my environment
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
Import data and clean
# Read the raw CSV, standardise the column names with janitor::clean_names(),
# turn single-space strings into NA, then drop incomplete and duplicated rows.
toys <- read_csv("toy_dataset.csv") %>%
  clean_names() %>%
  # Only character columns can contain the " " placeholder. Restricting
  # na_if() to them avoids comparing numeric columns against a string, which
  # newer dplyr releases reject with a type error.
  mutate(across(where(is.character), ~ na_if(.x, " "))) %>%
  na.omit() %>%
  distinct()
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Number = col_double(),
## City = col_character(),
## Gender = col_character(),
## Age = col_double(),
## Income = col_double(),
## Illness = col_character()
## )
# Show the structure of the cleaned data: dimensions, column types, first values.
str(toys)
## tibble [150,000 × 6] (S3: tbl_df/tbl/data.frame)
## $ number : num [1:150000] 1 2 3 4 5 6 7 8 9 10 ...
## $ city : chr [1:150000] "Dallas" "Dallas" "Dallas" "Dallas" ...
## $ gender : chr [1:150000] "Male" "Male" "Male" "Male" ...
## $ age : num [1:150000] 41 54 42 40 46 36 32 39 51 30 ...
## $ income : num [1:150000] 40367 45084 52483 40941 50289 ...
## $ illness: chr [1:150000] "No" "No" "No" "No" ...
# Per-column summary; used here to spot implausible values such as the income minimum.
summary(toys)
## number city gender age
## Min. : 1 Length:150000 Length:150000 Min. :25.00
## 1st Qu.: 37501 Class :character Class :character 1st Qu.:35.00
## Median : 75000 Mode :character Mode :character Median :45.00
## Mean : 75000 Mean :44.95
## 3rd Qu.:112500 3rd Qu.:55.00
## Max. :150000 Max. :65.00
## income illness
## Min. : -654 Length:150000
## 1st Qu.: 80868 Class :character
## Median : 93655 Mode :character
## Mean : 91253
## 3rd Qu.:104519
## Max. :177157
- One individual reported an income of less than 0, which we will investigate.
# Locate every row with a negative income and display those rows. Indexing with
# the computed positions (rather than a hard-coded row number) keeps this check
# correct if the data or the cleaning steps change the row order.
indices <- which(toys$income < 0)
toys[indices, ]
## # A tibble: 1 x 6
## number city gender age income illness
## <dbl> <chr> <chr> <dbl> <dbl> <chr>
## 1 246 Dallas Female 40 -654 Yes
- Since our data set is large and only one observation has an invalid income (below 0), we will simply remove that observation from our analysis.
# Keep only rows with a non-negative income. Filtering on the condition itself
# (rather than dropping a row by position) removes every invalid row and
# leaves the data untouched when there are none.
toys_a <- toys %>%
  filter(income >= 0)
# Re-run the summary on the filtered data to confirm the income minimum is now valid.
summary(toys_a)
## number city gender age
## Min. : 1 Length:149999 Length:149999 Min. :25.00
## 1st Qu.: 37502 Class :character Class :character 1st Qu.:35.00
## Median : 75001 Mode :character Mode :character Median :45.00
## Mean : 75001 Mean :44.95
## 3rd Qu.:112500 3rd Qu.:55.00
## Max. :150000 Max. :65.00
## income illness
## Min. : 584 Length:149999
## 1st Qu.: 80868 Class :character
## Median : 93655 Mode :character
## Mean : 91253
## 3rd Qu.:104519
## Max. :177157
Data Exploration
# Count of observations per gender.
table(toys_a$gender)
##
## Female Male
## 66199 83800
# Interactive pie chart (highcharter) of the gender values, with a chart title.
hchart(as.character(toys_a$gender), type = "pie") %>%
hc_title(text = "Gender Distribution")
# Count of observations per illness status.
table(toys_a$illness)
##
## No Yes
## 137861 12138
# Interactive pie chart (highcharter) of the illness values, with a chart title.
hchart(as.character(toys_a$illness), type = "pie") %>%
hc_title(text = "Illness Pie Chart")
# Count of observations per city.
table(toys_a$city)
##
## Austin Boston Dallas Los Angeles Mountain View
## 12292 8301 19706 32173 14219
## New York City San Diego Washington D.C.
## 50307 4881 8120
# Bar chart of the number of observations per city.
# (The stray trailing comma after aes() was removed; it passed an empty
# argument to ggplot().)
ggplot(toys_a, aes(x = city)) +
  geom_bar(color = "pink", fill = "lightblue") +
  labs(y = "Count", x = "City", title = "City Sample Breakdown")

# Mean income per city, ordered by city name, followed by a preview of the
# first rows of the result.
toys_i <- arrange(
  summarize(group_by(toys_a, city), avg_income = mean(income)),
  city
)
head(toys_i)
## # A tibble: 6 x 2
## city avg_income
## <chr> <dbl>
## 1 Austin 90278.
## 2 Boston 91555.
## 3 Dallas 45255.
## 4 Los Angeles 95264.
## 5 Mountain View 135078.
## 6 New York City 96857.
# Box plot of income per city. A box plot shows the median and quartiles
# rather than the mean, so the title describes a distribution, not an average.
ggplot(toys_a, aes(x = city, y = income)) +
  geom_boxplot() +
  # Rotate the x-axis labels by 30 degrees and right-align them.
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(x = "City", y = "Income", title = "Income Distribution by City")

# Mean income for every age/gender combination. `.groups = "drop"` returns an
# ungrouped tibble, so later verbs are not silently applied per age, and it
# suppresses the "grouped output by 'age'" message summarise() would emit.
toys_b <- toys_a %>%
  group_by(age, gender) %>%
  summarize(avg_income = mean(income), .groups = "drop")
# Line chart of mean income against age, one line per gender, drawn with two
# manually chosen colours.
ggplot(
  data = toys_b,
  mapping = aes(x = age, y = avg_income, color = gender)
) +
  geom_line(size = 1) +
  scale_color_manual(values = c("#1b98e0", "#353436")) +
  labs(
    title = "Average Income throughout Lifetime",
    x = "Age",
    y = "Average Income"
  )

- Here we can see that average income stays relatively stable throughout an individual's lifetime, but there is a clear gap in average income between genders.
# Distribution of income: a density-scaled histogram with a kernel density
# curve drawn on top and a dashed vertical line at the mean income.
ggplot(toys_a, aes(x = income)) +
  # A histogram scaled to density shares its y-axis with the density curve;
  # raw per-value counts would not, and would dwarf the curve.
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 50,
    color = "white",
    fill = "lightblue"
  ) +
  # Constant colours are set outside aes(); inside aes() the string "red"
  # is treated as a data value and only produces a legend entry.
  geom_density(color = "red", fill = "red", alpha = 0.2) +
  # Compute the mean once as a fixed intercept instead of mapping it through
  # aes(), which would evaluate it against every row of the data.
  geom_vline(
    xintercept = mean(toys_a$income, na.rm = TRUE),
    color = "black", linetype = "dashed", size = 1
  ) +
  labs(y = "Density", x = "Income", title = "Distribution of Income")
