For information about the data, click the link here. https://github.com/rfordatascience/tidytuesday/tree/master/data/2018/2018-10-16

library(tidyverse)
library(scales)
# Load the recent-graduates CSV from disk and store it as a tibble
recent_grads <- as_tibble(
  read.csv("~/R/Business Sat/DATA/recent_grads.csv")
)

Q1 Describe the first observation

Q2 How many majors are there in the Business Major_category?

Hint: Use count.

# Tally the number of majors (rows) within each Major_category
count(recent_grads, Major_category)
## # A tibble: 16 x 2
##    Major_category                          n
##    <fct>                               <int>
##  1 Agriculture & Natural Resources        10
##  2 Arts                                    8
##  3 Biology & Life Science                 14
##  4 Business                               13
##  5 Communications & Journalism             4
##  6 Computers & Mathematics                11
##  7 Education                              16
##  8 Engineering                            29
##  9 Health                                 12
## 10 Humanities & Liberal Arts              15
## 11 Industrial Arts & Consumer Services     7
## 12 Interdisciplinary                       1
## 13 Law & Public Policy                     5
## 14 Physical Sciences                      10
## 15 Psychology & Social Work                9
## 16 Social Science                          9

Q3 What major has the highest median earnings?

Hint: Take recent_grads, pipe it to dplyr::arrange, and pipe it to dplyr::select.

# Sort majors from highest to lowest median earnings and keep only the
# columns needed to read off the answer; the first row is the top earner.
recent_grads %>%
  arrange(desc(Median)) %>%
  select(Major, Major_category, Median)

Q4 Is there a gender gap in wages? Describe the relationship between ShareWomen and Median by creating a scatter plot.

Hint: Take recent_grads and pipe it to ggplot(). Map ShareWomen to the x-axis and Median to the y-axis. Use geom_point() for the scatter plot.

# Plain scatter plot of median earnings against the share of women in each
# major; only x and y are mapped here (colour is introduced in Q5).
recent_grads %>%
  ggplot(aes(ShareWomen, Median)) +
  geom_point()

Q5 Does Major_category have anything to do with median earnings?

Hint: Add the third variable to the aes function by mapping Major_category to color.


 # Scatter plot of Median against ShareWomen, one colour per Major_category
 ggplot(
   recent_grads,
   aes(x = ShareWomen, y = Median, colour = Major_category)
 ) +
   geom_point()

Q6 Lump together least common factor levels into “Other”. There are too many levels in Major_category.

Hint: Take recent_grads, pipe it to mutate(Major_category = fct_lump(Major_category, 4)), and pipe it to ggplot().

# Keep the 4 most common categories, lump the rest into "Other", then plot
mutate(recent_grads, Major_category = fct_lump(Major_category, n = 4)) %>%
  ggplot(aes(x = ShareWomen, y = Median, colour = Major_category)) +
  geom_point()

Q7 Add the regression line.

Hint: Add geom_smooth(aes(group=1), method = "lm") to the ggplot() code.

# Lumped-category scatter plot with a single linear fit across all points
# (group = 1 overrides the per-colour grouping for the smoother).
mutate(recent_grads, Major_category = fct_lump(Major_category, n = 4)) %>%
  ggplot(aes(x = ShareWomen, y = Median, colour = Major_category)) +
  geom_point() +
  geom_smooth(mapping = aes(group = 1), method = "lm")

Q8 Convert the numbers on the x-axis into the percent format.

Hint: Add scale_x_continuous(labels = percent_format()) to the ggplot() code.

# Same plot as before, with the x-axis tick labels formatted as percentages
mutate(recent_grads, Major_category = fct_lump(Major_category, n = 4)) %>%
  ggplot(aes(x = ShareWomen, y = Median, colour = Major_category)) +
  geom_point() +
  geom_smooth(mapping = aes(group = 1), method = "lm") +
  scale_x_continuous(labels = scales::percent_format())

Q9 Convert the numbers on the y-axis into the dollar format.

Hint: Add scale_y_continuous(labels = scales::dollar_format()) to the ggplot() code.

# Percent labels on the x-axis and dollar labels on the y-axis
mutate(recent_grads, Major_category = fct_lump(Major_category, n = 4)) %>%
  ggplot(aes(x = ShareWomen, y = Median, colour = Major_category)) +
  geom_point() +
  geom_smooth(mapping = aes(group = 1), method = "lm") +
  scale_x_continuous(labels = scales::percent_format()) +
  scale_y_continuous(labels = dollar_format())

Q10 Expand the y-axis to zero.

Hint: Add expand_limits() to the ggplot() code.

# Formatted axes, with the y-axis range stretched to include zero
mutate(recent_grads, Major_category = fct_lump(Major_category, n = 4)) %>%
  ggplot(aes(x = ShareWomen, y = Median, colour = Major_category)) +
  scale_x_continuous(labels = scales::percent_format()) +
  scale_y_continuous(labels = dollar_format()) +
  expand_limits(y = 0) +
  geom_point() +
  geom_smooth(mapping = aes(group = 1), method = "lm")

Q11 What majors appear to be outliers (far away from the regression line)?

Hint: Add the third variable to the aes function by mapping Major to label. Assign the result to g and, in the next two lines, type library(plotly) and then ggplotly(g).

  # Build the plot with Major mapped to the label aesthetic. The mapping has
  # to live inside aes(); passed as a plain argument to ggplot() it is never
  # mapped, and the interactive tooltip would not show the major's name.
  g <-
  recent_grads %>%
  mutate(Major_category = fct_lump(Major_category, 4)) %>%
  ggplot(aes(ShareWomen, Median, color = Major_category, label = Major)) +
  scale_x_continuous(labels = percent_format()) +
  scale_y_continuous(labels = scales::dollar_format()) +
  expand_limits(y = 0) +
  geom_point() +
  geom_smooth(aes(group = 1), method = "lm")
# Convert the static plot to an interactive one so hovering over a point
# reveals its mapped aesthetics.
library(plotly)
ggplotly(g)

Q12 Are the outliers valid in terms of the sample size?

Hint: Add the third variable to the aes function by mapping Sample_size to size.

# Map Sample_size to point size (and Major to label) inside aes(). Arguments
# given to ggplot() outside aes() are not aesthetic mappings, and the
# aesthetic name goes on the left: size = Sample_size.
g <-
  recent_grads %>%
  mutate(Major_category = fct_lump(Major_category, 4)) %>%
  ggplot(aes(ShareWomen, Median,
             color = Major_category,
             size = Sample_size,
             label = Major)) +
  scale_x_continuous(labels = percent_format()) +
  scale_y_continuous(labels = scales::dollar_format()) +
  expand_limits(y = 0) +
  geom_point() +
  geom_smooth(aes(group = 1), method = "lm")
# Interactive version: hover over a point to see its major and sample size.
library(plotly)
ggplotly(g)