DATA608_01

Principles of Data Visualization and Introduction to ggplot2

The data provided contains the 5,000 fastest-growing companies in the US, as compiled by Inc. magazine. Let's read it in:

And let's preview this data:

# Per-industry medians (revenue per employee, head count) for NY companies
# whose records have no missing fields.
my_stats <- inc %>%
  dplyr::filter(complete.cases(.), State == "NY") %>%
  mutate(
    RevPerEmp = Revenue / Employees,
    Industry = as.character(Industry)
  ) %>%
  select(Industry, Employees, RevPerEmp) %>%
  group_by(Industry) %>%
  summarise(
    MedRevPerEmp = median(RevPerEmp),
    MedEmpPerIndustry = median(Employees)
  ) %>%
  ungroup()

# Reference values drawn as the "State Median" lines on the Question 2 and
# Question 3 plots. They are hard-coded here, not derived from my_stats.
# NOTE(review): presumably rounded NY-wide medians -- confirm against the data.
my_v_median <- 200000
my_v_medEmp <- 45

# Preview the first three rows of the raw data.
head(inc, 3)

##   Rank                  Name Growth_Rate   Revenue
## 1    1                  Fuhu      421.48 117900000
## 2    2 FederalConference.com      248.31  49600000
## 3    3         The HCI Group      245.45  25500000
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL

# Summarise the four analytic columns, picked by name instead of by
# dropping column positions.
summary(inc[, c("Growth_Rate", "Revenue", "Industry", "Employees")])

##   Growth_Rate         Revenue                                  Industry   
##  Min.   :  0.340   Min.   :2.000e+06   IT Services                 : 733  
##  1st Qu.:  0.770   1st Qu.:5.100e+06   Business Products & Services: 482  
##  Median :  1.420   Median :1.090e+07   Advertising & Marketing     : 471  
##  Mean   :  4.612   Mean   :4.822e+07   Health                      : 355  
##  3rd Qu.:  3.290   3rd Qu.:2.860e+07   Software                    : 342  
##  Max.   :421.480   Max.   :1.010e+10   Financial Services          : 260  
##                                        (Other)                     :2358  
##    Employees      
##  Min.   :    1.0  
##  1st Qu.:   25.0  
##  Median :   53.0  
##  Mean   :  232.7  
##  3rd Qu.:  132.0  
##  Max.   :66803.0  
##  NA's   :12

Below, the summary stats of 4 features of interest provide some background on the data included:

# some functions for reviewing my data and formatting

# Returns a one-row data frame holding the mean of `x` twice: as `y` and as
# `label`. Missing values are ignored for both columns so the two values
# always agree (previously `y` became NA whenever `x` contained an NA while
# `label` did not).
fun_mean <- function(x) {
  avg <- mean(x, na.rm = TRUE)
  data.frame(y = avg, label = avg)
}

# Summarise missing values per column of a data frame.
#
# df: a data frame.
# Returns a data frame with one row per column of `df` that contains at least
# one NA, sorted by NA count (largest first), with columns:
#   Variable   - column name (character)
#   NA_qty     - number of NA values in that column
#   Pct_of_Tot - NA_qty as a percentage of nrow(df), rounded to 2 decimals
# Columns without NAs are omitted, so the result has zero rows when `df`
# has no missing values.
na_review <- function(df) {
  na_counts <- colSums(is.na(df))

  na_qty <- data.frame(
    Variable = as.character(names(na_counts)),
    NA_qty = unname(na_counts),
    stringsAsFactors = FALSE
  )

  # Keep only columns that actually have NAs; order() is stable, so ties
  # stay in their original column order.
  na_qty <- na_qty[na_qty$NA_qty > 0, , drop = FALSE]
  na_qty <- na_qty[order(-na_qty$NA_qty), , drop = FALSE]
  rownames(na_qty) <- NULL

  na_qty$Pct_of_Tot <- round(na_qty$NA_qty / nrow(df), 4) * 100
  na_qty
}
# List the columns of inc that contain missing values.
na_review(inc)

##    Variable NA_qty Pct_of_Tot
## 1 Employees     12       0.24

# Descriptive statistics for the columns of inc, via the psych package.
psych::describe(inc)

##             vars    n        mean           sd    median     trimmed
## Rank           1 5001     2501.64      1443.51 2.502e+03     2501.73
## Name*          2 5001     2501.00      1443.81 2.501e+03     2501.00
## Growth_Rate    3 5001        4.61        14.12 1.420e+00        2.14
## Revenue        4 5001 48222535.49 240542281.14 1.090e+07 17334966.26
## Industry*      5 5001       12.10         7.33 1.300e+01       12.05
## Employees      6 4989      232.72      1353.13 5.300e+01       81.78
## City*          7 5001      732.00       441.12 7.610e+02      731.74
## State*         8 5001       24.80        15.64 2.300e+01       24.44
##                     mad     min        max      range  skew kurtosis
## Rank            1853.25 1.0e+00 5.0000e+03 4.9990e+03  0.00    -1.20
## Name*           1853.25 1.0e+00 5.0010e+03 5.0000e+03  0.00    -1.20
## Growth_Rate        1.22 3.4e-01 4.2148e+02 4.2114e+02 12.55   242.34
## Revenue     10674720.00 2.0e+06 1.0100e+10 1.0098e+10 22.17   722.66
## Industry*          8.90 1.0e+00 2.5000e+01 2.4000e+01 -0.10    -1.18
## Employees         53.37 1.0e+00 6.6803e+04 6.6802e+04 29.81  1268.67
## City*            604.90 1.0e+00 1.5190e+03 1.5180e+03 -0.04    -1.26
## State*            19.27 1.0e+00 5.2000e+01 5.1000e+01  0.12    -1.46
##                     se
## Rank             20.41
## Name*            20.42
## Growth_Rate       0.20
## Revenue     3401441.44
## Industry*         0.10
## Employees        19.16
## City*             6.24
## State*            0.22

Question 1

A graph that shows the distribution of companies in the dataset by State (i.e., how many are in each state).

# Number of companies per state, drawn as horizontal bars.
# Note that each step is coded explicitly rather than streamlined.
# reorder() in the aesthetic sorts the bars by count, so no separate
# arrange() step is needed. The chart covers every state, so the title
# does not single out NY.
inc %>%
  dplyr::group_by(State) %>%
  dplyr::summarise(CoQty = n()) %>%
  ggplot(aes(x = reorder(State, CoQty), y = CoQty)) +
  geom_col(alpha = .8) +
  coord_flip() +
  labs(x = "", y = "",
       title = "Company Qty by State") +
  theme_minimal()

Question 2

The state with the 3rd most companies in the data set is NY. Examining just this state, the plot below shows the median employment by industry for companies. Boxplots with outliers can get the job done.

# Employee counts per industry for NY companies with complete records, shown
# as flipped boxplots on a log10 axis and ordered by each industry's median.
# A reference line and text label mark my_v_medEmp.
# View(inc %>% dplyr::filter(State == "NY", complete.cases(.)))
inc %>%
  dplyr::filter(complete.cases(.), State == "NY") %>%
  mutate(Industry = as.character(Industry)) %>%
  select(Industry, Employees) %>%
  arrange(Industry) %>%
  ggplot(aes(x = reorder(Industry, Employees, median), y = Employees)) +
  geom_boxplot(
    outlier.shape = 21,
    outlier.alpha = .5,
    outlier.color = "black"
  ) +
  geom_hline(yintercept = my_v_medEmp, color = "darkblue", alpha = 0.5) +
  # NOTE(review): x = 26.2 presumably puts the label beyond the last
  # industry position -- confirm against the number of NY industries.
  annotate(
    "text",
    x = 26.2,
    y = my_v_medEmp + 5,
    label = sprintf("State Median: %s", scales::comma(my_v_medEmp)),
    color = "darkblue",
    alpha = .5,
    size = 2.3,
    vjust = "inward",
    hjust = "inward"
  ) +
  scale_y_log10(
    breaks = c(1, 10, 100, 1000, 10000),
    labels = scales::comma
  ) +
  coord_flip() +
  labs(x = "", title = "NY Employee Qty by Industry") +
  theme_minimal()

Question 3

A chart that shows the industries that generate the most revenue per employee. It is similar to the previous plot, now with an annotated median line that gives the viewer a measure of the statewide median.

# Revenue per employee for NY companies with complete records: one flipped
# boxplot per industry, ordered by mean RevPerEmp, on a log10 dollar axis.
# The reference line is drawn at my_v_median.
p <- inc %>%
  dplyr::filter(complete.cases(.), State == "NY") %>%
  mutate(
    RevPerEmp = Revenue / Employees,
    Industry = as.character(Industry)
  ) %>%
  select(Industry, Revenue, Employees, RevPerEmp) %>%
  arrange(Industry) %>%
  ggplot(aes(x = reorder(Industry, RevPerEmp, mean), y = RevPerEmp)) +
  geom_boxplot(outlier.shape = 21, outlier.alpha = .5) +
  geom_hline(yintercept = my_v_median, color = "darkblue", alpha = 0.5) +
  labs(
    x = "",
    y = "",
    title = "NY Revenue per Employees by Industry"
  ) +
  coord_flip() +
  scale_y_log10(
    breaks = c(1e4, 5e4, 2.5e5, 1.5e6, 1e7),
    labels = scales::dollar
  )

# Render the Question 3 plot with the minimal theme and a text label for the
# my_v_median reference line.
# annotation_logticks() is intentionally left disabled.
p +
  theme_minimal() +
  annotate(
    "text",
    x = 26.2,
    y = my_v_median + 15000,
    label = sprintf("State Median: $%s", scales::comma(my_v_median)),
    color = "darkblue",
    alpha = .5,
    size = 2.3,
    vjust = "inward",
    hjust = "inward"
  )