library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
library(ggplot2)
library(psych)

Principles of Data Visualization and Introduction to ggplot2

I have provided you with data about the 5,000 fastest growing companies in the US, as compiled by Inc. magazine. lets read this in:

inc <- read.csv("https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv", header= TRUE)

And lets preview this data:

head(inc)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
summary(inc)
##       Rank          Name            Growth_Rate         Revenue         
##  Min.   :   1   Length:5001        Min.   :  0.340   Min.   :2.000e+06  
##  1st Qu.:1252   Class :character   1st Qu.:  0.770   1st Qu.:5.100e+06  
##  Median :2502   Mode  :character   Median :  1.420   Median :1.090e+07  
##  Mean   :2502                      Mean   :  4.612   Mean   :4.822e+07  
##  3rd Qu.:3751                      3rd Qu.:  3.290   3rd Qu.:2.860e+07  
##  Max.   :5000                      Max.   :421.480   Max.   :1.010e+10  
##                                                                         
##    Industry           Employees           City              State          
##  Length:5001        Min.   :    1.0   Length:5001        Length:5001       
##  Class :character   1st Qu.:   25.0   Class :character   Class :character  
##  Mode  :character   Median :   53.0   Mode  :character   Mode  :character  
##                     Mean   :  232.7                                        
##                     3rd Qu.:  132.0                                        
##                     Max.   :66803.0                                        
##                     NA's   :12

Think a bit on what these summaries mean. Use the space below to add some more relevant non-visual exploratory information you think helps you understand this data:

# glimpse is helpful if there are lots of features. However, in this case it does not provide much utility
glimpse(inc)
## Rows: 5,001
## Columns: 8
## $ Rank        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Name        <chr> "Fuhu", "FederalConference.com", "The HCI Group", "Bridger…
## $ Growth_Rate <dbl> 421.48, 248.31, 245.45, 233.08, 213.37, 179.38, 174.04, 17…
## $ Revenue     <dbl> 1.179e+08, 4.960e+07, 2.550e+07, 1.900e+09, 8.700e+07, 4.5…
## $ Industry    <chr> "Consumer Products & Services", "Government Services", "He…
## $ Employees   <int> 104, 51, 132, 50, 220, 63, 27, 75, 97, 15, 149, 165, 250, …
## $ City        <chr> "El Segundo", "Dumfries", "Jacksonville", "Addison", "Bost…
## $ State       <chr> "CA", "VA", "FL", "TX", "MA", "TX", "TN", "CA", "UT", "RI"…
# describe() is a powerful summary tool, giving the user everything in summary() and more
describe(inc)
##             vars    n        mean           sd    median     trimmed
## Rank           1 5001     2501.64      1443.51 2.502e+03     2501.73
## Name*          2 5001     2501.00      1443.81 2.501e+03     2501.00
## Growth_Rate    3 5001        4.61        14.12 1.420e+00        2.14
## Revenue        4 5001 48222535.49 240542281.14 1.090e+07 17334966.26
## Industry*      5 5001       12.10         7.33 1.300e+01       12.05
## Employees      6 4989      232.72      1353.13 5.300e+01       81.78
## City*          7 5001      732.00       441.12 7.610e+02      731.74
## State*         8 5001       24.80        15.64 2.300e+01       24.44
##                     mad     min        max      range  skew kurtosis         se
## Rank            1853.25 1.0e+00 5.0000e+03 4.9990e+03  0.00    -1.20      20.41
## Name*           1853.25 1.0e+00 5.0010e+03 5.0000e+03  0.00    -1.20      20.42
## Growth_Rate        1.22 3.4e-01 4.2148e+02 4.2114e+02 12.55   242.34       0.20
## Revenue     10674720.00 2.0e+06 1.0100e+10 1.0098e+10 22.17   722.66 3401441.44
## Industry*          8.90 1.0e+00 2.5000e+01 2.4000e+01 -0.10    -1.18       0.10
## Employees         53.37 1.0e+00 6.6803e+04 6.6802e+04 29.81  1268.67      19.16
## City*            604.90 1.0e+00 1.5190e+03 1.5180e+03 -0.04    -1.26       6.24
## State*            19.27 1.0e+00 5.2000e+01 5.1000e+01  0.12    -1.46       0.22
# the last thing I typically check is for null values
colSums(is.na(inc))
##        Rank        Name Growth_Rate     Revenue    Industry   Employees 
##           0           0           0           0           0          12 
##        City       State 
##           0           0
# Lets check out the rows with missing entries
inc %>%
  filter(is.na(Employees))
##    Rank                             Name Growth_Rate   Revenue
## 1   183           First Flight Solutions       22.32   2700000
## 2  1064                         Popchips        3.98  93300000
## 3  1124                       Vocalocity        3.72  42900000
## 4  1653                     Higher Logic        2.36   6000000
## 5  1686      Global Communications Group        2.30   3600000
## 6  2197              JeffreyM Consulting        1.68  12100000
## 7  2743               Excalibur Exhibits        1.27   9900000
## 8  3001       Heartland Business Systems        1.12 156300000
## 9  3978                             SSEC        0.68  80400000
## 10 4112 Carolinas Home Medical Equipment        0.64   3300000
## 11 4566                         Oakbrook        0.48   8900000
## 12 4968                   Popcorn Palace        0.35   5500000
##                        Industry Employees          City State
## 1    Logistics & Transportation        NA  Emerald Isle    NC
## 2               Food & Beverage        NA San Francisco    CA
## 3            Telecommunications        NA       Atlanta    GA
## 4                      Software        NA    Washington    DC
## 5            Telecommunications        NA     Englewood    CO
## 6  Business Products & Services        NA      Bellevue    WA
## 7  Business Products & Services        NA       houston    TX
## 8                   IT Services        NA  Little Chute    WI
## 9                 Manufacturing        NA       Horsham    PA
## 10                       Health        NA      Matthews    NC
## 11                  Real Estate        NA       Madison    WI
## 12              Food & Beverage        NA Schiller Park    IL

One can approach the case of missing values in a few ways. Imputation is one option, and removing the rows is another. In this case, because the revenues of the companies in question vary widely (they are not all small companies) it could be appropriate to impute with the mean.

inc$Employees[is.na(inc$Employees)] <- mean(inc$Employees,na.rm = TRUE)

Question 1

Create a graph that shows the distribution of companies in the dataset by State (ie how many are in each state). There are a lot of States, so consider which axis you should use. This visualization is ultimately going to be consumed on a ‘portrait’ oriented screen (ie taller than wide), which should further guide your layout choices.

aggregated <- inc %>%
  group_by(State) %>%
  count(State) %>%
  arrange(desc(n))

aggregated %>%
  ggplot(aes(y=reorder(State, n), x=n)) +
  geom_bar(stat="identity") +
  xlab("Count") + 
  ylab("State")

Quesiton 2

Lets dig in on the state with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries. Create a plot that shows the average and/or median employment by industry for companies in this state (only use cases with full data, use R’s complete.cases() function.) In addition to this, your graph should show how variable the ranges are, and you should deal with outliers.

# New York is the third largest by # companies
ny_state <- inc %>% filter(State=="NY")
summary(ny_state)
##       Rank          Name            Growth_Rate        Revenue         
##  Min.   :  26   Length:311         Min.   : 0.350   Min.   :2.000e+06  
##  1st Qu.:1186   Class :character   1st Qu.: 0.670   1st Qu.:4.300e+06  
##  Median :2702   Mode  :character   Median : 1.310   Median :8.800e+06  
##  Mean   :2612                      Mean   : 4.371   Mean   :5.872e+07  
##  3rd Qu.:4005                      3rd Qu.: 3.580   3rd Qu.:2.570e+07  
##  Max.   :4981                      Max.   :84.430   Max.   :4.600e+09  
##    Industry           Employees           City              State          
##  Length:311         Min.   :    1.0   Length:311         Length:311        
##  Class :character   1st Qu.:   21.0   Class :character   Class :character  
##  Mode  :character   Median :   45.0   Mode  :character   Mode  :character  
##                     Mean   :  271.3                                        
##                     3rd Qu.:  105.5                                        
##                     Max.   :32000.0
# plotting NY state
ny_state %>% 
  filter(complete.cases(.)) %>% # complete cases only
  group_by(Industry) %>% 
  select(Industry, Employees) %>%
  ggplot(mapping=aes(x=Industry, y=Employees)) +
  geom_boxplot()

This will not do. There are many high outliers which are skewing the image. Additionally, the text at the bottom is illegible.

# plotting NY state
ny_state %>% 
  filter(complete.cases(.)) %>% # complete cases only
  group_by(Industry) %>% 
  select(Industry, Employees) %>%
  ggplot(mapping=aes(x=Industry, y=Employees)) +
  geom_boxplot(outlier.shape=NA) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_cartesian(ylim = c(0, 1500))

Question 3

Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear. Once again, the distribution per industry should be shown.

# first create the new variable
inc$rev_per_employee = inc$Revenue / inc$Employees
head(inc, 20)
##    Rank                         Name Growth_Rate   Revenue
## 1     1                         Fuhu      421.48 1.179e+08
## 2     2        FederalConference.com      248.31 4.960e+07
## 3     3                The HCI Group      245.45 2.550e+07
## 4     4                      Bridger      233.08 1.900e+09
## 5     5                       DataXu      213.37 8.700e+07
## 6     6 MileStone Community Builders      179.38 4.570e+07
## 7     7        Value Payment Systems      174.04 2.550e+07
## 8     8         Emerge Digital Group      170.64 2.390e+07
## 9     9                    Goal Zero      169.81 3.310e+07
## 10   10                     Yagoozon      166.89 1.860e+07
## 11   11                       OBXtek      164.33 2.960e+07
## 12   12                       AdRoll      150.65 3.410e+07
## 13   13                   uBreakiFix      141.02 1.700e+07
## 14   14                        Sparc      128.63 2.110e+07
## 15   15                 LivingSocial      123.33 5.360e+08
## 16   16               Amped Wireless      110.68 1.430e+07
## 17   17            Intelligent Audit      105.73 1.450e+08
## 18   18            Integrity Funding      104.62 1.110e+07
## 19   19         Vertex Body Sciences      100.10 1.180e+07
## 20   20                      BlueKai       92.45 2.680e+07
##                        Industry Employees          City State rev_per_employee
## 1  Consumer Products & Services       104    El Segundo    CA        1133653.8
## 2           Government Services        51      Dumfries    VA         972549.0
## 3                        Health       132  Jacksonville    FL         193181.8
## 4                        Energy        50       Addison    TX       38000000.0
## 5       Advertising & Marketing       220        Boston    MA         395454.5
## 6                   Real Estate        63        Austin    TX         725396.8
## 7            Financial Services        27     Nashville    TN         944444.4
## 8       Advertising & Marketing        75 San Francisco    CA         318666.7
## 9  Consumer Products & Services        97     Bluffdale    UT         341237.1
## 10                       Retail        15       Warwick    RI        1240000.0
## 11          Government Services       149 Tysons Corner    VA         198657.7
## 12      Advertising & Marketing       165 San Francisco    CA         206666.7
## 13                       Retail       250       Orlando    FL          68000.0
## 14                     Software       160    Charleston    SC         131875.0
## 15 Consumer Products & Services      4100    Washington    DC         130731.7
## 16            Computer Hardware        26         Chino    CA         550000.0
## 17   Logistics & Transportation        15 Rochelle Park    NJ        9666666.7
## 18           Financial Services        11      Sarasota    FL        1009090.9
## 19              Food & Beverage        51      columbus    OH         231372.5
## 20      Advertising & Marketing       107     Cupertino    CA         250467.3
# plot with facet wrap
ggplot(inc, aes(rev_per_employee)) +
  geom_density() +
  facet_wrap(~Industry)

ny_state %>%
  group_by(Industry) %>%
  summarize(total_rev = sum(Revenue), 
            total_emp = sum(Employees), 
            rev_per_emp = total_rev/total_emp) %>%
  arrange(desc(rev_per_emp)) %>%
  na.omit() %>%
  ggplot(aes(x=reorder(Industry, rev_per_emp), y=rev_per_emp)) +
  geom_bar(stat="identity") +
  coord_flip()