##           installed_and_loaded.packages.
## prettydoc                           TRUE
## psych                               TRUE
## knitr                               TRUE
## tidyverse                           TRUE
## ggthemes                            TRUE
## scales                              TRUE

Principles of Data Visualization and Introduction to ggplot2

I have provided you with data about the 5,000 fastest growing companies in the US, as compiled by Inc. magazine. Let's read this in:

# Load the Inc. 5000 data set directly from GitHub.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no longer
# converts strings to factors by default, and the summaries and plots below
# expect factor columns (e.g. levels(inc$Industry)).
inc <- read.csv(
  "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

And let's preview this data:

# Preview the first six rows of the data set.
head(x = inc, n = 6L)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
# Per-column summary: quantiles for numeric columns, level counts for factors.
summary(object = inc)
##       Rank                          Name       Growth_Rate     
##  Min.   :   1   (Add)ventures         :   1   Min.   :  0.340  
##  1st Qu.:1252   @Properties           :   1   1st Qu.:  0.770  
##  Median :2502   1-Stop Translation USA:   1   Median :  1.420  
##  Mean   :2502   110 Consulting        :   1   Mean   :  4.612  
##  3rd Qu.:3751   11thStreetCoffee.com  :   1   3rd Qu.:  3.290  
##  Max.   :5000   123 Exteriors         :   1   Max.   :421.480  
##                 (Other)               :4995                    
##     Revenue                                  Industry      Employees      
##  Min.   :2.000e+06   IT Services                 : 733   Min.   :    1.0  
##  1st Qu.:5.100e+06   Business Products & Services: 482   1st Qu.:   25.0  
##  Median :1.090e+07   Advertising & Marketing     : 471   Median :   53.0  
##  Mean   :4.822e+07   Health                      : 355   Mean   :  232.7  
##  3rd Qu.:2.860e+07   Software                    : 342   3rd Qu.:  132.0  
##  Max.   :1.010e+10   Financial Services          : 260   Max.   :66803.0  
##                      (Other)                     :2358   NA's   :12       
##             City          State     
##  New York     : 160   CA     : 701  
##  Chicago      :  90   TX     : 387  
##  Austin       :  88   NY     : 311  
##  Houston      :  76   VA     : 283  
##  San Francisco:  75   FL     : 282  
##  Atlanta      :  74   IL     : 273  
##  (Other)      :4438   (Other):2764

Non-visual exploration

Think a bit on what these summaries mean. Use the space below to add some more relevant non-visual exploratory information you think helps you understand this data:

Let’s create a table that shows the class types, NA counts & unique values of each variable

metadata <- function(df) {
  # Build a per-column overview of a data frame.
  #
  # Args:
  #   df: a data frame.
  #
  # Returns:
  #   A data frame with one row per column of `df` (row names are the column
  #   names of `df`) holding: the column's class, its length, the number of
  #   complete rows in `df`, the NA count and NA percentage, the number of
  #   distinct values, and a "; "-separated sample of the most frequent values.
  n_obs <- nrow(df)
  na_ct <- unname(vapply(df, function(x) sum(is.na(x)), integer(1)))

  # Up to five most frequent values of one column. summary() on a factor lumps
  # rare levels into an "(Other)" entry when there are many levels; that
  # pseudo-level is stripped from the sample when it is followed by another
  # value.
  top_values <- function(x) {
    counts <- sort(summary(as.factor(x)), decreasing = TRUE)
    # head() instead of [1:5] so columns with fewer than five distinct values
    # are not padded with literal "NA" strings.
    top <- paste(head(names(counts), 5), collapse = "; ")
    sub("\\(Other\\); ", "", top)
  }

  data.frame(
    # class(x)[1] keeps exactly one entry per column even for multi-class
    # columns (e.g. date-times).
    class_type = vapply(df, function(x) class(x)[1], character(1)),
    n_rows = vapply(df, length, integer(1)),
    complete_cases = sum(complete.cases(df)),
    NA_ct = na_ct,
    NA_pct = na_ct / n_obs * 100,
    unique_value_ct = vapply(df, function(x) length(unique(x)), integer(1)),
    # Computed from `df`, not from a global data set.
    most_common_values_sample = vapply(df, top_values, character(1))
  )
}

# Render the per-column overview as a table, rounded to two decimals.
kable(x = metadata(inc), digits = 2)
class_type n_rows complete_cases NA_ct NA_pct unique_value_ct most_common_values_sample
Rank integer 5001 4989 0 0.00 4999 3424; 5000; 1; 2
Name factor 5001 4989 0 0.00 5001 (Add)ventures; @Properties; 1-Stop Translation USA; 110 Consulting
Growth_Rate numeric 5001 4989 0 0.00 1147 0.4; 0.48; 0.53; 0.65
Revenue numeric 5001 4989 0 0.00 1069 2100000; 2200000; 2400000; 4600000
Industry factor 5001 4989 0 0.00 25 IT Services; Business Products & Services; Advertising & Marketing; Health; Software
Employees integer 5001 4989 12 0.24 692 15; 25; 30; 12
City factor 5001 4989 0 0.00 1519 New York; Chicago; Austin; Houston
State factor 5001 4989 0 0.00 52 CA; TX; NY; VA; FL

Let’s use the psych package’s describe function to create a table that shows a more comprehensive set of summary statistics for each numerical variable.

metrics <- function(df) {
  # Summary statistics (via psych::describe) for the numeric columns of `df`.
  #
  # Args:
  #   df: a data frame.
  #
  # Returns:
  #   A matrix with one column per numeric column of `df` and one row per
  #   statistic (rows 2 to 15 of the transposed describe() output, i.e.
  #   everything except the leading `vars` index).
  #
  # is.numeric() yields exactly one flag per column, so the selection cannot
  # be shifted by columns that carry more than one class.
  is_num <- vapply(df, is.numeric, logical(1))
  # drop = FALSE keeps a data frame (and later a matrix) even when only a
  # single numeric column is present.
  metrics_only <- df[, is_num, drop = FALSE]
  df_metrics <- t(describe(metrics_only, quant = c(.25, .75)))
  df_metrics[2:15, , drop = FALSE]
}

# Render the summary statistics as a table: two decimals, thousands
# separators, no scientific notation, trailing zeros dropped.
kable(
  metrics(inc),
  digits = 2,
  format.args = list(big.mark = ",", scientific = FALSE, drop0trailing = TRUE)
)
Rank Growth_Rate Revenue Employees
n 5,001 5,001 5,001 4,989
mean 2,501.64 4.61 48,222,535.49 232.72
sd 1,443.51 14.12 240,542,281.14 1,353.13
median 2,502 1.42 10,900,000 53
trimmed 2,501.73 2.14 17,334,966.26 81.78
mad 1,853.25 1.22 10,674,720 53.37
min 1 0.34 2,000,000 1
max 5,000 421.48 10,100,000,000 66,803
range 4,999 421.14 10,098,000,000 66,802
skew 0 12.55 22.17 29.81
kurtosis -1.2 242.34 722.66 1,268.67
se 20.41 0.2 3,401,441.44 19.16
Q0.25 1,252 0.77 5,100,000 25
Q0.75 3,751 3.29 28,600,000 132

Question 1

Create a graph that shows the distribution of companies in the dataset by State (i.e., how many are in each state).

# Horizontal bar chart of the number of companies per state; states are
# ordered by their company count.
ggplot(inc, aes(x = reorder(State, State, FUN = length))) +
  geom_bar() +
  coord_flip() +
  labs(title = "Number of Companies by State", x = "State") +
  theme(
    plot.margin = unit(c(0, 3, 0, 3), "cm"),
    axis.text.y = element_text(face = "bold", color = "black", size = 6)
  )

Question 2

Let's dig in on the state with the 3rd most companies in the data set.

# Name of the state with the third-highest number of companies.
state_third_most <- names(sort(table(inc$State), decreasing = TRUE))[3]
# Keep only that state's companies, then drop rows with any missing value.
df_third_most <- subset(inc, State == state_third_most)
df_third_most <- df_third_most[complete.cases(df_third_most), ]

Plot 2.1

  • One way to visualize these distributions is to restrict the plot limits so that the most extreme outliers are not displayed.

  • However, some of the boxes are still too tiny to see.

# Calculate the axis parameters used to hide the most extreme outliers.
# boxplot(...)$stats holds the whisker, hinge and median values per industry,
# so its maximum is the end of the longest upper whisker. na.rm = TRUE guards
# against industries that have no companies in the selected state.
bpstats <- boxplot(Employees ~ Industry, data = df_third_most, plot = FALSE)$stats
ylimits <- c(0, ceiling(max(bpstats, na.rm = TRUE) / 100) * 100)
ybreaks <- seq(ylimits[1], ylimits[2], by = 200)
outliers_not_shown <- paste(
  sum(df_third_most$Employees > max(ylimits)),
  "outlier(s) not displayed"
)

# Reverse the industry order so it reads alphabetically from top to bottom once
# the axes are flipped. factor() makes this work whether Industry was read in
# as a factor or as a character column.
reordered_x_lab <- scale_x_discrete(limits = rev(levels(factor(inc$Industry))))

plt_base <- ggplot(data = df_third_most, mapping = aes(x = Industry, y = Employees))

plt_base +
  geom_boxplot() +
  # The state in the title comes from state_third_most rather than being
  # hard-coded, so title and data cannot drift apart.
  labs(title = paste("Company Sizes in", state_third_most, "by Industry"),
       caption = paste("Red dot = mean", outliers_not_shown, sep = "\n")) +
  reordered_x_lab +
  scale_y_continuous(breaks = ybreaks) +
  # `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "point", size = 2, color = "red") +
  # Zoom via the coordinate system so the hidden outliers still enter the
  # boxplot statistics.
  coord_flip(ylim = ylimits) +
  theme_fivethirtyeight()

Stackoverflow: Ignore outliers in ggplot2 boxplot

Stackoverflow: Boxplot show the value of mean

Plot 2.2

  • Another way to visualize these distributions is to put the axis on a log10 scale, which compresses the extreme outliers.

  • Now the extreme outliers can be seen.

# Same boxplots on a log10 employee axis so that all outliers fit in the plot.
plt_base +
  geom_boxplot() +
  reordered_x_lab +
  scale_y_log10(breaks = 10^(1:4)) +
  # The state in the title comes from state_third_most rather than being
  # hard-coded, so title and data cannot drift apart.
  labs(title = paste(state_third_most, "Company Sizes (log10) by Industry")) +
  coord_flip() +
  theme_fivethirtyeight()

Question 3

Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear. Once again, the distribution per industry should be shown.

Plot 3.1

  • I’m assuming that we are doing the revenue/employee division at the company-level granularity. Otherwise I believe that showing distribution per industry doesn’t make any sense.

  • I removed 12 incomplete cases.

  • First, let’s restrict the plot limits so that the most extreme outliers are not displayed.

# Create the data: revenue per employee for every company, together with its
# industry. Rows with a missing value are dropped.
inc_q3 <- drop_na(data.frame(
  Industry = inc$Industry,
  revenue_per_employee = with(inc, Revenue / Employees)
))

# Calculate the axis parameters used to hide the most extreme outliers: the
# upper limit is the longest upper whisker, rounded up to the next 100,000.
bpstats <- boxplot(revenue_per_employee ~ Industry, data = inc_q3, plot = FALSE)$stats
ylimits <- c(0, ceiling(max(bpstats, na.rm = TRUE) / 100000) * 100000)
ybreaks <- seq(0, ylimits[2], by = 500000)
outliers_not_shown <- paste(
  sum(inc_q3$revenue_per_employee > max(ylimits), na.rm = TRUE),
  "outlier(s) not displayed"
)

# Plot
plt_base2 <- ggplot(data = inc_q3, mapping = aes(x = Industry, y = revenue_per_employee))

plt_base2 +
  geom_boxplot(outlier.size = 1) +
  labs(title = "Distribution of Revenue $ per Employee",
       caption = paste("Red dot = mean", outliers_not_shown, sep = "\n")) +
  reordered_x_lab +
  # Pass the labelling function itself so labels always match the breaks.
  scale_y_continuous(breaks = ybreaks, labels = comma) +
  # `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "point", size = 2, color = "red") +
  # Zoom via the coordinate system so the hidden outliers still enter the
  # boxplot statistics.
  coord_flip(ylim = ylimits) +
  theme_fivethirtyeight()

Plot 3.2

  • Now, let’s use the log10 scale so we can see the most extreme outliers.
# Axis ticks at every power of ten from 10^4 to 10^7.
breaks_log10 <- 10^seq(4, 7)

# Revenue per employee by industry on a log10 axis, so that even the most
# extreme outliers stay inside the plot.
plt_base2 +
  geom_boxplot(outlier.size = 1) +
  labs(title = "Distr. of Revenue $ per Employee (log10)") +
  scale_y_log10(labels = comma(breaks_log10), breaks = breaks_log10) +
  reordered_x_lab +
  theme_fivethirtyeight() +
  coord_flip()