Principles of Data Visualization and Introduction to ggplot2

I have provided you with data about the 5,000 fastest growing companies in the US, as compiled by Inc. magazine. lets read this in:

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(ggplot2)
library(outliers)

inc <- read.csv("https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv", header= TRUE)

And lets preview this data:

head(inc)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
summary(inc)
##       Rank                          Name       Growth_Rate     
##  Min.   :   1   (Add)ventures         :   1   Min.   :  0.340  
##  1st Qu.:1252   @Properties           :   1   1st Qu.:  0.770  
##  Median :2502   1-Stop Translation USA:   1   Median :  1.420  
##  Mean   :2502   110 Consulting        :   1   Mean   :  4.612  
##  3rd Qu.:3751   11thStreetCoffee.com  :   1   3rd Qu.:  3.290  
##  Max.   :5000   123 Exteriors         :   1   Max.   :421.480  
##                 (Other)               :4995                    
##     Revenue                                  Industry      Employees      
##  Min.   :2.000e+06   IT Services                 : 733   Min.   :    1.0  
##  1st Qu.:5.100e+06   Business Products & Services: 482   1st Qu.:   25.0  
##  Median :1.090e+07   Advertising & Marketing     : 471   Median :   53.0  
##  Mean   :4.822e+07   Health                      : 355   Mean   :  232.7  
##  3rd Qu.:2.860e+07   Software                    : 342   3rd Qu.:  132.0  
##  Max.   :1.010e+10   Financial Services          : 260   Max.   :66803.0  
##                      (Other)                     :2358   NA's   :12       
##             City          State     
##  New York     : 160   CA     : 701  
##  Chicago      :  90   TX     : 387  
##  Austin       :  88   NY     : 311  
##  Houston      :  76   VA     : 283  
##  San Francisco:  75   FL     : 282  
##  Atlanta      :  74   IL     : 273  
##  (Other)      :4438   (Other):2764

Think a bit on what these summaries mean. Use the space below to add some more relevant non-visual exploratory information you think helps you understand this data:

Let’s find some outliers (notably Growth, employees and Revenue; maximum values are way too high) in our data. Not only that, let’s see data types for each column and remove scientific notations in summary.

# Insert your code here, create more chunks as necessary

#Outliers
max_growth <- subset(inc, Growth_Rate == max(inc['Growth_Rate']))
max_growth
##   Rank Name Growth_Rate   Revenue                     Industry Employees
## 1    1 Fuhu      421.48 117900000 Consumer Products & Services       104
##         City State
## 1 El Segundo    CA
max_rev <- subset(inc, Revenue == max(inc['Revenue']))
max_rev
##      Rank Name Growth_Rate  Revenue          Industry Employees
## 4788 4788  CDW        0.41 1.01e+10 Computer Hardware      6800
##              City State
## 4788 Vernon Hills    IL
max_employee <- subset(inc, inc['Employees'] == 66803)
max_employee
##      Rank                         Name Growth_Rate   Revenue
## 2344 2345 Integrity staffing Solutions        1.55 278200000
##             Industry Employees       City State
## 2344 Human Resources     66803 Wilmington    DE
#Let's see how data looks like without outliers.
inc_no_outlier <- subset(inc, !(Rank %in% c(max_growth['Rank'], max_rev['Rank'], max_employee['Rank'])) )
summary(inc_no_outlier)
##       Rank                          Name       Growth_Rate    
##  Min.   :   2   (Add)ventures         :   1   Min.   :  0.34  
##  1st Qu.:1252   @Properties           :   1   1st Qu.:  0.77  
##  Median :2502   1-Stop Translation USA:   1   Median :  1.42  
##  Mean   :2502   110 Consulting        :   1   Mean   :  4.53  
##  3rd Qu.:3751   11thStreetCoffee.com  :   1   3rd Qu.:  3.29  
##  Max.   :5000   123 Exteriors         :   1   Max.   :248.31  
##                 (Other)               :4992                   
##     Revenue                                  Industry      Employees      
##  Min.   :2.000e+06   IT Services                 : 733   Min.   :    1.0  
##  1st Qu.:5.100e+06   Business Products & Services: 482   1st Qu.:   25.0  
##  Median :1.090e+07   Advertising & Marketing     : 471   Median :   53.0  
##  Mean   :4.615e+07   Health                      : 355   Mean   :  218.1  
##  3rd Qu.:2.860e+07   Software                    : 342   3rd Qu.:  132.0  
##  Max.   :4.700e+09   Financial Services          : 260   Max.   :32000.0  
##                      (Other)                     :2355   NA's   :12       
##             City          State     
##  New York     : 160   CA     : 700  
##  Chicago      :  90   TX     : 387  
##  Austin       :  88   NY     : 311  
##  Houston      :  76   VA     : 283  
##  San Francisco:  75   FL     : 282  
##  Atlanta      :  74   IL     : 272  
##  (Other)      :4435   (Other):2763
#Well, it seems like data is still heavily right-skewed as Employees, Revenue and Growth rate have much higher Mean than Median even after removing outliers.

#Data types for each column
str(inc)
## 'data.frame':    5001 obs. of  8 variables:
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name       : Factor w/ 5001 levels "(Add)ventures",..: 1770 1633 4423 690 1198 2839 4733 1468 1869 4968 ...
##  $ Growth_Rate: num  421 248 245 233 213 ...
##  $ Revenue    : num  1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
##  $ Industry   : Factor w/ 25 levels "Advertising & Marketing",..: 5 12 13 7 1 20 10 1 5 21 ...
##  $ Employees  : int  104 51 132 50 220 63 27 75 97 15 ...
##  $ City       : Factor w/ 1519 levels "Acton","Addison",..: 391 365 635 2 139 66 912 1179 131 1418 ...
##  $ State      : Factor w/ 52 levels "AK","AL","AR",..: 5 47 10 45 20 45 44 5 46 41 ...
# Using scipen=999 to disable scientific notations in Revenue.
options(scipen=999)
summary(inc)
##       Rank                          Name       Growth_Rate     
##  Min.   :   1   (Add)ventures         :   1   Min.   :  0.340  
##  1st Qu.:1252   @Properties           :   1   1st Qu.:  0.770  
##  Median :2502   1-Stop Translation USA:   1   Median :  1.420  
##  Mean   :2502   110 Consulting        :   1   Mean   :  4.612  
##  3rd Qu.:3751   11thStreetCoffee.com  :   1   3rd Qu.:  3.290  
##  Max.   :5000   123 Exteriors         :   1   Max.   :421.480  
##                 (Other)               :4995                    
##     Revenue                                    Industry   
##  Min.   :    2000000   IT Services                 : 733  
##  1st Qu.:    5100000   Business Products & Services: 482  
##  Median :   10900000   Advertising & Marketing     : 471  
##  Mean   :   48222535   Health                      : 355  
##  3rd Qu.:   28600000   Software                    : 342  
##  Max.   :10100000000   Financial Services          : 260  
##                        (Other)                     :2358  
##    Employees                  City          State     
##  Min.   :    1.0   New York     : 160   CA     : 701  
##  1st Qu.:   25.0   Chicago      :  90   TX     : 387  
##  Median :   53.0   Austin       :  88   NY     : 311  
##  Mean   :  232.7   Houston      :  76   VA     : 283  
##  3rd Qu.:  132.0   San Francisco:  75   FL     : 282  
##  Max.   :66803.0   Atlanta      :  74   IL     : 273  
##  NA's   :12        (Other)      :4438   (Other):2764

Question 1

Create a graph that shows the distribution of companies in the dataset by State (ie how many are in each state). There are a lot of States, so consider which axis you should use. This visualization is ultimately going to be consumed on a ‘portrait’ oriented screen (ie taller than wide), which should further guide your layout choices.

# Answer Question 1 here

# I am using sqldf to group by State and count number of companies
query<-sqldf("select 
          State, count(distinct Name) as cnt_comp
          from inc 
          group by State")

# Using ggplot to graph
ggplot(query, aes(x=reorder(State, cnt_comp), y=cnt_comp)) +
 geom_bar(stat="identity", fill="green") +
 geom_text(aes(label=cnt_comp)) +
 ylab("# of companies") +
 xlab("State") +
 coord_flip() +
 ggtitle("Number of Companies by State") +
 theme_bw() 

Quesiton 2

Lets dig in on the state with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries. Create a plot that shows the average and/or median employment by industry for companies in this state (only use cases with full data, use R’s complete.cases() function.) In addition to this, your graph should show how variable the ranges are, and you should deal with outliers.

# Answer Question 2 here

#Filter out N/As in full data
full <- inc[complete.cases(inc),]

#Using filtered full, choose 3rd most state
third <- subset(full, State == 'NY')

#The summary looks good, no N/As
summary(third)
##       Rank                        Name      Growth_Rate    
##  Min.   :  26   1st Equity          :  1   Min.   : 0.350  
##  1st Qu.:1186   33Across            :  1   1st Qu.: 0.670  
##  Median :2702   5Linx Enterprises   :  1   Median : 1.310  
##  Mean   :2612   Access Display Group:  1   Mean   : 4.371  
##  3rd Qu.:4005   Adafruit            :  1   3rd Qu.: 3.580  
##  Max.   :4981   AdCorp Media Group  :  1   Max.   :84.430  
##                 (Other)             :305                   
##     Revenue                                   Industry     Employees      
##  Min.   :   2000000   Advertising & Marketing     : 57   Min.   :    1.0  
##  1st Qu.:   4300000   IT Services                 : 43   1st Qu.:   21.0  
##  Median :   8800000   Business Products & Services: 26   Median :   45.0  
##  Mean   :  58715113   Consumer Products & Services: 17   Mean   :  271.3  
##  3rd Qu.:  25700000   Telecommunications          : 17   3rd Qu.:  105.5  
##  Max.   :4600000000   Education                   : 14   Max.   :32000.0  
##                       (Other)                     :137                    
##         City         State    
##  New York :160   NY     :311  
##  Brooklyn : 15   AK     :  0  
##  Rochester:  9   AL     :  0  
##  Buffalo  :  5   AR     :  0  
##  Fairport :  5   AZ     :  0  
##  new york :  5   CA     :  0  
##  (Other)  :112   (Other):  0
# I am using sqldf to group by industry to get mean employment in NY
query<-sqldf("select 
          Industry, avg(Employees) as avg_emp
          from third 
          group by Industry")

# Using ggplot to graph
ggplot(query, aes(x=reorder(Industry, avg_emp), y=round(avg_emp,0))) +
 geom_bar(stat="identity", fill="skyblue") +
 geom_text(aes(label=round(avg_emp,0))) +
 ylab("Mean Number Of Employees") +
 xlab("Industry") +
 ggtitle("Mean Number of Employees by Industry in New York") +
 coord_flip() +
 theme_bw() 

#There are outliers in Employees variables by Industry. We will get rid of these and re-graph our result.
ggplot(third, aes(x=reorder(Industry,Employees) , y=Employees)) +
  geom_boxplot() +
  ggtitle("Box-Plot by Industry and number of Employees") +
  coord_flip()

#outlierKD function will automatically detect outliers and replace dataset (third) with dataset without outliers
outlierKD <- function(dt, var) {
     var_name <- eval(substitute(var),eval(dt))
     na1 <- sum(is.na(var_name))
     m1 <- mean(var_name, na.rm = T)
     par(mfrow=c(2, 2), oma=c(0,0,3,0))
     boxplot(var_name, main="With outliers")
     hist(var_name, main="With outliers", xlab=NA, ylab=NA)
     outlier <- boxplot.stats(var_name)$out
     mo <- mean(outlier)
     var_name <- ifelse(var_name %in% outlier, NA, var_name)
     boxplot(var_name, main="Without outliers")
     hist(var_name, main="Without outliers", xlab=NA, ylab=NA)
     title("Outlier Check", outer=TRUE)
     na2 <- sum(is.na(var_name))
     cat("Outliers identified:", na2 - na1, "n")
     cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "n")
     cat("Mean of the outliers:", round(mo, 2), "n")
     m2 <- mean(var_name, na.rm = T)
     cat("Mean without removing outliers:", round(m1, 2), "n")
     cat("Mean if we remove outliers:", round(m2, 2), "n")
     dt[as.character(substitute(var))] <- invisible(var_name)
     assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
     cat("Outliers successfully removed", "n")
     return(invisible(dt))
}

#Using loop, remove outliers in Employees by each industry
industries <- unique(third$Industry)
third_wo <-NULL

for (i in industries){
  d2 <- subset(third, Industry == i)
  outlierKD(d2, Employees)
  third_wo <- rbind(d2, third_wo)
}

## Outliers identified: 1 nPropotion (%) of outliers: 6.2 nMean of the outliers: 10000 nMean without removing outliers: 626.29 nMean if we remove outliers: 40.44 nOutliers successfully removed n

## Outliers identified: 6 nPropotion (%) of outliers: 11.8 nMean of the outliers: 214.67 nMean without removing outliers: 58.44 nMean if we remove outliers: 40.06 nOutliers successfully removed n

## Outliers identified: 1 nPropotion (%) of outliers: 8.3 nMean of the outliers: 483 nMean without removing outliers: 144.31 nMean if we remove outliers: 116.08 nOutliers successfully removed n

## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 95.35 nMean if we remove outliers: 95.35 nOutliers successfully removed n

## Outliers identified: 3 nPropotion (%) of outliers: 27.3 nMean of the outliers: 70.33 nMean without removing outliers: 24.79 nMean if we remove outliers: 12.36 nOutliers successfully removed n

## Outliers identified: 1 nPropotion (%) of outliers: 16.7 nMean of the outliers: 2280 nMean without removing outliers: 547.71 nMean if we remove outliers: 259 nOutliers successfully removed n

## Outliers identified: 1 nPropotion (%) of outliers: 10 nMean of the outliers: 602 nMean without removing outliers: 108 nMean if we remove outliers: 58.6 nOutliers successfully removed n

## Outliers identified: 4 nPropotion (%) of outliers: 10.3 nMean of the outliers: 1233 nMean without removing outliers: 204.09 nMean if we remove outliers: 98.56 nOutliers successfully removed n

## Outliers identified: 1 nPropotion (%) of outliers: 7.7 nMean of the outliers: 200 nMean without removing outliers: 59.86 nMean if we remove outliers: 49.08 nOutliers successfully removed n

## Outliers identified: 3 nPropotion (%) of outliers: 13 nMean of the outliers: 11780.67 nMean without removing outliers: 1492.46 nMean if we remove outliers: 150.52 nOutliers successfully removed n

## Outliers identified: 2 nPropotion (%) of outliers: 18.2 nMean of the outliers: 253.5 nMean without removing outliers: 73.31 nMean if we remove outliers: 40.55 nOutliers successfully removed n

## Outliers identified: 1 nPropotion (%) of outliers: 10 nMean of the outliers: 2081 nMean without removing outliers: 437.55 nMean if we remove outliers: 273.2 nOutliers successfully removed n

## Outliers identified: 1 nPropotion (%) of outliers: 8.3 nMean of the outliers: 298 nMean without removing outliers: 81.85 nMean if we remove outliers: 63.83 nOutliers successfully removed n

## Outliers identified: 1 nPropotion (%) of outliers: 12.5 nMean of the outliers: 383 nMean without removing outliers: 76.44 nMean if we remove outliers: 38.12 nOutliers successfully removed n

## Outliers identified: 2 nPropotion (%) of outliers: 66.7 nMean of the outliers: 149.5 nMean without removing outliers: 129.2 nMean if we remove outliers: 115.67 nOutliers successfully removed n

## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 18.25 nMean if we remove outliers: 18.25 nOutliers successfully removed n

## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 29.5 nMean if we remove outliers: 29.5 nOutliers successfully removed n

## Outliers identified: 2 nPropotion (%) of outliers: 18.2 nMean of the outliers: 1035.5 nMean without removing outliers: 245.92 nMean if we remove outliers: 102.36 nOutliers successfully removed n

## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 32.5 nMean if we remove outliers: 32.5 nOutliers successfully removed n

## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 17 nMean if we remove outliers: 17 nOutliers successfully removed n

## Outliers identified: 1 nPropotion (%) of outliers: 20 nMean of the outliers: 219 nMean without removing outliers: 61 nMean if we remove outliers: 29.4 nOutliers successfully removed n

## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 135 nMean if we remove outliers: 135 nOutliers successfully removed n

## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 44 nMean if we remove outliers: 44 nOutliers successfully removed n

## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 155 nMean if we remove outliers: 155 nOutliers successfully removed n

## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 53.5 nMean if we remove outliers: 53.5 nOutliers successfully removed n
# Now, dataset (third) is free of outliers but notice there are NAs because these NAs were outliers before
summary(third)
##       Rank                        Name      Growth_Rate    
##  Min.   :  26   1st Equity          :  1   Min.   : 0.350  
##  1st Qu.:1186   33Across            :  1   1st Qu.: 0.670  
##  Median :2702   5Linx Enterprises   :  1   Median : 1.310  
##  Mean   :2612   Access Display Group:  1   Mean   : 4.371  
##  3rd Qu.:4005   Adafruit            :  1   3rd Qu.: 3.580  
##  Max.   :4981   AdCorp Media Group  :  1   Max.   :84.430  
##                 (Other)             :305                   
##     Revenue                                   Industry     Employees      
##  Min.   :   2000000   Advertising & Marketing     : 57   Min.   :    1.0  
##  1st Qu.:   4300000   IT Services                 : 43   1st Qu.:   21.0  
##  Median :   8800000   Business Products & Services: 26   Median :   45.0  
##  Mean   :  58715113   Consumer Products & Services: 17   Mean   :  271.3  
##  3rd Qu.:  25700000   Telecommunications          : 17   3rd Qu.:  105.5  
##  Max.   :4600000000   Education                   : 14   Max.   :32000.0  
##                       (Other)                     :137                    
##         City         State    
##  New York :160   NY     :311  
##  Brooklyn : 15   AK     :  0  
##  Rochester:  9   AL     :  0  
##  Buffalo  :  5   AR     :  0  
##  Fairport :  5   AZ     :  0  
##  new york :  5   CA     :  0  
##  (Other)  :112   (Other):  0
summary(third_wo)
##       Rank                        Name      Growth_Rate    
##  Min.   :  26   1st Equity          :  1   Min.   : 0.350  
##  1st Qu.:1186   33Across            :  1   1st Qu.: 0.670  
##  Median :2702   5Linx Enterprises   :  1   Median : 1.310  
##  Mean   :2612   Access Display Group:  1   Mean   : 4.371  
##  3rd Qu.:4005   Adafruit            :  1   3rd Qu.: 3.580  
##  Max.   :4981   AdCorp Media Group  :  1   Max.   :84.430  
##                 (Other)             :305                   
##     Revenue                                   Industry     Employees      
##  Min.   :   2000000   Advertising & Marketing     : 57   Min.   :   1.00  
##  1st Qu.:   4300000   IT Services                 : 43   1st Qu.:  19.00  
##  Median :   8800000   Business Products & Services: 26   Median :  39.50  
##  Mean   :  58715113   Consumer Products & Services: 17   Mean   :  82.76  
##  3rd Qu.:  25700000   Telecommunications          : 17   3rd Qu.:  81.75  
##  Max.   :4600000000   Education                   : 14   Max.   :1134.00  
##                       (Other)                     :137   NA's   :31       
##         City         State    
##  New York :160   NY     :311  
##  Brooklyn : 15   AK     :  0  
##  Rochester:  9   AL     :  0  
##  Buffalo  :  5   AR     :  0  
##  Fairport :  5   AZ     :  0  
##  new york :  5   CA     :  0  
##  (Other)  :112   (Other):  0
# We want to filter out N/As in third without outliers once more
third_wo <- third_wo[complete.cases(third_wo),]

#Let's see how box-plot now looks like with new dataset. Pretty neat
ggplot(third_wo, aes(x=reorder(Industry,Employees) , y=Employees)) +
  geom_boxplot() +
  ggtitle("Box-Plot by Industry and Number of Employees") +
  coord_flip()

# I am re-using sqldf to group by industry to get mean employment in NY (without outliers)
query<-sqldf("select 
          Industry, avg(Employees) as avg_emp
          from third_wo 
          group by Industry")

# Using ggplot to re-graph (without outliers)
ggplot(query, aes(x=reorder(Industry, avg_emp), y=round(avg_emp,0))) +
 geom_bar(stat="identity", fill="skyblue") +
 geom_text(aes(label=round(avg_emp,0))) +
 ylab("Mean Number Of Employees") +
 xlab("Industry") +
 ggtitle("Mean Number of Employees by Industry in New York") +
 coord_flip() +
 theme_bw() 

# Use query again
query <- sqldf("select 
          Industry, avg(Employees) as avg_emp
          from third 
          group by Industry")
 
# We can also remove outliers in mean level
outlier_avg <- outlier(query$avg_emp)

# remove a row with outlier
query <- subset(query, avg_emp != outlier_avg)

# Graph again
ggplot(query, aes(x=reorder(Industry, avg_emp), y=round(avg_emp,0))) +
 geom_bar(stat="identity", fill="skyblue") +
 geom_text(aes(label=round(avg_emp,0))) +
 ylab("Mean Number Of Employees") +
 xlab("Industry") +
 ggtitle("Mean Number of Employees by Industry in New York") +
 coord_flip() +
 theme_bw() 

Question 3

Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear. Once again, the distribution per industry should be shown.

# Answer Question 3 here

# Use query again
query<-sqldf(
          "SELECT a.Industry, (tot_rev / emp_cnt) as rev_emp
          FROM
          (select 
          Industry, SUM(Revenue) as tot_rev, SUM(Employees) as emp_cnt
          from full 
          group by Industry) as a"
)

#Set K = 4 for K-mean clustering: Very high, high, medium, low by Revenue Per Employee
cl=(kmeans(query$rev_emp,4))

#set cluster as a factor and add it to our dataset
query$cluster=factor(cl$cluster)
centers=as.data.frame(cl$centers)

#Plot cluster points
ggplot(query, aes(x=reorder(Industry, rev_emp), y=round(rev_emp,2), color=cluster)) +
 geom_point() + 
 geom_text(aes(label=round(rev_emp/1000,2)), hjust=1, size=4) +
 ylab("Revenue Per Employee (1000's)") +
 xlab("Industry") +
 ggtitle("K-Mean Clustering : Revenue Per Employee") +
 coord_flip() +
 theme_bw() +
 scale_y_continuous(labels = function(x){paste0(x/1000, 'K')})