Principles of Data Visualization and Introduction to ggplot2

#Libraries Required
library(dplyr) 
library(ggplot2)
library(tidyr)
library(pastecs)
library(knitr)

I have provided you with data about the 5,000 fastest growing companies in the US, as compiled by Inc. magazine. Let's read this in:

# Load the Inc. 5000 dataset directly from the course repository on GitHub.
inc <- read.csv(
  file = "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv",
  header = TRUE
)

And let's preview this data:

# Preview the first six rows of the dataset.
head(inc, n = 6L)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
# Column-by-column summary of the dataset.
summary(object = inc)
##       Rank          Name            Growth_Rate         Revenue         
##  Min.   :   1   Length:5001        Min.   :  0.340   Min.   :2.000e+06  
##  1st Qu.:1252   Class :character   1st Qu.:  0.770   1st Qu.:5.100e+06  
##  Median :2502   Mode  :character   Median :  1.420   Median :1.090e+07  
##  Mean   :2502                      Mean   :  4.612   Mean   :4.822e+07  
##  3rd Qu.:3751                      3rd Qu.:  3.290   3rd Qu.:2.860e+07  
##  Max.   :5000                      Max.   :421.480   Max.   :1.010e+10  
##                                                                         
##    Industry           Employees           City              State          
##  Length:5001        Min.   :    1.0   Length:5001        Length:5001       
##  Class :character   1st Qu.:   25.0   Class :character   Class :character  
##  Mode  :character   Median :   53.0   Mode  :character   Mode  :character  
##                     Mean   :  232.7                                        
##                     3rd Qu.:  132.0                                        
##                     Max.   :66803.0                                        
##                     NA's   :12

Think a bit on what these summaries mean. Use the space below to add some more relevant non-visual exploratory information you think helps you understand this data:

  1. There are 12 NA values in the Employees feature/column.
# Descriptive statistics for Revenue (pastecs::stat.desc), shown rounded to
# two decimal places.
Rev <- stat.desc(inc[["Revenue"]])
round(Rev, digits = 2)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 5.001000e+03 0.000000e+00 0.000000e+00 2.000000e+06 1.010000e+10 1.009800e+10 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 2.411609e+11 1.090000e+07 4.822254e+07 3.401441e+06 6.668317e+06 5.786059e+16 
##      std.dev     coef.var 
## 2.405423e+08 4.990000e+00
# Descriptive statistics for Growth_Rate (pastecs::stat.desc).
stat.desc(inc[["Growth_Rate"]])
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 5.001000e+03 0.000000e+00 0.000000e+00 3.400000e-01 4.214800e+02 4.211400e+02 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 2.306374e+04 1.420000e+00 4.611826e+00 1.997192e-01 3.915372e-01 1.994787e+02 
##      std.dev     coef.var 
## 1.412369e+01 3.062495e+00
# Descriptive statistics for Employees (pastecs::stat.desc); nbr.na reports
# how many values are missing in this column.
stat.desc(inc[["Employees"]])
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 4.989000e+03 0.000000e+00 1.200000e+01 1.000000e+00 6.680300e+04 6.680200e+04 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 1.161030e+06 5.300000e+01 2.327180e+02 1.915720e+01 3.755654e+01 1.830955e+06 
##      std.dev     coef.var 
## 1.353128e+03 5.814454e+00

Question 1

Create a graph that shows the distribution of companies in the dataset by State (ie how many are in each state). There are a lot of States, so consider which axis you should use. This visualization is ultimately going to be consumed on a ‘portrait’ oriented screen (ie taller than wide), which should further guide your layout choices.

# Count the companies in each state: aggregate() applies length() to the
# Industry values grouped by State, i.e. it counts rows per state.
# The result has two columns: Group.1 (the state) and x (the count).
Ind_State <- aggregate(
  x = inc[["Industry"]],
  by = list(inc[["State"]]),
  FUN = length
)
Ind_State
##    Group.1   x
## 1       AK   2
## 2       AL  51
## 3       AR   9
## 4       AZ 100
## 5       CA 701
## 6       CO 134
## 7       CT  50
## 8       DC  43
## 9       DE  16
## 10      FL 282
## 11      GA 212
## 12      HI   7
## 13      IA  28
## 14      ID  17
## 15      IL 273
## 16      IN  69
## 17      KS  38
## 18      KY  40
## 19      LA  37
## 20      MA 182
## 21      MD 131
## 22      ME  13
## 23      MI 126
## 24      MN  88
## 25      MO  59
## 26      MS  12
## 27      MT   4
## 28      NC 137
## 29      ND  10
## 30      NE  27
## 31      NH  24
## 32      NJ 158
## 33      NM   5
## 34      NV  26
## 35      NY 311
## 36      OH 186
## 37      OK  46
## 38      OR  49
## 39      PA 164
## 40      PR   1
## 41      RI  16
## 42      SC  48
## 43      SD   3
## 44      TN  82
## 45      TX 387
## 46      UT  95
## 47      VA 283
## 48      VT   6
## 49      WA 130
## 50      WI  79
## 51      WV   2
## 52      WY   2
# Horizontal bar chart of the number of companies per state.
# Ind_State$x holds one row count per state, i.e. the number of companies,
# so the value axis is labelled "Number of Companies".
# coord_flip() puts the states on the vertical axis, which suits a
# portrait-oriented screen.
ggplot(Ind_State, aes(x = Group.1, y = x)) +
  geom_col() +
  coord_flip() +
  labs(x = "State", y = "Number of Companies") +
  theme_classic()

# Same chart with the states ordered by company count, so the ranking can be
# read directly off the vertical axis.
# Each state has exactly one bar, so no dodging is needed; the fill legend is
# given an explicit title instead of the raw column name "x".
ggplot(Ind_State, aes(x = reorder(Group.1, x), y = x, fill = x)) +
  geom_col(width = 0.2) +
  coord_flip() +
  labs(
    title = "Distribution of Companies",
    x = "State",
    y = "Number of Companies",
    fill = "Companies"
  ) +
  theme_minimal()

California, Texas, and New York have the highest number of companies.

Question 2

Let's dig in on the state with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries. Create a plot that shows the average and/or median employment by industry for companies in this state (only use cases with full data, use R’s complete.cases() function). In addition to this, your graph should show how variable the ranges are, and you should deal with outliers.

# NY is the state with the third-highest number of companies (as per the graph above).

# Since the Employees column has some NA values, complete.cases() is used to keep only the rows that have no missing values.

# Companies based in New York, restricted to rows with no missing values.
# complete.cases() already removes every row that contains an NA, so no
# further drop_na() pass is needed; the data is left ungrouped because
# nothing is summarised here.
Ind_NY <- inc[complete.cases(inc), ] %>%
  filter(State == "NY")

# Box plot of employees per company for each industry; coord_flip() puts the
# industry names on the vertical axis so they stay readable.
ggplot(Ind_NY, aes(x = Industry, y = Employees)) +
  geom_boxplot() +
  coord_flip() +
  theme_classic() +
  labs(x = "Industry", y = "Employees")

Detect and remove outliers

The interquartile range (IQR) is the spread of the central 50% of a distribution, i.e. the distance between its 75th and 25th percentiles. A point is treated as an outlier if it lies more than 1.5 times the IQR above the 75th percentile or below the 25th percentile.

# Detect outliers with the 1.5 * IQR rule.
# NOTE: the quartiles and fences are computed over all NY companies pooled
# together, not separately for each industry.
Q <- quantile(Ind_NY$Employees, probs = c(0.25, 0.75), na.rm = FALSE)
iqr <- IQR(Ind_NY$Employees)
up <- Q[2] + 1.5 * iqr   # Upper fence
low <- Q[1] - 1.5 * iqr  # Lower fence
# Remove outliers: keep only the rows strictly inside the two fences.
eliminated <- subset(Ind_NY, Employees > low & Employees < up)
# Box plot of employees by industry once the outliers have been removed.
ggplot(data = eliminated, mapping = aes(x = Industry, y = Employees)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "Industry", y = "Employees") +
  theme_classic()

# Mean number of employees per industry for the outlier-free NY data,
# rounded to one decimal place.
AVG_EMP <- summarise(
  group_by(eliminated, Industry),
  Employees_AVG = round(mean(Employees), digits = 1)
)
 
# Horizontal bar chart of the average employment per industry in NY.
ggplot(data = AVG_EMP, mapping = aes(x = Industry, y = Employees_AVG)) +
  geom_col() +
  coord_flip() +
  labs(x = "Industry", y = "Average Employees") +
  theme_classic()

Question 3

Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear. Once again, the distribution per industry should be shown.

# Revenue per employee by industry for companies based in NY.
inc %>%
  # Keep only NY companies, then discard rows with any missing value.
  filter(State == "NY") %>%
  drop_na() %>%
  # Total employees and total revenue for each industry.
  group_by(Industry) %>%
  summarise(
    Employees = sum(Employees),
    Revenue = sum(Revenue)
  ) %>%
  # Industry-level revenue divided by industry-level headcount.
  mutate(revenue_employee = Revenue / Employees) %>%
  # Bar chart with industries ordered by revenue per employee; after
  # coord_flip() the largest value appears at the top.
  ggplot(aes(x = reorder(Industry, revenue_employee), y = revenue_employee)) +
  geom_col(fill = "#FF6666") +
  coord_flip() +
  labs(
    title = "Most Revenue per Employee in NY",
    x = "Industry",
    y = "Revenue per Employee"
  ) +
  theme_minimal()