Load library

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2

Import and group data

#Read CSV into R
url <- read.csv('https://raw.githubusercontent.com/indianspice/IS608/master/Assign1/inc5000_data.csv', 
                stringsAsFactors = TRUE)

#Examine the data
head(url)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
str(url)
## 'data.frame':    5001 obs. of  8 variables:
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name       : Factor w/ 5001 levels "(Add)ventures",..: 1770 1633 4423 690 1198 2839 4733 1468 1869 4968 ...
##  $ Growth_Rate: num  421 248 245 233 213 ...
##  $ Revenue    : num  1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
##  $ Industry   : Factor w/ 25 levels "Advertising & Marketing",..: 5 12 13 7 1 20 10 1 5 21 ...
##  $ Employees  : int  104 51 132 50 220 63 27 75 97 15 ...
##  $ City       : Factor w/ 1519 levels "Acton","Addison",..: 391 365 635 2 139 66 912 1179 131 1418 ...
##  $ State      : Factor w/ 52 levels "AK","AL","AR",..: 5 47 10 45 20 45 44 5 46 41 ...
df <- data.frame(url)
head(df)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
#Group data by state
comp <- df %>%
    group_by(State) %>%
    tally() %>%
    arrange(desc(n))
head(comp)
## # A tibble: 6 × 2
##    State     n
##   <fctr> <int>
## 1     CA   701
## 2     TX   387
## 3     NY   311
## 4     VA   283
## 5     FL   282
## 6     IL   273
  1. Create a graph that shows the distribution of companies in the dataset by state
ggplot(data=comp, aes(x=reorder(State, -n), y=n)) + 
    geom_bar(stat = 'identity', fill = 'chocolate') +
    ggtitle("Distrubution of Companies by State") + 
    ylab("Number of Companies") + 
    xlab("State") +
    coord_flip()

  1. The state with the 3rd most companies in the data set. Imagine you work for the sate and are interested in how many people are employed by companies in different industries employ. Create a plot of average employment by industry for companies in this state (only use cases with full data (use R’s complete.cases()function). Your graph should show how variable the ranges are and exclude outliers.
head(df)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
#Subset data to extract NY the state with the 3rd most companeis
ny_df <- subset(df, State=='NY')
head(ny_df)
##    Rank                      Name Growth_Rate  Revenue
## 26   26              BeenVerified       84.43 13700000
## 30   30                  Sailthru       73.22  8100000
## 37   37              YellowHammer       67.40 18000000
## 38   38                 Conductor       67.02  7100000
## 48   48 Cinium Financial Services       53.65  5900000
## 70   70                  33Across       44.99 27900000
##                        Industry Employees      City State
## 26 Consumer Products & Services        17  New York    NY
## 30      Advertising & Marketing        79  New York    NY
## 37      Advertising & Marketing        27  New York    NY
## 38      Advertising & Marketing        89  New York    NY
## 48           Financial Services        32 Rock Hill    NY
## 70      Advertising & Marketing        75  New York    NY
#List rows of data with missing values
ny_df[!complete.cases(ny_df), ]
## [1] Rank        Name        Growth_Rate Revenue     Industry    Employees  
## [7] City        State      
## <0 rows> (or 0-length row.names)
#Examine the data for outliers
summary(ny_df)
##       Rank                        Name      Growth_Rate    
##  Min.   :  26   1st Equity          :  1   Min.   : 0.350  
##  1st Qu.:1186   33Across            :  1   1st Qu.: 0.670  
##  Median :2702   5Linx Enterprises   :  1   Median : 1.310  
##  Mean   :2612   Access Display Group:  1   Mean   : 4.371  
##  3rd Qu.:4005   Adafruit            :  1   3rd Qu.: 3.580  
##  Max.   :4981   AdCorp Media Group  :  1   Max.   :84.430  
##                 (Other)             :305                   
##     Revenue                                  Industry     Employees      
##  Min.   :2.000e+06   Advertising & Marketing     : 57   Min.   :    1.0  
##  1st Qu.:4.300e+06   IT Services                 : 43   1st Qu.:   21.0  
##  Median :8.800e+06   Business Products & Services: 26   Median :   45.0  
##  Mean   :5.872e+07   Consumer Products & Services: 17   Mean   :  271.3  
##  3rd Qu.:2.570e+07   Telecommunications          : 17   3rd Qu.:  105.5  
##  Max.   :4.600e+09   Education                   : 14   Max.   :32000.0  
##                      (Other)                     :137                    
##         City         State    
##  New York :160   NY     :311  
##  Brooklyn : 15   AK     :  0  
##  Rochester:  9   AL     :  0  
##  Buffalo  :  5   AR     :  0  
##  Fairport :  5   AZ     :  0  
##  new york :  5   CA     :  0  
##  (Other)  :112   (Other):  0
qplot(ny_df$Employees, geom = "histogram", binwidth = 1000)

#Remove outliers
nydf_sub <- subset(ny_df, Employees<45) #At 45 employees the distribution is normal

summary(nydf_sub$Employees)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   12.00   21.00   21.09   28.75   44.00

Averages

mean_industry <- aggregate(nydf_sub[, 6], list(nydf_sub$Industry), mean)
mean_industry$x <- ceiling(mean_industry$x)
#mean_industry <- rename(mean_industry,c("Group.1" = "Industry", "x" = "Average"))
head(mean_industry)
##                        Group.1  x
## 1      Advertising & Marketing 24
## 2 Business Products & Services 23
## 3            Computer Hardware 44
## 4                 Construction 20
## 5 Consumer Products & Services 16
## 6                    Education 26

Plot data

ggplot(mean_industry, aes(x=reorder(Group.1, x), x)) + 
    geom_bar(stat = 'identity', fill = 'chocolate') +
    coord_flip() + 
    ggtitle("Average Employment by Industry") + 
    xlab("Industry") +
    ylab("Average Employment")

  1. Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart to make this clear.
#Create df with revenue and industry
ind_rev <- df[, 4:5]

#Sum revenue per industry
revenue <- aggregate(df[, 1], list(df$Industry), sum)
revenue <- arrange(revenue, desc(x)) #Arrange in descending order
head(revenue)
##                        Group.1       x
## 1                  IT Services 1862354
## 2 Business Products & Services 1387216
## 3      Advertising & Marketing 1110956
## 4                       Health  857590
## 5                     Software  796243
## 6                Manufacturing  768151
summary(revenue$x) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  120000  193700  399900  500400  611700 1862000
sub_revenue <- subset(revenue, x<1860000) #Remove outlier
summary(sub_revenue$x)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  120000  190800  373500  443700  523500 1387000

Revenue by Industry plot

ggplot(revenue, aes(x=reorder(Group.1, x), x)) + 
    geom_bar(stat = 'identity', fill = 'chocolate') +
    coord_flip() + 
    ggtitle("Total Revenue by Industry") + 
    xlab("Industry") +
    ylab("Revenue")

References