The following questions pertain to data on the 5,000 fastest-growing companies in the US, as compiled by Inc. magazine. The data is available at https://raw.githubusercontent.com/jlaurito/CUNY_IS608/master/lecture1/data/inc5000_data.csv

# Fetch the Inc. 5000 CSV into the working directory, then load it
download.file(
  url = "https://raw.githubusercontent.com/jlaurito/CUNY_IS608/master/lecture1/data/inc5000_data.csv",
  destfile = "inc5000_data.csv",
  method = "curl"
)
inc5000 <- read.csv(file = "inc5000_data.csv", na.strings = "NA")
# Preview the first rows of the loaded data
head(inc5000)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
# Show each column's type along with a preview of its values
str(inc5000)
## 'data.frame':    5001 obs. of  8 variables:
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name       : Factor w/ 5001 levels "(Add)ventures",..: 1770 1633 4423 690 1198 2839 4733 1468 1869 4968 ...
##  $ Growth_Rate: num  421 248 245 233 213 ...
##  $ Revenue    : num  1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
##  $ Industry   : Factor w/ 25 levels "Advertising & Marketing",..: 5 12 13 7 1 20 10 1 5 21 ...
##  $ Employees  : int  104 51 132 50 220 63 27 75 97 15 ...
##  $ City       : Factor w/ 1519 levels "Acton","Addison",..: 391 365 635 2 139 66 912 1179 131 1418 ...
##  $ State      : Factor w/ 52 levels "AK","AL","AR",..: 5 47 10 45 20 45 44 5 46 41 ...
  1. Create a graph that shows the distribution of companies in the dataset by State (i.e., how many are in each state). There are a lot of States, so consider which axis you should use, assuming I am using a ‘portrait’-oriented screen.
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Horizontal bar chart of company counts per state; flipping the axes puts the
# many state labels on the vertical axis, which suits a portrait page.
c <- ggplot(inc5000, aes(factor(State))) +
  geom_bar(fill = "#009E73") +
  coord_flip() +
  labs(title = "Number of Companies by State", x = "State", y = "Count") +
  theme(
    text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 22)
  )
c

# Save the plot just displayed at portrait letter size
ggsave("q1.png", width = 8.5, height = 11)
  2. For the State with the 3rd most companies, create a plot of average employment by industry for companies in this state (only use cases with full data). Your graph should show how variable the ranges are, and exclude outliers.
# Tabulate the number of companies per state as a two-column data frame
counts <- setNames(
  as.data.frame(table(inc5000$State)),
  c("State", "Count")
)
head(counts) # check structure
##   State Count
## 1    AK     2
## 2    AL    51
## 3    AR     9
## 4    AZ   100
## 5    CA   701
## 6    CO   134
# Third-largest company count, then the state(s) that have exactly that count
x <- sort(counts$Count, decreasing = TRUE)[3]
counts %>% filter(Count == x)
##   State Count
## 1    NY   311
# Keep only the NY rows of the original dataset, then drop rows with any
# missing value
ny_data <- inc5000 %>% filter(State == "NY")
ny_data <- ny_data[complete.cases(ny_data), ]
str(ny_data)
## 'data.frame':    311 obs. of  8 variables:
##  $ Rank       : int  26 30 37 38 48 70 71 124 126 153 ...
##  $ Name       : Factor w/ 5001 levels "(Add)ventures",..: 529 3822 4972 1037 912 19 2608 3591 3684 3668 ...
##  $ Growth_Rate: num  84.4 73.2 67.4 67 53.6 ...
##  $ Revenue    : num  13700000 8100000 18000000 7100000 5900000 27900000 6900000 11500000 9800000 15400000 ...
##  $ Industry   : Factor w/ 25 levels "Advertising & Marketing",..: 5 1 1 1 10 1 1 24 21 25 ...
##  $ Employees  : int  17 79 27 89 32 75 42 28 17 42 ...
##  $ City       : Factor w/ 1519 levels "Acton","Addison",..: 929 929 929 929 1135 929 929 929 574 162 ...
##  $ State      : Factor w/ 52 levels "AK","AL","AR",..: 35 35 35 35 35 35 35 35 35 35 ...
# Plot NY employment by industry so that both the typical company size (mean)
# and the spread within each industry (box and whiskers) are visible, with
# outliers excluded as the question requires.
#
# Outliers are dropped per industry using Tukey's fences (more than 1.5 * IQR
# beyond the quartiles) BEFORE the means are computed, so a handful of very
# large employers neither stretch the axis nor inflate an industry's average.
ny_trimmed <- ny_data %>%
  group_by(Industry) %>%
  filter(
    Employees >= quantile(Employees, 0.25) - 1.5 * IQR(Employees),
    Employees <= quantile(Employees, 0.75) + 1.5 * IQR(Employees)
  ) %>%
  ungroup()

# Pre-compute the per-industry means instead of using stat = "summary" with
# `fun.y`, which has been deprecated since ggplot2 3.3.0.
ny_means <- ny_trimmed %>%
  group_by(Industry) %>%
  summarise(mean_employees = mean(Employees))

# outlier.shape = NA hides points that the boxplot re-flags within the
# already-trimmed data; the diamond marks each industry's mean.
d <- ggplot(ny_trimmed, aes(Industry, Employees)) +
  geom_boxplot(fill = "sky blue", outlier.shape = NA) +
  geom_point(data = ny_means, aes(y = mean_employees), shape = 18, size = 3)
d <- d + coord_flip()
d <- d + theme(legend.position = "none")
d <- d + theme(
  text = element_text(size = 12),
  axis.title = element_text(size = 14, face = "bold")
)
d <- d + labs(
  title = "Employees by Industry in NY (Outliers Excluded)",
  x = "Industry",
  y = "Number of Employees (diamond = mean)"
)
d <- d + theme(plot.title = element_text(size = 17))
d

# Pass the plot explicitly rather than relying on last_plot()
ggsave("q2.png", plot = d, height = 8.5, width = 9)
  3. Generate a chart showing which industries generate the most revenue per employee.
# Drop rows with any missing value, then add revenue per employee
# (Revenue / Employees) as a new column
rev_data <- mutate(
  inc5000[complete.cases(inc5000), ],
  rev_per_em = Revenue / Employees
)
str(rev_data)
## 'data.frame':    4989 obs. of  9 variables:
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name       : Factor w/ 5001 levels "(Add)ventures",..: 1770 1633 4423 690 1198 2839 4733 1468 1869 4968 ...
##  $ Growth_Rate: num  421 248 245 233 213 ...
##  $ Revenue    : num  1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
##  $ Industry   : Factor w/ 25 levels "Advertising & Marketing",..: 5 12 13 7 1 20 10 1 5 21 ...
##  $ Employees  : int  104 51 132 50 220 63 27 75 97 15 ...
##  $ City       : Factor w/ 1519 levels "Acton","Addison",..: 391 365 635 2 139 66 912 1179 131 1418 ...
##  $ State      : Factor w/ 52 levels "AK","AL","AR",..: 5 47 10 45 20 45 44 5 46 41 ...
##  $ rev_per_em : num  1133654 972549 193182 38000000 395455 ...
# Preview the first rows, including the rev_per_em column
head(rev_data)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State rev_per_em
## 1 Consumer Products & Services       104   El Segundo    CA  1133653.8
## 2          Government Services        51     Dumfries    VA   972549.0
## 3                       Health       132 Jacksonville    FL   193181.8
## 4                       Energy        50      Addison    TX 38000000.0
## 5      Advertising & Marketing       220       Boston    MA   395454.5
## 6                  Real Estate        63       Austin    TX   725396.8
# Plot average revenue per employee by industry, sorted so that the industries
# generating the most revenue per employee are immediately identifiable.
#
# The per-industry mean is computed up front instead of via stat = "summary"
# with `fun.y`, which has been deprecated since ggplot2 3.3.0. Non-finite
# ratios (e.g. a company reporting 0 employees) are dropped first because a
# single Inf would make the whole industry's mean Inf and blank its bar.
industry_rev <- rev_data %>%
  filter(is.finite(rev_per_em)) %>%
  group_by(Industry) %>%
  summarise(mean_rev_per_em = mean(rev_per_em))

# reorder() sorts the bars by their value; after coord_flip() the largest
# average appears at the top of the chart.
e <- ggplot(
  industry_rev,
  aes(reorder(Industry, mean_rev_per_em), mean_rev_per_em)
) +
  geom_bar(stat = "identity", fill = "darkslateblue")
e <- e + coord_flip()
e <- e + theme(legend.position = "none")
e <- e + theme(
  text = element_text(size = 12),
  axis.title = element_text(size = 14, face = "bold")
)
e <- e + labs(
  title = "Average Revenue per Employees by Industry",
  x = "Industry",
  y = "Average Revenue per Employees"
)
e <- e + theme(plot.title = element_text(size = 17))
e

# Pass the plot explicitly rather than relying on last_plot()
ggsave("q3.png", plot = e, height = 8.5, width = 9)