The following questions pertain to data on the 5,000 fastest-growing companies in the US, as compiled by Inc. magazine. The data is available as a CSV file at the URL used in the download step below.
# Fetch the Inc. 5000 CSV once and cache it in the working directory; later
# runs reuse the local copy instead of hitting the network again.
data_url <- "https://raw.githubusercontent.com/jlaurito/CUNY_IS608/master/lecture1/data/inc5000_data.csv"
data_file <- "inc5000_data.csv"
if (!file.exists(data_file)) {
  # R's default download method is used instead of method = "curl", which
  # requires an external curl binary on the PATH.
  status <- download.file(data_url, data_file)
  if (status != 0) {
    # Do not leave a partial file behind that the next run would trust.
    unlink(data_file)
    stop("Download of ", data_url, " failed (status ", status, ")",
         call. = FALSE)
  }
}
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too,
# which is what the str() output recorded below shows.
inc5000 <- read.csv(data_file, na.strings = "NA", stringsAsFactors = TRUE)
# check the header and structure
head(inc5000)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
str(inc5000)
## 'data.frame': 5001 obs. of 8 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 5001 levels "(Add)ventures",..: 1770 1633 4423 690 1198 2839 4733 1468 1869 4968 ...
## $ Growth_Rate: num 421 248 245 233 213 ...
## $ Revenue : num 1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
## $ Industry : Factor w/ 25 levels "Advertising & Marketing",..: 5 12 13 7 1 20 10 1 5 21 ...
## $ Employees : int 104 51 132 50 220 63 27 75 97 15 ...
## $ City : Factor w/ 1519 levels "Acton","Addison",..: 391 365 635 2 139 66 912 1179 131 1418 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 5 47 10 45 20 45 44 5 46 41 ...
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Horizontal bar chart of the number of companies per state.
# The plot object is called state_plot rather than `c` so that it does not
# shadow base::c() for code that passes `c` around as a value.
state_plot <- ggplot(inc5000, aes(factor(State))) +
  geom_bar(fill = "#009E73") +
  coord_flip() +
  labs(title = "Number of Companies by State", x = "State", y = "Count") +
  theme(
    text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 22)
  )
state_plot
# Hand the plot to ggsave() explicitly instead of relying on last_plot().
ggsave("q1.png", plot = state_plot, height = 11, width = 8.5)
# Tabulate companies per state as a two-column data frame (State, Count)
counts <- as.data.frame(table(State = inc5000$State), responseName = "Count")
head(counts) # check structure
## State Count
## 1 AK 2
## 2 AL 51
## 3 AR 9
## 4 AZ 100
## 5 CA 701
## 6 CO 134
# x is the third-largest count; show every state that has exactly that count
x <- sort(counts$Count, decreasing = TRUE)[3]
counts %>% filter(Count == x)
## State Count
## 1 NY 311
# Restrict the original dataset to New York companies, then drop every row
# that has a missing value in any column
ny_data <- inc5000 %>% filter(State == "NY")
ny_data <- ny_data[complete.cases(ny_data), , drop = FALSE]
str(ny_data)
## 'data.frame': 311 obs. of 8 variables:
## $ Rank : int 26 30 37 38 48 70 71 124 126 153 ...
## $ Name : Factor w/ 5001 levels "(Add)ventures",..: 529 3822 4972 1037 912 19 2608 3591 3684 3668 ...
## $ Growth_Rate: num 84.4 73.2 67.4 67 53.6 ...
## $ Revenue : num 13700000 8100000 18000000 7100000 5900000 27900000 6900000 11500000 9800000 15400000 ...
## $ Industry : Factor w/ 25 levels "Advertising & Marketing",..: 5 1 1 1 10 1 1 24 21 25 ...
## $ Employees : int 17 79 27 89 32 75 42 28 17 42 ...
## $ City : Factor w/ 1519 levels "Acton","Addison",..: 929 929 929 929 1135 929 929 929 574 162 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 35 35 35 35 35 35 35 35 35 35 ...
# Mean number of employees per industry among the NY companies.
# The means are computed up front and drawn with stat = "identity"; this
# avoids the summary stat's `fun.y` argument, which ggplot2 has deprecated
# in favour of `fun`, and works the same across ggplot2 versions.
ny_avg_employees <- ny_data %>%
  group_by(Industry) %>%
  summarise(avg_employees = mean(Employees))
# All bars share one constant fill, so no fill aesthetic (and therefore no
# legend) is needed.
d <- ggplot(ny_avg_employees, aes(Industry, avg_employees)) +
  geom_bar(stat = "identity", fill = "sky blue") +
  coord_flip() +
  labs(
    title = "Average Number of Employees by Industry in NY",
    x = "Industry",
    y = "Average Number of Employees"
  ) +
  theme(
    text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 17)
  )
d
# Hand the plot to ggsave() explicitly instead of relying on last_plot().
ggsave("q2.png", plot = d, height = 8.5, width = 9)
# Keep only the fully observed rows and attach revenue per employee
# (Revenue / Employees) as a new column, rev_per_em
rev_data <- mutate(
  inc5000[complete.cases(inc5000), ],
  rev_per_em = Revenue / Employees
)
str(rev_data)
## 'data.frame': 4989 obs. of 9 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 5001 levels "(Add)ventures",..: 1770 1633 4423 690 1198 2839 4733 1468 1869 4968 ...
## $ Growth_Rate: num 421 248 245 233 213 ...
## $ Revenue : num 1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
## $ Industry : Factor w/ 25 levels "Advertising & Marketing",..: 5 12 13 7 1 20 10 1 5 21 ...
## $ Employees : int 104 51 132 50 220 63 27 75 97 15 ...
## $ City : Factor w/ 1519 levels "Acton","Addison",..: 391 365 635 2 139 66 912 1179 131 1418 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 5 47 10 45 20 45 44 5 46 41 ...
## $ rev_per_em : num 1133654 972549 193182 38000000 395455 ...
head(rev_data)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State rev_per_em
## 1 Consumer Products & Services 104 El Segundo CA 1133653.8
## 2 Government Services 51 Dumfries VA 972549.0
## 3 Health 132 Jacksonville FL 193181.8
## 4 Energy 50 Addison TX 38000000.0
## 5 Advertising & Marketing 220 Boston MA 395454.5
## 6 Real Estate 63 Austin TX 725396.8
# Plot average revenue/employee by industry.
# The means are computed up front and drawn with stat = "identity"; this
# avoids the summary stat's `fun.y` argument, which ggplot2 has deprecated
# in favour of `fun`, and works the same across ggplot2 versions.
avg_rev_by_industry <- rev_data %>%
  # ggplot2 stats discard non-finite values before summarising; filtering
  # here keeps the averages identical should any ratio be Inf/NaN
  # (e.g. a row with 0 employees, if the data contains one).
  filter(is.finite(rev_per_em)) %>%
  group_by(Industry) %>%
  summarise(avg_rev_per_em = mean(rev_per_em))
# All bars share one constant fill, so no fill aesthetic (and therefore no
# legend) is needed.
e <- ggplot(avg_rev_by_industry, aes(Industry, avg_rev_per_em)) +
  geom_bar(stat = "identity", fill = "darkslateblue") +
  coord_flip() +
  labs(
    title = "Average Revenue per Employees by Industry",
    x = "Industry",
    y = "Average Revenue per Employees"
  ) +
  theme(
    text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 17)
  )
e
# Hand the plot to ggsave() explicitly instead of relying on last_plot().
ggsave("q3.png", plot = e, height = 8.5, width = 9)