Load library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
Import and group data
#Read CSV into R
url <- read.csv('https://raw.githubusercontent.com/indianspice/IS608/master/Assign1/inc5000_data.csv',
stringsAsFactors = TRUE)
#Examine the data
head(url)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
str(url)
## 'data.frame': 5001 obs. of 8 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 5001 levels "(Add)ventures",..: 1770 1633 4423 690 1198 2839 4733 1468 1869 4968 ...
## $ Growth_Rate: num 421 248 245 233 213 ...
## $ Revenue : num 1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
## $ Industry : Factor w/ 25 levels "Advertising & Marketing",..: 5 12 13 7 1 20 10 1 5 21 ...
## $ Employees : int 104 51 132 50 220 63 27 75 97 15 ...
## $ City : Factor w/ 1519 levels "Acton","Addison",..: 391 365 635 2 139 66 912 1179 131 1418 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 5 47 10 45 20 45 44 5 46 41 ...
df <- data.frame(url)
head(df)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
#Group data by state
comp <- df %>%
group_by(State) %>%
tally() %>%
arrange(desc(n))
head(comp)
## # A tibble: 6 × 2
## State n
## <fctr> <int>
## 1 CA 701
## 2 TX 387
## 3 NY 311
## 4 VA 283
## 5 FL 282
## 6 IL 273
- Create a graph that shows the distribution of companies in the dataset by state
ggplot(data=comp, aes(x=reorder(State, -n), y=n)) +
geom_bar(stat = 'identity', fill = 'chocolate') +
ggtitle("Distrubution of Companies by State") +
ylab("Number of Companies") +
xlab("State") +
coord_flip()

- The state with the 3rd most companies in the data set. Imagine you work for the sate and are interested in how many people are employed by companies in different industries employ. Create a plot of average employment by industry for companies in this state (only use cases with full data (use R’s complete.cases()function). Your graph should show how variable the ranges are and exclude outliers.
head(df)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
#Subset data to extract NY the state with the 3rd most companeis
ny_df <- subset(df, State=='NY')
head(ny_df)
## Rank Name Growth_Rate Revenue
## 26 26 BeenVerified 84.43 13700000
## 30 30 Sailthru 73.22 8100000
## 37 37 YellowHammer 67.40 18000000
## 38 38 Conductor 67.02 7100000
## 48 48 Cinium Financial Services 53.65 5900000
## 70 70 33Across 44.99 27900000
## Industry Employees City State
## 26 Consumer Products & Services 17 New York NY
## 30 Advertising & Marketing 79 New York NY
## 37 Advertising & Marketing 27 New York NY
## 38 Advertising & Marketing 89 New York NY
## 48 Financial Services 32 Rock Hill NY
## 70 Advertising & Marketing 75 New York NY
#List rows of data with missing values
ny_df[!complete.cases(ny_df), ]
## [1] Rank Name Growth_Rate Revenue Industry Employees
## [7] City State
## <0 rows> (or 0-length row.names)
#Examine the data for outliers
summary(ny_df)
## Rank Name Growth_Rate
## Min. : 26 1st Equity : 1 Min. : 0.350
## 1st Qu.:1186 33Across : 1 1st Qu.: 0.670
## Median :2702 5Linx Enterprises : 1 Median : 1.310
## Mean :2612 Access Display Group: 1 Mean : 4.371
## 3rd Qu.:4005 Adafruit : 1 3rd Qu.: 3.580
## Max. :4981 AdCorp Media Group : 1 Max. :84.430
## (Other) :305
## Revenue Industry Employees
## Min. :2.000e+06 Advertising & Marketing : 57 Min. : 1.0
## 1st Qu.:4.300e+06 IT Services : 43 1st Qu.: 21.0
## Median :8.800e+06 Business Products & Services: 26 Median : 45.0
## Mean :5.872e+07 Consumer Products & Services: 17 Mean : 271.3
## 3rd Qu.:2.570e+07 Telecommunications : 17 3rd Qu.: 105.5
## Max. :4.600e+09 Education : 14 Max. :32000.0
## (Other) :137
## City State
## New York :160 NY :311
## Brooklyn : 15 AK : 0
## Rochester: 9 AL : 0
## Buffalo : 5 AR : 0
## Fairport : 5 AZ : 0
## new york : 5 CA : 0
## (Other) :112 (Other): 0
qplot(ny_df$Employees, geom = "histogram", binwidth = 1000)

#Remove outliers
nydf_sub <- subset(ny_df, Employees<45) #At 45 employees the distribution is normal
summary(nydf_sub$Employees)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 12.00 21.00 21.09 28.75 44.00
Averages
mean_industry <- aggregate(nydf_sub[, 6], list(nydf_sub$Industry), mean)
mean_industry$x <- ceiling(mean_industry$x)
#mean_industry <- rename(mean_industry,c("Group.1" = "Industry", "x" = "Average"))
head(mean_industry)
## Group.1 x
## 1 Advertising & Marketing 24
## 2 Business Products & Services 23
## 3 Computer Hardware 44
## 4 Construction 20
## 5 Consumer Products & Services 16
## 6 Education 26
Plot data
ggplot(mean_industry, aes(x=reorder(Group.1, x), x)) +
geom_bar(stat = 'identity', fill = 'chocolate') +
coord_flip() +
ggtitle("Average Employment by Industry") +
xlab("Industry") +
ylab("Average Employment")

- Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart to make this clear.
#Create df with revenue and industry
ind_rev <- df[, 4:5]
#Sum revenue per industry
revenue <- aggregate(df[, 1], list(df$Industry), sum)
revenue <- arrange(revenue, desc(x)) #Arrange in descending order
head(revenue)
## Group.1 x
## 1 IT Services 1862354
## 2 Business Products & Services 1387216
## 3 Advertising & Marketing 1110956
## 4 Health 857590
## 5 Software 796243
## 6 Manufacturing 768151
summary(revenue$x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 120000 193700 399900 500400 611700 1862000
sub_revenue <- subset(revenue, x<1860000) #Remove outlier
summary(sub_revenue$x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 120000 190800 373500 443700 523500 1387000
Revenue by Industry plot
ggplot(revenue, aes(x=reorder(Group.1, x), x)) +
geom_bar(stat = 'identity', fill = 'chocolate') +
coord_flip() +
ggtitle("Total Revenue by Industry") +
xlab("Industry") +
ylab("Revenue")
