library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
#Read CSV into R
url <- read.csv('https://raw.githubusercontent.com/indianspice/IS608/master/Assign1/inc5000_data.csv',
stringsAsFactors = TRUE)
#Examine the data
head(url)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
str(url)
## 'data.frame': 5001 obs. of 8 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 5001 levels "(Add)ventures",..: 1770 1633 4423 690 1198 2839 4733 1468 1869 4968 ...
## $ Growth_Rate: num 421 248 245 233 213 ...
## $ Revenue : num 1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
## $ Industry : Factor w/ 25 levels "Advertising & Marketing",..: 5 12 13 7 1 20 10 1 5 21 ...
## $ Employees : int 104 51 132 50 220 63 27 75 97 15 ...
## $ City : Factor w/ 1519 levels "Acton","Addison",..: 391 365 635 2 139 66 912 1179 131 1418 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 5 47 10 45 20 45 44 5 46 41 ...
df <- data.frame(url)
head(df)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
#Group data by state
comp <- df %>%
group_by(State) %>%
tally() %>%
arrange(desc(n))
head(comp)
## # A tibble: 6 × 2
## State n
## <fctr> <int>
## 1 CA 701
## 2 TX 387
## 3 NY 311
## 4 VA 283
## 5 FL 282
## 6 IL 273
ggplot(data=comp, aes(x=reorder(State, -n), y=n)) +
geom_bar(stat = 'identity', fill = 'chocolate') +
ggtitle("Distrubution of Companies by State") +
ylab("Number of Companies") +
xlab("State") +
coord_flip()
head(df)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
#Subset data to extract NY the state with the 3rd most companeis
ny_df <- subset(df, State=='NY')
head(ny_df)
## Rank Name Growth_Rate Revenue
## 26 26 BeenVerified 84.43 13700000
## 30 30 Sailthru 73.22 8100000
## 37 37 YellowHammer 67.40 18000000
## 38 38 Conductor 67.02 7100000
## 48 48 Cinium Financial Services 53.65 5900000
## 70 70 33Across 44.99 27900000
## Industry Employees City State
## 26 Consumer Products & Services 17 New York NY
## 30 Advertising & Marketing 79 New York NY
## 37 Advertising & Marketing 27 New York NY
## 38 Advertising & Marketing 89 New York NY
## 48 Financial Services 32 Rock Hill NY
## 70 Advertising & Marketing 75 New York NY
#List rows of data with missing values
ny_df[!complete.cases(ny_df), ]
## [1] Rank Name Growth_Rate Revenue Industry Employees
## [7] City State
## <0 rows> (or 0-length row.names)
#Examine the data for outliers
summary(ny_df)
## Rank Name Growth_Rate
## Min. : 26 1st Equity : 1 Min. : 0.350
## 1st Qu.:1186 33Across : 1 1st Qu.: 0.670
## Median :2702 5Linx Enterprises : 1 Median : 1.310
## Mean :2612 Access Display Group: 1 Mean : 4.371
## 3rd Qu.:4005 Adafruit : 1 3rd Qu.: 3.580
## Max. :4981 AdCorp Media Group : 1 Max. :84.430
## (Other) :305
## Revenue Industry Employees
## Min. :2.000e+06 Advertising & Marketing : 57 Min. : 1.0
## 1st Qu.:4.300e+06 IT Services : 43 1st Qu.: 21.0
## Median :8.800e+06 Business Products & Services: 26 Median : 45.0
## Mean :5.872e+07 Consumer Products & Services: 17 Mean : 271.3
## 3rd Qu.:2.570e+07 Telecommunications : 17 3rd Qu.: 105.5
## Max. :4.600e+09 Education : 14 Max. :32000.0
## (Other) :137
## City State
## New York :160 NY :311
## Brooklyn : 15 AK : 0
## Rochester: 9 AL : 0
## Buffalo : 5 AR : 0
## Fairport : 5 AZ : 0
## new york : 5 CA : 0
## (Other) :112 (Other): 0
qplot(ny_df$Employees, geom = "histogram", binwidth = 1000)
#Remove outliers
nydf_sub <- subset(ny_df, Employees<45) #At 45 employees the distribution is normal
summary(nydf_sub$Employees)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 12.00 21.00 21.09 28.75 44.00
mean_industry <- aggregate(nydf_sub[, 6], list(nydf_sub$Industry), mean)
mean_industry$x <- ceiling(mean_industry$x)
#mean_industry <- rename(mean_industry,c("Group.1" = "Industry", "x" = "Average"))
head(mean_industry)
## Group.1 x
## 1 Advertising & Marketing 24
## 2 Business Products & Services 23
## 3 Computer Hardware 44
## 4 Construction 20
## 5 Consumer Products & Services 16
## 6 Education 26
ggplot(mean_industry, aes(x=reorder(Group.1, x), x)) +
geom_bar(stat = 'identity', fill = 'chocolate') +
coord_flip() +
ggtitle("Average Employment by Industry") +
xlab("Industry") +
ylab("Average Employment")
#Create df with revenue and industry
ind_rev <- df[, 4:5]
#Sum revenue per industry
revenue <- aggregate(df[, 1], list(df$Industry), sum)
revenue <- arrange(revenue, desc(x)) #Arrange in descending order
head(revenue)
## Group.1 x
## 1 IT Services 1862354
## 2 Business Products & Services 1387216
## 3 Advertising & Marketing 1110956
## 4 Health 857590
## 5 Software 796243
## 6 Manufacturing 768151
summary(revenue$x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 120000 193700 399900 500400 611700 1862000
sub_revenue <- subset(revenue, x<1860000) #Remove outlier
summary(sub_revenue$x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 120000 190800 373500 443700 523500 1387000
ggplot(revenue, aes(x=reorder(Group.1, x), x)) +
geom_bar(stat = 'identity', fill = 'chocolate') +
coord_flip() +
ggtitle("Total Revenue by Industry") +
xlab("Industry") +
ylab("Revenue")