Develop visualizations using this dataset, which consists of the 5,000 fastest-growing companies in the US.
After loading the data, we develop some summaries of the dataset.
# Compact overview: one line per column showing its type and first values
incmag_data %>% glimpse()
## Observations: 5,001
## Variables: 8
## $ Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Name <fctr> Fuhu, FederalConference.com, The HCI Group, Bridg...
## $ Growth_Rate <dbl> 421.48, 248.31, 245.45, 233.08, 213.37, 179.38, 17...
## $ Revenue <dbl> 1.179e+08, 4.960e+07, 2.550e+07, 1.900e+09, 8.700e...
## $ Industry <fctr> Consumer Products & Services, Government Services...
## $ Employees <int> 104, 51, 132, 50, 220, 63, 27, 75, 97, 15, 149, 16...
## $ City <fctr> El Segundo, Dumfries, Jacksonville, Addison, Bost...
## $ State <fctr> CA, VA, FL, TX, MA, TX, TN, CA, UT, RI, VA, CA, F...
# Per-column summary: quantiles for numeric columns, top counts for factors
incmag_data %>% summary()
## Rank Name Growth_Rate
## Min. : 1 (Add)ventures : 1 Min. : 0.340
## 1st Qu.:1252 @Properties : 1 1st Qu.: 0.770
## Median :2502 1-Stop Translation USA: 1 Median : 1.420
## Mean :2502 110 Consulting : 1 Mean : 4.612
## 3rd Qu.:3751 11thStreetCoffee.com : 1 3rd Qu.: 3.290
## Max. :5000 123 Exteriors : 1 Max. :421.480
## (Other) :4995
## Revenue Industry Employees
## Min. :2.000e+06 IT Services : 733 Min. : 1.0
## 1st Qu.:5.100e+06 Business Products & Services: 482 1st Qu.: 25.0
## Median :1.090e+07 Advertising & Marketing : 471 Median : 53.0
## Mean :4.822e+07 Health : 355 Mean : 232.7
## 3rd Qu.:2.860e+07 Software : 342 3rd Qu.: 132.0
## Max. :1.010e+10 Financial Services : 260 Max. :66803.0
## (Other) :2358 NA's :12
## City State
## New York : 160 CA : 701
## Chicago : 90 TX : 387
## Austin : 88 NY : 311
## Houston : 76 VA : 283
## San Francisco: 75 FL : 282
## Atlanta : 74 IL : 273
## (Other) :4438 (Other):2764
# Peek at the first rows of the raw data
incmag_data %>% head()
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
Create a graph that shows the distribution of companies in the dataset by State
# Figure 1: number of firms in each state, drawn as horizontal bars
# ordered by firm count
p <- incmag_data %>%
  count(State) %>%
  ggplot(aes(x = reorder(State, n), y = n))
p +
  geom_bar(stat = "identity", fill = "darkblue") +
  labs(x = "State", y = "# Firms", title = "Number of Firms by State") +
  coord_flip() +
  theme(axis.text.y = element_text(size = 8))
# Same chart following Tufte's guidance on minimalism: black-and-white theme,
# semi-transparent bars and slightly smaller state labels
p <- incmag_data %>%
  count(State) %>%
  ggplot(aes(x = reorder(State, n), y = n))
p +
  geom_bar(stat = "identity", fill = "darkblue", alpha = 0.7) +
  labs(x = "State", y = "# Firms", title = "Number of Firms by State") +
  coord_flip() +
  theme_bw() +
  theme(axis.text.y = element_text(size = 7))
Let’s dig into the state with the third-most companies in the dataset:
Create a plot of average employment by industry for companies in this state.
Only use cases with full data. Your graph should show how variable the ranges are, and it should exclude outliers.
# Keep only the firms from the state with the 3rd most companies (NY)
select_data <- incmag_data %>% filter(State == "NY")
# Flag rows with no missing values in any column, then keep only those rows
select_data[["complete"]] <- complete.cases(select_data)
select_data <- select_data %>% filter(complete)
# Trim outliers: drop the rows in the top and bottom 2.5% by employee count.
# NOTE(review): the rank is built from the whole Employees column, so it is
# computed across all NY firms rather than within each Industry group --
# confirm whether per-industry trimming was intended.
select_data <- group_by(select_data, Industry)
select_data[["rank"]] <- percent_rank(select_data[["Employees"]])
select_data <- select_data %>% filter(rank > 0.025, rank < 0.975)
# Sanity check: the remaining ranks should lie strictly inside (0.025, 0.975)
summary(select_data[["rank"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02903 0.26130 0.49680 0.49730 0.73230 0.97420
# Box plot of employees per industry for the filtered NY firms
p <- ggplot(select_data, aes(Industry, Employees, color = Industry))
# One box per industry; hide the redundant colour legend, rotate the industry
# labels so they do not overlap, and place a y-axis break every 200 employees
p1 <- p +
  geom_boxplot() +
  guides(colour = "none") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  scale_y_continuous(breaks = seq(0, 1600, 200))
# Mark each industry's mean with a white-filled diamond. `fun` replaces the
# `fun.y` argument, which ggplot2 deprecated in version 3.3.0.
p1 +
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 23,
    size = 2,
    fill = "white"
  )
Create a chart showing revenue per employee
# Revenue per employee for each firm, expressed in thousands of dollars,
# with the data grouped by Industry for the summary below
select_data <- select_data %>%
  mutate(rev_emp = (Revenue / Employees) / 1000) %>%
  group_by(Industry)
# Mean revenue per employee within each industry
revenue_data <- select_data %>% summarize(mrev = mean(rev_emp))
# Dot plot of the industry means, with industries ordered by their mean
ggplot(revenue_data, aes(x = mrev, y = reorder(Industry, mrev))) +
  geom_point(color = "red", size = 2) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(100, 800, 100)) +
  labs(
    x = "Revenue Per Employee (in $ thousands)",
    y = "Industry",
    title = "Mean Revenue per Employee by Industry"
  )