Home Work 1 Data 608

Develop visualizations using this dataset consisting of the 5000 fastest growing companies in the US.

After loading the data we develop some summaries of the dataset.

# Compact overview of the data: one line per column with its type and first values
glimpse(incmag_data)
## Observations: 5,001
## Variables: 8
## $ Rank        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Name        <fctr> Fuhu, FederalConference.com, The HCI Group, Bridg...
## $ Growth_Rate <dbl> 421.48, 248.31, 245.45, 233.08, 213.37, 179.38, 17...
## $ Revenue     <dbl> 1.179e+08, 4.960e+07, 2.550e+07, 1.900e+09, 8.700e...
## $ Industry    <fctr> Consumer Products & Services, Government Services...
## $ Employees   <int> 104, 51, 132, 50, 220, 63, 27, 75, 97, 15, 149, 16...
## $ City        <fctr> El Segundo, Dumfries, Jacksonville, Addison, Bost...
## $ State       <fctr> CA, VA, FL, TX, MA, TX, TN, CA, UT, RI, VA, CA, F...
# Per-column summary: quartiles/mean for numeric columns, top level counts for factors
summary(incmag_data)
##       Rank                          Name       Growth_Rate     
##  Min.   :   1   (Add)ventures         :   1   Min.   :  0.340  
##  1st Qu.:1252   @Properties           :   1   1st Qu.:  0.770  
##  Median :2502   1-Stop Translation USA:   1   Median :  1.420  
##  Mean   :2502   110 Consulting        :   1   Mean   :  4.612  
##  3rd Qu.:3751   11thStreetCoffee.com  :   1   3rd Qu.:  3.290  
##  Max.   :5000   123 Exteriors         :   1   Max.   :421.480  
##                 (Other)               :4995                    
##     Revenue                                  Industry      Employees      
##  Min.   :2.000e+06   IT Services                 : 733   Min.   :    1.0  
##  1st Qu.:5.100e+06   Business Products & Services: 482   1st Qu.:   25.0  
##  Median :1.090e+07   Advertising & Marketing     : 471   Median :   53.0  
##  Mean   :4.822e+07   Health                      : 355   Mean   :  232.7  
##  3rd Qu.:2.860e+07   Software                    : 342   3rd Qu.:  132.0  
##  Max.   :1.010e+10   Financial Services          : 260   Max.   :66803.0  
##                      (Other)                     :2358   NA's   :12       
##             City          State     
##  New York     : 160   CA     : 701  
##  Chicago      :  90   TX     : 387  
##  Austin       :  88   NY     : 311  
##  Houston      :  76   VA     : 283  
##  San Francisco:  75   FL     : 282  
##  Atlanta      :  74   IL     : 273  
##  (Other)      :4438   (Other):2764
# Print the first six rows to inspect the raw records
head(incmag_data)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX

Question 1

Create a graph that shows the distribution of companies in the dataset by State

# Figure 1: number of firms in each state, shown as a horizontal bar chart.
# count() yields one row per State with the tally in column `n`; reorder()
# sorts the bars by that tally so the largest states are easy to spot.
p <- ggplot(count(incmag_data, State), aes(x = reorder(State, n), y = n))
# geom_col() draws bar heights straight from `n` (no further counting);
# coord_flip() puts the state labels on the vertical axis so they stay legible.
p +
  geom_col(fill = "darkblue") +
  xlab("State") +
  ylab("# Firms") +
  ggtitle("Number of Firms by State") +
  coord_flip() +
  theme(axis.text.y = element_text(size = 8))

# Following Tufte's guidance on minimalism: same chart as above, drawn with
# the plain black-and-white theme, semi-transparent bars and smaller labels.
p <- ggplot(count(incmag_data, State), aes(x = reorder(State, n), y = n))
# geom_col() uses the pre-computed tally in `n` as the bar length
p +
  geom_col(fill = "darkblue", alpha = 0.7) +
  xlab("State") +
  ylab("# Firms") +
  ggtitle("Number of Firms by State") +
  coord_flip() +
  theme_bw() +
  theme(axis.text.y = element_text(size = 7))

Let’s dig in on the State with the 3rd most companies in the data set:

Question 2

Create a plot of average employment by industry for companies in this state,
using only cases with full data. Your graph should show how variable the ranges are, and exclude outliers.

# Build the New York subset (NY has the 3rd most companies in the data set),
# keep complete rows only, and trim outliers on Employees.
select_data <- incmag_data %>%
  # focus on the 3rd state
  filter(State == "NY") %>%
  # remove any row with a missing value in any column
  filter(complete.cases(.)) %>%
  # percentile rank of Employees across ALL remaining NY firms; the trim is
  # deliberately global rather than per industry
  mutate(rank = percent_rank(Employees)) %>%
  # treat the lowest and highest 2.5% as outliers and drop them
  filter(rank < 0.975, rank > 0.025) %>%
  # group by Industry for the per-industry steps that follow
  group_by(Industry)
# check the result: remaining ranks should lie strictly inside (0.025, 0.975)
summary(select_data$rank)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02903 0.26130 0.49680 0.49730 0.73230 0.97420
# Plot the data: distribution of Employees within each Industry
p <- ggplot(select_data, aes(x = Industry, y = Employees, color = Industry))
# Box plots show the spread per industry. The colour legend is dropped because
# the x axis already names each industry; labels are rotated to avoid overlap.
p1 <- p +
  geom_boxplot() +
  guides(colour = "none") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  scale_y_continuous(breaks = seq(0, 1600, 200))
# Overlay each industry's mean as a white diamond.
# `fun` is the current name of the argument formerly called `fun.y`
# (deprecated since ggplot2 3.3.0).
p1 +
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 23,
    size = 2,
    fill = "white"
  )

Question 3

Create a chart showing revenue per employee

# Revenue per employee for every firm, divided by 1000 so the axis reads in
# $ thousands; the result stays grouped by Industry.
select_data <- select_data %>%
  mutate(rev_emp = (Revenue / Employees) / 1000) %>%
  group_by(Industry)
# Mean revenue per employee within each industry
revenue_data <- select_data %>%
  summarize(mrev = mean(rev_emp))
# Dot plot of the industry means, with industries ordered by their mean
ggplot(revenue_data, aes(x = mrev, y = reorder(Industry, mrev))) +
  geom_point(color = "red", size = 2) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(100, 800, 100)) +
  xlab("Revenue Per Employee (in $ thousands)") +
  ggtitle("Mean Revenue per Employee by Industry") +
  ylab("Industry")