Load Libraries

# Install each required package when it is missing, then attach it.
# The install check and the attach are separate steps on purpose: a package
# that was just installed still has to be attached with library() before the
# rest of the script can use `%>%`, `count()`, `ggplot()`, etc.
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")

# library() stops with an error if a package is still unavailable, instead of
# returning FALSE and letting the script fail later.
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))

Load Data

# Read the inc5000 CSV straight from GitHub into `data`. The code below reads
# its State, Industry, Employees and Revenue columns.
data <- read.csv(
  file = "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA608/master/lecture1/Data/inc5000_data.csv"
)

Graph 1

Create a graph that shows the distribution of companies in the dataset by State (i.e., how many are in each state). There are a lot of States, so consider which axis you should use, assuming I am using a ‘portrait’-oriented screen (i.e., taller than wide).

# Graph 1: number of companies per state, drawn as horizontal bars ordered by
# count (coord_flip() puts the states on the vertical axis).
data %>%
  dplyr::count(State) %>%
  mutate(State = factor(State, State)) %>%
  ggplot(aes(x = reorder(State, n), y = n)) +
  geom_bar(
    stat = "identity",
    width = 0.5,
    position = position_dodge(width = 0.5)
  ) +
  coord_flip() +
  labs(
    title = "Fastest Growing 5000 Companies: Distribution By State",
    x = "State",
    y = "Numbers"
  )

# No plot argument is given, so ggsave() writes the most recent plot.
ggsave("Figure1.png")
## Saving 7 x 5 in image

Graph 2

Let’s dig in on the State with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries. Create a plot of average employment by industry for companies in this state (only use cases with full data; use R’s complete.cases() function). Your graph should show how variable the ranges are, and exclude outliers.

# Graph 2: employment per industry for companies in New York.
#
# Drop rows with any missing value first, THEN keep the NY rows. The NY filter
# must be applied to the complete-case frame (not to the raw `data`), otherwise
# the complete.cases() step is thrown away.
dNY <- data[complete.cases(data), ]
dNY <- subset(dNY, State == "NY")

# Collapse to one row per industry: mean (m), largest (max) and smallest (min)
# number of employees.
dNY <- group_by(dNY, Industry) %>%
  summarize(m = mean(Employees), max = max(Employees), min = min(Employees)) %>%
  na.omit()

# Per-industry bounds, kept as standalone vectors as before.
upper <- dNY$max
lower <- dNY$min

# stat = "identity" tells geom_boxplot() to draw the summary columns supplied
# here. With the default stat the box statistics are recomputed from `y`, and
# with a single mean per industry that yields a flat line with no visible range.
# The box spans min..max and the centre line marks the mean.
# NOTE(review): min/max still include extreme companies. If the drawn range
# must exclude outliers, filter them (e.g. 1.5 * IQR rule per industry) before
# the summarize() step -- confirm the intended definition of "outlier".
ggplot(
  dNY,
  aes(
    x = Industry,
    ymin = min,
    lower = min,
    middle = m,
    upper = max,
    ymax = max
  )
) +
  geom_boxplot(stat = "identity") +
  coord_flip() +
  labs(title = "Number of Employees By Industry in NY State", y = "Mean")

# No plot argument is given, so ggsave() writes the most recent plot.
ggsave("Figure2.png")
## Saving 7 x 5 in image

Graph 3

Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear.

# Graph 3: revenue per employee by industry, using only rows with no missing
# values.
data_rev <- subset(data, complete.cases(data))

# Total revenue divided by total head count per industry, drawn as horizontal
# bars ordered by that ratio.
data_rev %>%
  group_by(Industry) %>%
  summarise(
    Total_Revenue = sum(Revenue),
    Total_employee = sum(Employees)
  ) %>%
  mutate(RevPerEmp = Total_Revenue / Total_employee) %>%
  ggplot(aes(x = reorder(Industry, RevPerEmp), y = RevPerEmp)) +
  geom_bar(stat = "identity", width = .5) +
  coord_flip() +
  labs(
    title = "Revenue Per Employee by Industry",
    x = "Industry",
    y = "Revenue Per Employee"
  )

# No plot argument is given, so ggsave() writes the most recent plot.
ggsave("Figure3.png")
## Saving 7 x 5 in image