I have provided you with data about the 5,000 fastest-growing companies in the US, as compiled by Inc. magazine. Please use the data available at:
https://github.com/charleyferrari/CUNY_DATA608/tree/master/lecture1/Data
● This assignment must be done in a single R script with ggplot2. Use of dplyr or plyr is encouraged but not required.
● Images for question ‘x’ must be saved to the lecture1 directory as ‘FigureX.jpg’ or ‘FigureX.png’ (i.e., for Question 1, Figure1.jpg, etc.).
library(ggplot2)
library(dplyr)
library(Hmisc)
# Load the Inc. 5000 data set straight from the course repository.
data <- read.csv(
  "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA608/master/lecture1/Data/inc5000_data.csv",
  header = TRUE
)

# Count companies per state (largest count first) and tag each state with the
# quartile group its count falls into, so the bars can be filled by group.
# cut2() (Hmisc) builds left-closed intervals and closes the last one on the
# right, so both the smallest and the largest counts are binned without any
# extra "include lowest" option.
# https://stackoverflow.com/questions/11728419/using-cut-and-quartile-to-generate-breaks-in-r-function
stCnt <- data %>%
  count(State) %>%
  arrange(desc(n)) %>%
  mutate(quant = cut2(n, quantile(n)))
# Figure 1: horizontal bar chart of the number of companies per state.
# Bars are ordered by count and filled by the quartile group of that count.
ggplot(data = stCnt, mapping = aes(x = reorder(State, n), y = n)) +
  geom_col(mapping = aes(fill = quant), colour = "black") +
  scale_fill_discrete(name = "Quantile Groups") +
  ggtitle("States with the Fastest Growing Companies") +
  labs(x = NULL, y = NULL) +
  theme(legend.position = "bottom") +
  # Flip the axes so the state labels read horizontally.
  coord_flip()
# Which state has the third-most companies? stCnt is sorted by descending
# count, so it is the third row. Sample output from a previous run follows.
stCnt[["State"]][3]
## [1] NY
## 52 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA ... WY
# Average head count per industry for New York, the state with the third-most
# companies in the data.
# Outlier handling adapted from:
# https://stackoverflow.com/questions/28687515/search-for-and-remove-outliers-from-a-dataframe-grouped-by-a-variable
ny <- data %>%
  # Keep only rows with no missing values (`.` is the piped-in data frame).
  filter(complete.cases(.)) %>%
  filter(State == "NY") %>%
  # Drop companies whose head count is more than 2 standard deviations from
  # the mean over all NY companies (computed before grouping by industry).
  filter(abs(Employees - mean(Employees)) <= 2 * sd(Employees)) %>%
  group_by(Industry) %>%
  # Per-industry mean and standard error of the mean. sd() of a single value
  # is NA, so an industry with only one company gets se = NA.
  summarise(
    mean = mean(Employees),
    n = n(),
    se = sd(Employees) / sqrt(n)
  )
# All complete NY rows, largest head counts first, for eyeballing which
# companies a 2-standard-deviation outlier rule would remove.
test <- data %>%
  # Flag rows with no missing values (`.` is the piped-in data frame) and keep
  # only those; the logical flag is used directly rather than compared to text.
  mutate(cases = complete.cases(.)) %>%
  filter(cases) %>%
  filter(State == "NY") %>%
  arrange(desc(Employees))
# Figure 2: mean number of employees per company for each industry in `ny`,
# drawn as horizontal bars ordered by the mean.
ggplot(ny, aes(x = reorder(Industry, mean), y = mean)) +
  geom_col(aes(fill = Industry), color = "black") +
  # Whiskers span one standard error either side of the mean.
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.6) +
  ggtitle("Average # of Employees per Company by Industry") +
  labs(y = NULL, x = NULL) +
  # The fill legend only repeats the axis labels, so hide it. "none" is the
  # supported spelling; passing FALSE to guides() is deprecated in ggplot2.
  guides(fill = "none") +
  coord_flip()
# Revenue per employee by industry across all states.
rev <- data %>%
  # Keep only rows with no missing values (`.` is the piped-in data frame).
  filter(complete.cases(.)) %>%
  # Guard the division below: a single zero-employee row would make rev_emp
  # infinite and cause the outlier filter to discard every row.
  filter(Employees > 0) %>%
  mutate(rev_emp = Revenue / Employees) %>%
  # Drop companies whose revenue per employee is more than 2 standard
  # deviations from the overall mean.
  filter(abs(rev_emp - mean(rev_emp)) <= 2 * sd(rev_emp)) %>%
  group_by(Industry) %>%
  # NOTE(review): Revenue_Employee is total revenue over total employees for
  # the industry, while se is the standard error of the per-company ratios;
  # the two describe slightly different quantities -- confirm this is intended.
  summarise(
    Revenue_Employee = sum(Revenue) / sum(Employees),
    n = n(),
    se = sd(rev_emp) / sqrt(n)
  )
# Figure 3: revenue per employee for each industry in `rev`, drawn as
# horizontal bars ordered by value.
ggplot(rev, aes(x = reorder(Industry, Revenue_Employee), y = Revenue_Employee)) +
  geom_col(aes(fill = Industry), color = "black") +
  # Whiskers span one standard error either side of the bar height.
  geom_errorbar(
    aes(ymin = Revenue_Employee - se, ymax = Revenue_Employee + se),
    width = 0.6
  ) +
  ggtitle("Average Revenue per Employee by Industry") +
  labs(y = NULL, x = NULL) +
  # The fill legend only repeats the axis labels, so hide it. "none" is the
  # supported spelling; passing FALSE to guides() is deprecated in ggplot2.
  guides(fill = "none") +
  # Show revenue figures with thousands separators instead of scientific notation.
  scale_y_continuous(labels = scales::comma) +
  coord_flip()