# Remote location of the inc5000 CSV used throughout this script.
Url <- "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA608/master/lecture1/Data/inc5000_data.csv"

# Download and parse the CSV; text columns stay character vectors (no factors).
growthBiz5000 <- read.csv(file = Url, stringsAsFactors = FALSE)

# Data-frame copy that the rest of the analysis works from.
df_growthBiz5000 <- data.frame(growthBiz5000)
Create a graph that shows the distribution of companies in the dataset by state (i.e., how many are in each state). There are a lot of states, so consider which axis you should use, assuming I am using a ‘portrait’-oriented screen (i.e., taller than wide).
# One row per State with the number of records (companies) in it, stored in
# column `n` and sorted from the most to the fewest companies.
df_compFreqByState <-
  df_growthBiz5000 %>%
  group_by(State) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# Turn State into a factor whose level order is the sorted row order, so that
# ggplot draws the bars by company count instead of alphabetically.
df_compFreqByState[["State"]] <- factor(
  df_compFreqByState[["State"]],
  levels = unique(as.character(df_compFreqByState[["State"]]))
)
# Figure 1: horizontal bar chart of the number of companies per state.
#
# Fixes relative to the previous version:
#   * removed the stray `filled.contour()` call inside aes(); it is a base
#     graphics function, not an aesthetic mapping;
#   * removed scale_fill_brewer(): nothing is mapped to `fill`, so it had no
#     effect on the plot;
#   * the y-axis label no longer claims "hundreds" -- `n` is the raw count;
#   * ggsave() is given the plot explicitly and the output folder is created
#     when missing, so saving does not depend on the last printed plot or on
#     a pre-existing directory.
g <- ggplot(data = df_compFreqByState, aes(x = State, y = n))
g <- g + geom_bar(stat = "identity", position = "stack")
# Print each count next to its bar (to the right of it after coord_flip()).
g <- g + geom_text(aes(label = n), hjust = -1, size = 2)
g <- g + theme(
  axis.text = element_text(size = 6),
  axis.title = element_text(size = 10, face = "bold")
)
g <- g + labs(
  title = "Distribution of Companies by State",
  x = "States",
  y = "Number of Companies"
)
# States go on the vertical axis: there are many of them and the target
# screen is taller than wide.
g <- g + coord_flip()
g

# Save the figure as a 480 x 640 px PNG under <working dir>/lecture/.
tgtdir <- file.path(getwd(), "lecture", "Figure1.png")
dir.create(dirname(tgtdir), showWarnings = FALSE, recursive = TRUE)
ggsave(tgtdir, plot = g, width = 4.8, height = 6.4, dpi = 100)
Let’s dig in on the state with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries. Create a plot of average employment by industry for companies in this state (only use cases with full data; use R’s complete.cases() function). Your graph should show how variable the ranges are, and exclude outliers.
# Identify the state with the 3rd-largest number of companies.
#
# The previous version grouped by (State, Employees) when computing `rank3`,
# which produced one `rank3rd` value per state; `rank == rank3$rank3rd` was
# then an elementwise comparison between two unrelated vectors rather than a
# lookup of a single rank. Both objects are now derived from the same
# per-state ranking, so `rank3` is a single value and `rec` a single row.

# Company count per state, largest first, with an ascending rank on the count.
# ties.method = "first" gives every state a distinct rank, so the largest
# state holds the maximum rank.
state_ranks <-
  df_growthBiz5000 %>%                             # full dataset
  group_by(State) %>%
  summarise(n = n()) %>%                           # record count per state
  arrange(desc(n)) %>%                             # in descending order
  mutate(rank = rank(n, ties.method = "first"))    # add rank column

# Rank value held by the 3rd-largest state (maximum rank minus two).
rank3 <-
  state_ranks %>%
  summarise(rank3rd = max(rank) - 2)

# The single row (State, n, rank) for that state.
rec <-
  state_ranks %>%
  filter(rank == rank3$rank3rd)

# Fail loudly if the data holds fewer than three states.
stopifnot(nrow(rec) == 1L)
# Keep only the rows that have no missing value in any column.
df_growthBiz5000CompletedCases <- filter(
  df_growthBiz5000,
  complete.cases(df_growthBiz5000)
)
# Average number of employees per industry for the state(s) listed in `rec`,
# computed on complete cases only and sorted from largest to smallest average.
#
# `%in%` replaces `==` so the state filter behaves as a membership test and
# does not silently recycle (or error) when `rec` is not exactly one row.
df_NYIndustriesEmpAvgs <-
  df_growthBiz5000CompletedCases %>%        # complete cases only
  filter(State %in% rec$State) %>%          # keep the selected state
  group_by(Industry) %>%
  summarise(EmpAvg = mean(Employees)) %>%   # mean employees per industry
  arrange(desc(EmpAvg))                     # in descending order

# Whole-number averages for plotting and labelling. Round before converting:
# as.integer() alone truncates, which would show e.g. 99.7 as 99.
df_NYIndustriesEmpAvgs$EmpAvg <- as.integer(round(df_NYIndustriesEmpAvgs$EmpAvg))

# Fix the factor level order to the sorted row order so the plot keeps it.
df_NYIndustriesEmpAvgs$Industry <- factor(
  df_NYIndustriesEmpAvgs$Industry,
  levels = unique(as.character(df_NYIndustriesEmpAvgs$Industry))
)
# Figure 2: horizontal bar chart of average employees per industry.
#
# Fixes relative to the previous version:
#   * removed the stray `filled.contour()` call inside aes(); it is a base
#     graphics function, not an aesthetic mapping;
#   * removed scale_fill_brewer(): nothing is mapped to `fill`, so it had no
#     effect on the plot;
#   * ggsave() is given the plot explicitly and the output folder is created
#     when missing.
# NOTE(review): the exercise text asks for a plot that shows variability and
# excludes outliers; a bar chart of means shows neither -- confirm whether a
# box plot is wanted here.
g <- ggplot(data = df_NYIndustriesEmpAvgs, aes(x = Industry, y = EmpAvg))
g <- g + geom_bar(stat = "identity", position = "stack")
# Print each average next to its bar (to the right of it after coord_flip()).
g <- g + geom_text(aes(label = EmpAvg), hjust = -1, size = 3)
g <- g + theme(
  axis.text = element_text(size = 9),
  axis.title = element_text(size = 10, face = "bold")
)
g <- g + labs(
  title = "Employment by Industry in NY",
  x = "Industry",
  y = "Average Employees"
)
# Industries go on the vertical axis so their names stay readable.
g <- g + coord_flip()
g

# Save the figure as a 480 x 640 px PNG under <working dir>/lecture/.
tgtdir <- file.path(getwd(), "lecture", "Figure2.png")
dir.create(dirname(tgtdir), showWarnings = FALSE, recursive = TRUE)
ggsave(tgtdir, plot = g, width = 4.8, height = 6.4, dpi = 100)
Question 3: Most Revenue per Employee by Industry. Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear.
# Total revenue per industry over the complete-case rows, largest first.
# NOTE(review): the exercise text asks for revenue per employee; this table
# holds total revenue only -- confirm the per-employee figure is computed
# elsewhere.
df_IndustriesByTtlRev <- arrange(
  summarise(
    group_by(df_growthBiz5000CompletedCases, Industry),
    TtlRev = sum(Revenue)
  ),
  desc(TtlRev)
)

# Fix the factor level order to the sorted row order so the plot keeps it.
df_IndustriesByTtlRev[["Industry"]] <- factor(
  df_IndustriesByTtlRev[["Industry"]],
  levels = unique(as.character(df_IndustriesByTtlRev[["Industry"]]))
)
# Figure 3: horizontal bar chart of total revenue per industry, in $ billion.
#
# Fixes relative to the previous version:
#   * revenue was divided by 1e10 while the axis is labelled "$ billion";
#     the divisor is now 1e9 so values and label agree;
#   * bar labels are rounded to one decimal instead of printing the full
#     floating-point quotient;
#   * ggsave() is given the plot explicitly and the output folder is created
#     when missing.
g <- ggplot(data = df_IndustriesByTtlRev, aes(x = Industry, y = TtlRev / 1e9))
g <- g + geom_bar(stat = "identity")
# Print each total next to its bar (to the right of it after coord_flip()).
g <- g + geom_text(aes(label = round(TtlRev / 1e9, 1)), hjust = -1, size = 2.5)
g <- g + theme(
  axis.text = element_text(size = 8),
  axis.title = element_text(size = 10, face = "bold")
)
g <- g + labs(
  title = "Total Revenue by Industry",
  x = "Industry",
  y = "Revenue (in $ billion)"
)
# Industries go on the vertical axis so their names stay readable.
g <- g + coord_flip()
g

# Save the figure as a 480 x 640 px PNG under <working dir>/lecture/.
tgtdir <- file.path(getwd(), "lecture", "Figure3.png")
dir.create(dirname(tgtdir), showWarnings = FALSE, recursive = TRUE)
ggsave(tgtdir, plot = g, width = 4.8, height = 6.4, dpi = 100)