Load data
# Location of the Inc. 5000 CSV on GitHub (raw file view).
Url <- paste0(
  "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA608/",
  "master/lecture1/Data/inc5000_data.csv"
)

# Download and parse the CSV; text columns stay character, not factor.
growthBiz5000 <- read.csv(file = Url, stringsAsFactors = FALSE)

# Working copy used by all of the analysis steps below.
df_growthBiz5000 <- data.frame(growthBiz5000)
Question 1: Distribution of Companies by State

Create a graph that shows the distribution of companies in the dataset by State (ie how many are in each state). There are a lot of States, so consider which axis you should use assuming I am using a ‘portrait’ oriented screen (ie taller than wide).

# Number of companies per state, largest first. State is then turned into a
# factor whose level order is that same row order, so plots keep the sorting
# instead of falling back to alphabetical order.
df_compFreqByState <- df_growthBiz5000 %>%
  group_by(State) %>%
  tally() %>%
  arrange(desc(n)) %>%
  mutate(State = factor(State, levels = unique(as.character(State))))
Plot
# Bar chart of the number of companies per state.
# State is already a factor (levels set when df_compFreqByState was built), so
# it is mapped directly. The stray filled.contour() call that used to sit
# inside aes() is removed: it is a base-graphics function, not an aesthetic.
# No fill aesthetic is mapped, so no fill scale is added either.
g <- ggplot(data = df_compFreqByState, aes(x = State, y = n)) +
  geom_bar(stat = "identity", position = "stack") +
  # Print each count next to its bar.
  geom_text(aes(label = n), hjust = -1, size = 2) +
  theme(
    axis.text = element_text(size = 6),
    axis.title = element_text(size = 10, face = "bold")
  ) +
  # y shows raw counts (not units of hundreds), so label it as such.
  labs(
    title = "Distribution of Companies by State",
    x = "States",
    y = "Number of Companies"
  ) +
  # Flip so the many state names run down the long side of a portrait screen.
  coord_flip()
g

Save Figure1.png to lecture directory
# Full path of the output image: <working directory>/lecture/Figure1.png.
tgtdir <- file.path(getwd(), "lecture", "Figure1.png")
# Make sure the target folder exists so the save cannot fail on a missing
# directory; this is a no-op when the folder is already there.
dir.create(dirname(tgtdir), showWarnings = FALSE, recursive = TRUE)
# Save the plot held in `g` explicitly rather than relying on whichever plot
# ggplot2 last recorded.
ggsave(tgtdir, plot = g, width = 4.8, height = 6.4, dpi = 100)
Question 2: Average Employee Count by Industry for the State with the 3rd Most Companies.

Let’s dig in on the State with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people companies in different industries employ. Create a plot of average employment by industry for companies in this state (only use cases with full data; use R’s complete.cases() function). Your graph should show how variable the ranges are, and exclude outliers.

# Find the state with the third-largest number of companies.
#
# Companies are counted per State only. Grouping by Employees as well left the
# data grouped by State after the first summarize(), so the "third greatest
# rank" came out as one unrelated value per state instead of a single number.

# One row per state: company count `n` and its ascending rank (ties are broken
# by row order, so every state gets a distinct rank).
df_stateCounts <-
  df_growthBiz5000 %>%
  group_by(State) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  mutate(rank = rank(n, ties.method = "first"))

# Ranks ascend with n, so the third-largest count sits two below the maximum.
rank3 <-
  df_stateCounts %>%
  summarize(rank3rd = max(rank) - 2)

# The single state holding that rank (columns: State, n, rank).
rec <-
  df_stateCounts %>%
  filter(rank == rank3$rank3rd)

# Later steps compare against rec$State; fail early if it is not one state
# (for example when the data holds fewer than three states).
stopifnot(nrow(rec) == 1L)

# Keep only the rows in which every column has a value (no NA anywhere).
df_growthBiz5000CompletedCases <- filter(
  df_growthBiz5000,
  complete.cases(df_growthBiz5000)
)

# Mean number of employees per industry for the state selected in `rec`,
# computed from complete rows only and sorted from largest to smallest.
df_NYIndustriesEmpAvgs <-
  df_growthBiz5000CompletedCases %>%
  # %in% keeps this filter valid even if rec holds zero or several states;
  # `==` would fail or recycle in those cases.
  filter(State %in% rec$State) %>%
  group_by(Industry) %>%
  summarize(EmpAvg = mean(Employees)) %>% # per-industry mean, not a sum
  arrange(desc(EmpAvg))

# Truncate the means to whole numbers; they are used as bar labels.
df_NYIndustriesEmpAvgs$EmpAvg <- as.integer(df_NYIndustriesEmpAvgs$EmpAvg)

# Fix the factor levels to the current row order so plots keep the sorting.
df_NYIndustriesEmpAvgs$Industry <- factor(
  df_NYIndustriesEmpAvgs$Industry,
  levels = unique(as.character(df_NYIndustriesEmpAvgs$Industry))
)
Plot
# Bar chart of average employees per industry.
# Industry is already a factor (levels set when df_NYIndustriesEmpAvgs was
# built), so it is mapped directly. The stray filled.contour() call that used
# to sit inside aes() is removed: it is a base-graphics function, not an
# aesthetic. No fill aesthetic is mapped, so no fill scale is added either.
g <- ggplot(data = df_NYIndustriesEmpAvgs, aes(x = Industry, y = EmpAvg)) +
  geom_bar(stat = "identity", position = "stack") +
  # Print each average next to its bar.
  geom_text(aes(label = EmpAvg), hjust = -1, size = 3) +
  theme(
    axis.text = element_text(size = 9),
    axis.title = element_text(size = 10, face = "bold")
  ) +
  labs(
    title = "Employment by Industry in NY",
    x = "Industry",
    y = "Average Employees"
  ) +
  # Flip so the industry names are readable along the vertical axis.
  coord_flip()
g

Save Figure2.png to lecture directory
# Full path of the output image: <working directory>/lecture/Figure2.png.
tgtdir <- file.path(getwd(), "lecture", "Figure2.png")
# Make sure the target folder exists so the save cannot fail on a missing
# directory; this is a no-op when the folder is already there.
dir.create(dirname(tgtdir), showWarnings = FALSE, recursive = TRUE)
# Save the plot held in `g` explicitly rather than relying on whichever plot
# ggplot2 last recorded.
ggsave(tgtdir, plot = g, width = 4.8, height = 6.4, dpi = 100)

Question 3: Most Revenue per Employee by Industry. Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear.

# Total revenue per industry over the complete rows, largest first. Industry
# is then turned into a factor whose level order is that same row order, so
# plots keep the sorting.
df_IndustriesByTtlRev <- df_growthBiz5000CompletedCases %>%
  group_by(Industry) %>%
  summarize(TtlRev = sum(Revenue)) %>%
  arrange(desc(TtlRev)) %>%
  mutate(Industry = factor(Industry, levels = unique(as.character(Industry))))
Plot
# Bar chart of total revenue per industry, shown in billions.
# The divisor is 1e9 so the values match the "$ billion" axis label; the
# previous divisor (1e10) displayed tens of billions under that label.
g <- ggplot(data = df_IndustriesByTtlRev, aes(x = Industry, y = TtlRev / 1e9)) +
  geom_bar(stat = "identity") +
  # Round the printed values to one decimal so the labels stay readable.
  geom_text(aes(label = round(TtlRev / 1e9, 1)), hjust = -1, size = 2.5) +
  theme(
    axis.text = element_text(size = 8),
    axis.title = element_text(size = 10, face = "bold")
  ) +
  labs(
    title = "Total Revenue by Industry",
    x = "Industry",
    y = "Revenue (in $ billion)"
  ) +
  # Flip so the industry names are readable along the vertical axis.
  coord_flip()
g

Save Figure3.png to lecture directory.
# Full path of the output image: <working directory>/lecture/Figure3.png.
tgtdir <- file.path(getwd(), "lecture", "Figure3.png")
# Make sure the target folder exists so the save cannot fail on a missing
# directory; this is a no-op when the folder is already there.
dir.create(dirname(tgtdir), showWarnings = FALSE, recursive = TRUE)
# Save the plot held in `g` explicitly rather than relying on whichever plot
# ggplot2 last recorded.
ggsave(tgtdir, plot = g, width = 4.8, height = 6.4, dpi = 100)