suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tidyr)))
suppressWarnings(suppressMessages(library(DT)))
The data is about 5,000 fastest growing companies in the US, as compiled by Inc. magazine. Avalible here:https://github.com/charleyferrari/CUNY_DATA608/tree/master/lecture1/Data.
db <- read.csv("https://raw.githubusercontent.com/charleyferrari/CUNY_DATA608/master/lecture1/Data/inc5000_data.csv")
datatable(head(db, n = nrow(db)), options = list(pageLength = 5))
Create a graph that shows the distribution of companies in the dataset by State (ie how many are in each state). There are a lot of States, so consider which axis you should use assuming I am using a ‘portrait’ oriented screen (ie taller than wide).
#count the number of campanies for each state
dstbt<- db %>%
group_by(State) %>%
summarize(cmp_count=n())
## Warning in grouped_df_impl(data, unname(vars), drop): '.Random.seed' is not
## an integer vector but of type 'NULL', so ignored
print(paste("There are", nrow(dstbt), "states in the data."))
## [1] "There are 52 states in the data."
print(paste("The highest number of companies is", max(dstbt$cmp_count),"."))
## [1] "The highest number of companies is 701 ."
print(paste("The lowest number of companies is", min(dstbt$cmp_count),"."))
## [1] "The lowest number of companies is 1 ."
dstbt$cmp_count<- as.numeric(dstbt$cmp_count)
ggplot(dstbt,aes(x=State,y=cmp_count))+
geom_bar(stat = "identity")+
coord_flip()+
theme_classic()+
labs(title="Distribution of Companies", x="State",y="Number of Companies")
The top three states that have the most companies are California, Texus and New York.
Let’s dig in on the State with the 3 rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries employ. Create a plot of average employment by industry for companies in this state (only use cases with full data (user R’s complete.cases() function). Your graph should show how variable the ranges are, and exclude outliers.
db$State <- as.character(db$State)
# check if there is missing data
any(is.na(db$Employees))
## [1] TRUE
# list rows of data that have completed values
complete_db <- db[complete.cases(db),]
# find the index for the row(s) that the State(s) have the 3rd most camponies
third_num <- sort(dstbt$cmp_count,decreasing=T)[3]
#find the State with the 3rd most companies
third_st <- dstbt[which(dstbt$cmp_count == third_num),1]
print(paste("the State with the 3rd most companies is ",third_st$State))
## [1] "the State with the 3rd most companies is NY"
# extract the data for the State
db$State <- as.character(db$State)
third_st$State <- as.character(third_st$State)
third_st_db <- complete_db[which(complete_db$State == third_st$State),]
# highlight the outliers
temp <- third_st_db %>%
group_by(Industry) %>%
mutate(iqr=IQR(Employees),q3=quantile(Employees)["75%"],q1=quantile(Employees)["25%"]) %>%
mutate(upper_lim=q3+1.5*iqr,lower_lim=q1-1.5*iqr)
outlier<- temp[which(temp$Employees>temp$upper_lim | temp$Employees<temp$lower_lim),]
ggplot(temp,aes(x=Industry,y=Employees))+
geom_boxplot()+
coord_flip()+
geom_point(data=outlier, aes(x=Industry, y=Employees),color="red",pch = 1,size = 1.7)+
theme_classic()
# remove outliers
clean_data <- temp[which(temp$Employees<=temp$upper_lim & temp$Employees>=temp$lower_lim),]
# plot after removing the outliers
ggplot(clean_data,aes(x=Industry,y=Employees))+
geom_boxplot()+
coord_flip()+
theme_classic()
# calculate the average employment for that State
avg_emp <- clean_data %>%
group_by(Industry) %>%
summarise(emp_avg=round(mean(Employees),1))
# distribution of average employment for that State
ggplot(avg_emp,aes(x=Industry,y=emp_avg))+
geom_bar(stat = "identity")+
coord_flip()+
theme_classic()+
labs(title="Employees", x="Industry",y="Average employees")
As shown in the first box plot, the red dots represent the outliers that have employees more than Q3+1.5xIQR or less than Q1-1.5xIQR. After removing the outliers, the box plot is not squeezed to the bottom and the distribution of employee numbers for each industry could be seen clearly.
In New York, human rescources has the most employees, around 280 employees per company, and the travel and hospitality comes as second one. Then environmenatal services and business products & servecies are the third and fourth with most employees, which are about 150 employees per company.
Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart makes this information clear.
If removing those outliers has revenue higher than Q3+1.5IQR or lower than Q1 -1.5IQR
# highlight the outliers for the revenue
temp_rev <- clean_data %>%
group_by(Industry) %>%
mutate(iqr_r=IQR(Revenue),q3_r=quantile(Revenue)["75%"],q1_r=quantile(Revenue)["25%"]) %>%
mutate(upper_lim_r=q3_r+1.5*iqr_r,lower_lim_r=q1_r-1.5*iqr_r)
outlier_rev<- temp_rev[which(temp_rev$Revenue>temp_rev$upper_lim_r | temp_rev$Revenue<temp_rev$lower_lim_r),]
ggplot(temp_rev,aes(x=Industry,y=Revenue))+
geom_boxplot()+
coord_flip()+
geom_point(data=outlier_rev, aes(x=Industry, y=Revenue),color="Green",pch = 1,size = 1.7)+
theme_classic()
# remove outliers
clean_data_rev <- temp_rev[which(temp_rev$Revenue<=temp_rev$upper_lim_r & temp_rev$Revenue>=temp_rev$lower_lim_r),]
# plot after removing the outliers
ggplot(clean_data_rev,aes(x=Industry,y=Revenue))+
geom_boxplot()+
coord_flip()+
theme_classic()
Green dots are outliers.
# calculate the average revenue per employee when not removing outliers for revenue data
rev_emp <- clean_data %>%
group_by(Industry) %>%
summarise(rev_sum=sum(Revenue),emp_sum=sum(Employees)) %>% mutate(avg_rev_emp=rev_sum/emp_sum)
# calculate the average revenue per employee when removing outliers for revenue data
rev_emp_2 <- clean_data_rev %>%
group_by(Industry) %>%
summarise(rev_sum=sum(Revenue),emp_sum=sum(Employees)) %>% mutate(avg_rev_emp=rev_sum/emp_sum)
ggplot(rev_emp,aes(x=Industry,y=avg_rev_emp))+
geom_bar(stat = "identity")+
coord_flip()+
theme_classic()+
labs(title="Revenue/Employee 1", x="Industry",y="Average revenue per employee")
ggplot(rev_emp_2,aes(x=Industry,y=avg_rev_emp))+
geom_bar(stat = "identity")+
coord_flip()+
theme_classic()+
labs(title="Revenue/Employee 2", x="Industry",y="Average revenue per employee")
Comparing the two figures before and after removing “outliers” in revenue, we can see the patterns changed. For example, the revenue per employee of telecommunication, retail, health, consumer product and services, financial services, and business product and services decreased a lot. But the the industry generate the most revenue per empoyee is the same: logistics & transportation.