Principles of Data Visualization and Introduction to ggplot2
The data provided contains the 5,000 fastest-growing companies in the US, as compiled by Inc. magazine. Let's read this in:
And let's preview this data:
# Per-industry medians for NY companies, restricted to rows of `inc` that have
# no missing values: median revenue per employee and median employee count.
my_stats <- inc %>%
  dplyr::filter(complete.cases(.)) %>%
  dplyr::filter(State == "NY") %>%
  mutate(
    Industry = as.character(Industry),
    RevPerEmp = Revenue / Employees
  ) %>%
  select(Industry, Revenue, Employees, RevPerEmp) %>%
  group_by(Industry) %>%
  summarise(
    MedRevPerEmp = median(RevPerEmp),
    MedEmpPerIndustry = median(Employees)
  ) %>%
  ungroup()

# Hard-coded reference values drawn as the "State Median" lines in the NY plots.
# NOTE(review): presumably derived from the NY subset by hand -- confirm they
# still match the data.
my_v_median <- 200000
my_v_medEmp <- 45
# Show the first three rows of inc.
head(inc, 3)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 117900000
## 2 2 FederalConference.com 248.31 49600000
## 3 3 The HCI Group 245.45 25500000
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
# Summarise every column except 1, 2, 7 and 8 (Rank, Name, City and State in
# the preview above), leaving Growth_Rate, Revenue, Industry and Employees.
summary(inc[,-c(1:2,7,8)])
## Growth_Rate Revenue Industry
## Min. : 0.340 Min. :2.000e+06 IT Services : 733
## 1st Qu.: 0.770 1st Qu.:5.100e+06 Business Products & Services: 482
## Median : 1.420 Median :1.090e+07 Advertising & Marketing : 471
## Mean : 4.612 Mean :4.822e+07 Health : 355
## 3rd Qu.: 3.290 3rd Qu.:2.860e+07 Software : 342
## Max. :421.480 Max. :1.010e+10 Financial Services : 260
## (Other) :2358
## Employees
## Min. : 1.0
## 1st Qu.: 25.0
## Median : 53.0
## Mean : 232.7
## 3rd Qu.: 132.0
## Max. :66803.0
## NA's :12
Below, the summary stats of 4 features of interest provide some background on the data included:
# some functions for reviewing my data and formatting
# Returns a one-row data.frame holding the mean of `x` in two columns:
#   y     - the mean, ignoring NAs
#   label - the same value
# Both columns now ignore NAs; previously `y` became NA whenever `x` contained
# a missing value while `label` did not, so the two disagreed.
# NOTE(review): the y/label shape looks intended for ggplot2's
# stat_summary(fun.data = ...) -- confirm against the caller.
fun_mean <- function(x) {
  avg <- mean(x, na.rm = TRUE)
  data.frame(y = avg, label = avg)
}
# Summarise missing values per column of a data frame.
#
# df: a data frame (or anything is.na() turns into a 2-d logical array with
#     column names).
#
# Returns a data.frame with one row per column of `df` that contains at least
# one NA, sorted by NA count (descending), with columns:
#   Variable   - column name (character)
#   NA_qty     - number of NA entries in that column
#   Pct_of_Tot - NA_qty as a percentage of nrow(df), rounded to 2 decimals
#
# Written in base R so it does not depend on which packages are attached
# (the unqualified `select` it used before is easily masked).
na_review <- function(df) {
  na_counts <- colSums(is.na(df))
  out <- data.frame(
    Variable = as.character(names(na_counts)),
    NA_qty = unname(na_counts),
    stringsAsFactors = FALSE
  )
  # Percentage is computed before filtering so zero-row results keep the column.
  out$Pct_of_Tot <- round(out$NA_qty / nrow(df), 4) * 100
  out <- out[out$NA_qty > 0, , drop = FALSE]
  # Ordering on the negated count keeps ties in their original column order.
  out <- out[order(-out$NA_qty), , drop = FALSE]
  rownames(out) <- NULL
  out
}
# List the columns of inc that contain missing values, with counts and percentages.
na_review(inc)
## Variable NA_qty Pct_of_Tot
## 1 Employees 12 0.24
# Descriptive statistics for every column of inc (n, mean, sd, median, ...).
psych::describe(inc)
## vars n mean sd median trimmed
## Rank 1 5001 2501.64 1443.51 2.502e+03 2501.73
## Name* 2 5001 2501.00 1443.81 2.501e+03 2501.00
## Growth_Rate 3 5001 4.61 14.12 1.420e+00 2.14
## Revenue 4 5001 48222535.49 240542281.14 1.090e+07 17334966.26
## Industry* 5 5001 12.10 7.33 1.300e+01 12.05
## Employees 6 4989 232.72 1353.13 5.300e+01 81.78
## City* 7 5001 732.00 441.12 7.610e+02 731.74
## State* 8 5001 24.80 15.64 2.300e+01 24.44
## mad min max range skew kurtosis
## Rank 1853.25 1.0e+00 5.0000e+03 4.9990e+03 0.00 -1.20
## Name* 1853.25 1.0e+00 5.0010e+03 5.0000e+03 0.00 -1.20
## Growth_Rate 1.22 3.4e-01 4.2148e+02 4.2114e+02 12.55 242.34
## Revenue 10674720.00 2.0e+06 1.0100e+10 1.0098e+10 22.17 722.66
## Industry* 8.90 1.0e+00 2.5000e+01 2.4000e+01 -0.10 -1.18
## Employees 53.37 1.0e+00 6.6803e+04 6.6802e+04 29.81 1268.67
## City* 604.90 1.0e+00 1.5190e+03 1.5180e+03 -0.04 -1.26
## State* 19.27 1.0e+00 5.2000e+01 5.1000e+01 0.12 -1.46
## se
## Rank 20.41
## Name* 20.42
## Growth_Rate 0.20
## Revenue 3401441.44
## Industry* 0.10
## Employees 19.16
## City* 6.24
## State* 0.22
A graph that shows the distribution of companies in the dataset by State (i.e., how many are in each state).
# note that I coded in such a way that each step is very explicit
# and not for streamlining purposes.
# Count the companies in each state and draw a horizontal bar chart with the
# states ordered by that count.
inc %>%
  dplyr::group_by(State) %>%
  dplyr::summarise(CoQty = n()) %>%
  arrange(CoQty) %>%
  ggplot(aes(x = reorder(State, CoQty), y = CoQty)) +
  geom_bar(stat = "identity", alpha = .8) +
  coord_flip() +
  # The chart covers every state, so the title no longer carries the "NY"
  # prefix used by the NY-only plots.
  labs(x = "", y = "",
       title = "Company Qty by State") +
  theme_minimal()
The state with the 3rd most companies in the data set is NY. Examining just this state, the plot below shows the median employment by industry for companies. Boxplots with outliers can get the job done.
#View(inc %>% dplyr::filter(State == "NY", complete.cases(.)) )
# Boxplots of employee counts per industry for NY companies (rows with any
# missing value are dropped), industries ordered by their median employee
# count, on a log10 scale. A reference line marks my_v_medEmp.
inc %>%
  dplyr::filter(State == "NY", complete.cases(.)) %>%
  select(Industry, Employees) %>%
  mutate(Industry = as.character(Industry)) %>%
  arrange(Industry) %>%
  ggplot(aes(x = reorder(Industry, Employees, median),
             y = Employees)) +
  geom_boxplot(outlier.shape = 21,
               outlier.alpha = .5,
               outlier.color = "black") +
  geom_hline(yintercept = my_v_medEmp,
             color = "darkblue", alpha = 0.5) +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 10000),
                labels = scales::comma) +
  coord_flip() +
  labs(x = "", title = "NY Employee Qty by Industry") +
  # x = 26.2 is a hard-coded position on the industry axis.
  # NOTE(review): presumably chosen to sit just past the last of 25
  # industries -- confirm it still fits if the set of industries changes.
  annotate("text", x = 26.2, y = my_v_medEmp + 5,
           label = sprintf("State Median: %s",
                           scales::comma(my_v_medEmp)),
           color = "darkblue",
           alpha = .5,
           size = 2.3, vjust = "inward", hjust = "inward") +
  # Applied once, at the end; the earlier duplicate call was redundant.
  theme_minimal()
A chart that shows the industries that generate the most revenue per employee. It is similar to the previous plot, now with an annotated median line that provides the viewer with a measure of the statewide median.
# Boxplots of revenue per employee by industry for NY companies (rows with any
# missing value are dropped), on a log10 dollar scale, with a reference line
# at my_v_median.
# NOTE(review): industries are ordered by mean RevPerEmp here, whereas the
# employee plot orders by median -- confirm this is intentional.
p <- inc %>%
  dplyr::filter(complete.cases(.)) %>%
  dplyr::filter(State == "NY") %>%
  mutate(
    Industry = as.character(Industry),
    RevPerEmp = Revenue / Employees
  ) %>%
  select(Industry, Revenue, Employees, RevPerEmp) %>%
  arrange(Industry) %>%
  ggplot(aes(x = reorder(Industry, RevPerEmp, mean), y = RevPerEmp)) +
  geom_boxplot(outlier.shape = 21, outlier.alpha = .5) +
  geom_hline(
    yintercept = my_v_median,
    color = "darkblue",
    alpha = 0.5
  ) +
  scale_y_log10(
    breaks = c(10000, 50000, 250000, 1500000, 10000000),
    labels = scales::dollar
  ) +
  coord_flip() +
  labs(x = "", y = "", title = "NY Revenue per Employees by Industry")

# Label the reference line; x = 26.2 is a hard-coded position on the industry
# axis and the label is nudged 15,000 past the line.
p +
  annotate(
    "text",
    x = 26.2,
    y = my_v_median + 15000,
    label = sprintf("State Median: $%s", scales::comma(my_v_median)),
    color = "darkblue",
    alpha = .5,
    size = 2.3,
    vjust = "inward",
    hjust = "inward"
  ) +
  #annotation_logticks() +
  theme_minimal()