Principles of Data Visualization and Introduction to ggplot2
I have provided you with data about the 5,000 fastest growing companies in the US, as compiled by Inc. magazine. lets read this in:
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(ggplot2)
library(outliers)
inc <- read.csv("https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv", header= TRUE)
And lets preview this data:
head(inc)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
summary(inc)
## Rank Name Growth_Rate
## Min. : 1 (Add)ventures : 1 Min. : 0.340
## 1st Qu.:1252 @Properties : 1 1st Qu.: 0.770
## Median :2502 1-Stop Translation USA: 1 Median : 1.420
## Mean :2502 110 Consulting : 1 Mean : 4.612
## 3rd Qu.:3751 11thStreetCoffee.com : 1 3rd Qu.: 3.290
## Max. :5000 123 Exteriors : 1 Max. :421.480
## (Other) :4995
## Revenue Industry Employees
## Min. :2.000e+06 IT Services : 733 Min. : 1.0
## 1st Qu.:5.100e+06 Business Products & Services: 482 1st Qu.: 25.0
## Median :1.090e+07 Advertising & Marketing : 471 Median : 53.0
## Mean :4.822e+07 Health : 355 Mean : 232.7
## 3rd Qu.:2.860e+07 Software : 342 3rd Qu.: 132.0
## Max. :1.010e+10 Financial Services : 260 Max. :66803.0
## (Other) :2358 NA's :12
## City State
## New York : 160 CA : 701
## Chicago : 90 TX : 387
## Austin : 88 NY : 311
## Houston : 76 VA : 283
## San Francisco: 75 FL : 282
## Atlanta : 74 IL : 273
## (Other) :4438 (Other):2764
Think a bit on what these summaries mean. Use the space below to add some more relevant non-visual exploratory information you think helps you understand this data:
Let’s find some outliers (notably Growth, employees and Revenue; maximum values are way too high) in our data. Not only that, let’s see data types for each column and remove scientific notations in summary.
# Insert your code here, create more chunks as necessary
#Outliers
max_growth <- subset(inc, Growth_Rate == max(inc['Growth_Rate']))
max_growth
## Rank Name Growth_Rate Revenue Industry Employees
## 1 1 Fuhu 421.48 117900000 Consumer Products & Services 104
## City State
## 1 El Segundo CA
max_rev <- subset(inc, Revenue == max(inc['Revenue']))
max_rev
## Rank Name Growth_Rate Revenue Industry Employees
## 4788 4788 CDW 0.41 1.01e+10 Computer Hardware 6800
## City State
## 4788 Vernon Hills IL
max_employee <- subset(inc, inc['Employees'] == 66803)
max_employee
## Rank Name Growth_Rate Revenue
## 2344 2345 Integrity staffing Solutions 1.55 278200000
## Industry Employees City State
## 2344 Human Resources 66803 Wilmington DE
#Let's see how data looks like without outliers.
inc_no_outlier <- subset(inc, !(Rank %in% c(max_growth['Rank'], max_rev['Rank'], max_employee['Rank'])) )
summary(inc_no_outlier)
## Rank Name Growth_Rate
## Min. : 2 (Add)ventures : 1 Min. : 0.34
## 1st Qu.:1252 @Properties : 1 1st Qu.: 0.77
## Median :2502 1-Stop Translation USA: 1 Median : 1.42
## Mean :2502 110 Consulting : 1 Mean : 4.53
## 3rd Qu.:3751 11thStreetCoffee.com : 1 3rd Qu.: 3.29
## Max. :5000 123 Exteriors : 1 Max. :248.31
## (Other) :4992
## Revenue Industry Employees
## Min. :2.000e+06 IT Services : 733 Min. : 1.0
## 1st Qu.:5.100e+06 Business Products & Services: 482 1st Qu.: 25.0
## Median :1.090e+07 Advertising & Marketing : 471 Median : 53.0
## Mean :4.615e+07 Health : 355 Mean : 218.1
## 3rd Qu.:2.860e+07 Software : 342 3rd Qu.: 132.0
## Max. :4.700e+09 Financial Services : 260 Max. :32000.0
## (Other) :2355 NA's :12
## City State
## New York : 160 CA : 700
## Chicago : 90 TX : 387
## Austin : 88 NY : 311
## Houston : 76 VA : 283
## San Francisco: 75 FL : 282
## Atlanta : 74 IL : 272
## (Other) :4435 (Other):2763
#Well, it seems like data is still heavily right-skewed as Employees, Revenue and Growth rate have much higher Mean than Median even after removing outliers.
#Data types for each column
str(inc)
## 'data.frame': 5001 obs. of 8 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 5001 levels "(Add)ventures",..: 1770 1633 4423 690 1198 2839 4733 1468 1869 4968 ...
## $ Growth_Rate: num 421 248 245 233 213 ...
## $ Revenue : num 1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
## $ Industry : Factor w/ 25 levels "Advertising & Marketing",..: 5 12 13 7 1 20 10 1 5 21 ...
## $ Employees : int 104 51 132 50 220 63 27 75 97 15 ...
## $ City : Factor w/ 1519 levels "Acton","Addison",..: 391 365 635 2 139 66 912 1179 131 1418 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 5 47 10 45 20 45 44 5 46 41 ...
# Using scipen=999 to disable scientific notations in Revenue.
options(scipen=999)
summary(inc)
## Rank Name Growth_Rate
## Min. : 1 (Add)ventures : 1 Min. : 0.340
## 1st Qu.:1252 @Properties : 1 1st Qu.: 0.770
## Median :2502 1-Stop Translation USA: 1 Median : 1.420
## Mean :2502 110 Consulting : 1 Mean : 4.612
## 3rd Qu.:3751 11thStreetCoffee.com : 1 3rd Qu.: 3.290
## Max. :5000 123 Exteriors : 1 Max. :421.480
## (Other) :4995
## Revenue Industry
## Min. : 2000000 IT Services : 733
## 1st Qu.: 5100000 Business Products & Services: 482
## Median : 10900000 Advertising & Marketing : 471
## Mean : 48222535 Health : 355
## 3rd Qu.: 28600000 Software : 342
## Max. :10100000000 Financial Services : 260
## (Other) :2358
## Employees City State
## Min. : 1.0 New York : 160 CA : 701
## 1st Qu.: 25.0 Chicago : 90 TX : 387
## Median : 53.0 Austin : 88 NY : 311
## Mean : 232.7 Houston : 76 VA : 283
## 3rd Qu.: 132.0 San Francisco: 75 FL : 282
## Max. :66803.0 Atlanta : 74 IL : 273
## NA's :12 (Other) :4438 (Other):2764
Create a graph that shows the distribution of companies in the dataset by State (ie how many are in each state). There are a lot of States, so consider which axis you should use. This visualization is ultimately going to be consumed on a ‘portrait’ oriented screen (ie taller than wide), which should further guide your layout choices.
# Answer Question 1 here
# I am using sqldf to group by State and count number of companies
query<-sqldf("select
State, count(distinct Name) as cnt_comp
from inc
group by State")
# Using ggplot to graph
ggplot(query, aes(x=reorder(State, cnt_comp), y=cnt_comp)) +
geom_bar(stat="identity", fill="green") +
geom_text(aes(label=cnt_comp)) +
ylab("# of companies") +
xlab("State") +
coord_flip() +
ggtitle("Number of Companies by State") +
theme_bw()
Lets dig in on the state with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries. Create a plot that shows the average and/or median employment by industry for companies in this state (only use cases with full data, use R’s complete.cases() function.) In addition to this, your graph should show how variable the ranges are, and you should deal with outliers.
# Answer Question 2 here
#Filter out N/As in full data
full <- inc[complete.cases(inc),]
#Using filtered full, choose 3rd most state
third <- subset(full, State == 'NY')
#The summary looks good, no N/As
summary(third)
## Rank Name Growth_Rate
## Min. : 26 1st Equity : 1 Min. : 0.350
## 1st Qu.:1186 33Across : 1 1st Qu.: 0.670
## Median :2702 5Linx Enterprises : 1 Median : 1.310
## Mean :2612 Access Display Group: 1 Mean : 4.371
## 3rd Qu.:4005 Adafruit : 1 3rd Qu.: 3.580
## Max. :4981 AdCorp Media Group : 1 Max. :84.430
## (Other) :305
## Revenue Industry Employees
## Min. : 2000000 Advertising & Marketing : 57 Min. : 1.0
## 1st Qu.: 4300000 IT Services : 43 1st Qu.: 21.0
## Median : 8800000 Business Products & Services: 26 Median : 45.0
## Mean : 58715113 Consumer Products & Services: 17 Mean : 271.3
## 3rd Qu.: 25700000 Telecommunications : 17 3rd Qu.: 105.5
## Max. :4600000000 Education : 14 Max. :32000.0
## (Other) :137
## City State
## New York :160 NY :311
## Brooklyn : 15 AK : 0
## Rochester: 9 AL : 0
## Buffalo : 5 AR : 0
## Fairport : 5 AZ : 0
## new york : 5 CA : 0
## (Other) :112 (Other): 0
# I am using sqldf to group by industry to get mean employment in NY
query<-sqldf("select
Industry, avg(Employees) as avg_emp
from third
group by Industry")
# Using ggplot to graph
ggplot(query, aes(x=reorder(Industry, avg_emp), y=round(avg_emp,0))) +
geom_bar(stat="identity", fill="skyblue") +
geom_text(aes(label=round(avg_emp,0))) +
ylab("Mean Number Of Employees") +
xlab("Industry") +
ggtitle("Mean Number of Employees by Industry in New York") +
coord_flip() +
theme_bw()
#There are outliers in Employees variables by Industry. We will get rid of these and re-graph our result.
ggplot(third, aes(x=reorder(Industry,Employees) , y=Employees)) +
geom_boxplot() +
ggtitle("Box-Plot by Industry and number of Employees") +
coord_flip()
#outlierKD function will automatically detect outliers and replace dataset (third) with dataset without outliers
outlierKD <- function(dt, var) {
var_name <- eval(substitute(var),eval(dt))
na1 <- sum(is.na(var_name))
m1 <- mean(var_name, na.rm = T)
par(mfrow=c(2, 2), oma=c(0,0,3,0))
boxplot(var_name, main="With outliers")
hist(var_name, main="With outliers", xlab=NA, ylab=NA)
outlier <- boxplot.stats(var_name)$out
mo <- mean(outlier)
var_name <- ifelse(var_name %in% outlier, NA, var_name)
boxplot(var_name, main="Without outliers")
hist(var_name, main="Without outliers", xlab=NA, ylab=NA)
title("Outlier Check", outer=TRUE)
na2 <- sum(is.na(var_name))
cat("Outliers identified:", na2 - na1, "n")
cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "n")
cat("Mean of the outliers:", round(mo, 2), "n")
m2 <- mean(var_name, na.rm = T)
cat("Mean without removing outliers:", round(m1, 2), "n")
cat("Mean if we remove outliers:", round(m2, 2), "n")
dt[as.character(substitute(var))] <- invisible(var_name)
assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
cat("Outliers successfully removed", "n")
return(invisible(dt))
}
#Using loop, remove outliers in Employees by each industry
industries <- unique(third$Industry)
third_wo <-NULL
for (i in industries){
d2 <- subset(third, Industry == i)
outlierKD(d2, Employees)
third_wo <- rbind(d2, third_wo)
}
## Outliers identified: 1 nPropotion (%) of outliers: 6.2 nMean of the outliers: 10000 nMean without removing outliers: 626.29 nMean if we remove outliers: 40.44 nOutliers successfully removed n
## Outliers identified: 6 nPropotion (%) of outliers: 11.8 nMean of the outliers: 214.67 nMean without removing outliers: 58.44 nMean if we remove outliers: 40.06 nOutliers successfully removed n
## Outliers identified: 1 nPropotion (%) of outliers: 8.3 nMean of the outliers: 483 nMean without removing outliers: 144.31 nMean if we remove outliers: 116.08 nOutliers successfully removed n
## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 95.35 nMean if we remove outliers: 95.35 nOutliers successfully removed n
## Outliers identified: 3 nPropotion (%) of outliers: 27.3 nMean of the outliers: 70.33 nMean without removing outliers: 24.79 nMean if we remove outliers: 12.36 nOutliers successfully removed n
## Outliers identified: 1 nPropotion (%) of outliers: 16.7 nMean of the outliers: 2280 nMean without removing outliers: 547.71 nMean if we remove outliers: 259 nOutliers successfully removed n
## Outliers identified: 1 nPropotion (%) of outliers: 10 nMean of the outliers: 602 nMean without removing outliers: 108 nMean if we remove outliers: 58.6 nOutliers successfully removed n
## Outliers identified: 4 nPropotion (%) of outliers: 10.3 nMean of the outliers: 1233 nMean without removing outliers: 204.09 nMean if we remove outliers: 98.56 nOutliers successfully removed n
## Outliers identified: 1 nPropotion (%) of outliers: 7.7 nMean of the outliers: 200 nMean without removing outliers: 59.86 nMean if we remove outliers: 49.08 nOutliers successfully removed n
## Outliers identified: 3 nPropotion (%) of outliers: 13 nMean of the outliers: 11780.67 nMean without removing outliers: 1492.46 nMean if we remove outliers: 150.52 nOutliers successfully removed n
## Outliers identified: 2 nPropotion (%) of outliers: 18.2 nMean of the outliers: 253.5 nMean without removing outliers: 73.31 nMean if we remove outliers: 40.55 nOutliers successfully removed n
## Outliers identified: 1 nPropotion (%) of outliers: 10 nMean of the outliers: 2081 nMean without removing outliers: 437.55 nMean if we remove outliers: 273.2 nOutliers successfully removed n
## Outliers identified: 1 nPropotion (%) of outliers: 8.3 nMean of the outliers: 298 nMean without removing outliers: 81.85 nMean if we remove outliers: 63.83 nOutliers successfully removed n
## Outliers identified: 1 nPropotion (%) of outliers: 12.5 nMean of the outliers: 383 nMean without removing outliers: 76.44 nMean if we remove outliers: 38.12 nOutliers successfully removed n
## Outliers identified: 2 nPropotion (%) of outliers: 66.7 nMean of the outliers: 149.5 nMean without removing outliers: 129.2 nMean if we remove outliers: 115.67 nOutliers successfully removed n
## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 18.25 nMean if we remove outliers: 18.25 nOutliers successfully removed n
## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 29.5 nMean if we remove outliers: 29.5 nOutliers successfully removed n
## Outliers identified: 2 nPropotion (%) of outliers: 18.2 nMean of the outliers: 1035.5 nMean without removing outliers: 245.92 nMean if we remove outliers: 102.36 nOutliers successfully removed n
## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 32.5 nMean if we remove outliers: 32.5 nOutliers successfully removed n
## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 17 nMean if we remove outliers: 17 nOutliers successfully removed n
## Outliers identified: 1 nPropotion (%) of outliers: 20 nMean of the outliers: 219 nMean without removing outliers: 61 nMean if we remove outliers: 29.4 nOutliers successfully removed n
## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 135 nMean if we remove outliers: 135 nOutliers successfully removed n
## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 44 nMean if we remove outliers: 44 nOutliers successfully removed n
## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 155 nMean if we remove outliers: 155 nOutliers successfully removed n
## Outliers identified: 0 nPropotion (%) of outliers: 0 nMean of the outliers: NaN nMean without removing outliers: 53.5 nMean if we remove outliers: 53.5 nOutliers successfully removed n
# Now, dataset (third) is free of outliers but notice there are NAs because these NAs were outliers before
summary(third)
## Rank Name Growth_Rate
## Min. : 26 1st Equity : 1 Min. : 0.350
## 1st Qu.:1186 33Across : 1 1st Qu.: 0.670
## Median :2702 5Linx Enterprises : 1 Median : 1.310
## Mean :2612 Access Display Group: 1 Mean : 4.371
## 3rd Qu.:4005 Adafruit : 1 3rd Qu.: 3.580
## Max. :4981 AdCorp Media Group : 1 Max. :84.430
## (Other) :305
## Revenue Industry Employees
## Min. : 2000000 Advertising & Marketing : 57 Min. : 1.0
## 1st Qu.: 4300000 IT Services : 43 1st Qu.: 21.0
## Median : 8800000 Business Products & Services: 26 Median : 45.0
## Mean : 58715113 Consumer Products & Services: 17 Mean : 271.3
## 3rd Qu.: 25700000 Telecommunications : 17 3rd Qu.: 105.5
## Max. :4600000000 Education : 14 Max. :32000.0
## (Other) :137
## City State
## New York :160 NY :311
## Brooklyn : 15 AK : 0
## Rochester: 9 AL : 0
## Buffalo : 5 AR : 0
## Fairport : 5 AZ : 0
## new york : 5 CA : 0
## (Other) :112 (Other): 0
summary(third_wo)
## Rank Name Growth_Rate
## Min. : 26 1st Equity : 1 Min. : 0.350
## 1st Qu.:1186 33Across : 1 1st Qu.: 0.670
## Median :2702 5Linx Enterprises : 1 Median : 1.310
## Mean :2612 Access Display Group: 1 Mean : 4.371
## 3rd Qu.:4005 Adafruit : 1 3rd Qu.: 3.580
## Max. :4981 AdCorp Media Group : 1 Max. :84.430
## (Other) :305
## Revenue Industry Employees
## Min. : 2000000 Advertising & Marketing : 57 Min. : 1.00
## 1st Qu.: 4300000 IT Services : 43 1st Qu.: 19.00
## Median : 8800000 Business Products & Services: 26 Median : 39.50
## Mean : 58715113 Consumer Products & Services: 17 Mean : 82.76
## 3rd Qu.: 25700000 Telecommunications : 17 3rd Qu.: 81.75
## Max. :4600000000 Education : 14 Max. :1134.00
## (Other) :137 NA's :31
## City State
## New York :160 NY :311
## Brooklyn : 15 AK : 0
## Rochester: 9 AL : 0
## Buffalo : 5 AR : 0
## Fairport : 5 AZ : 0
## new york : 5 CA : 0
## (Other) :112 (Other): 0
# We want to filter out N/As in third without outliers once more
third_wo <- third_wo[complete.cases(third_wo),]
#Let's see how box-plot now looks like with new dataset. Pretty neat
ggplot(third_wo, aes(x=reorder(Industry,Employees) , y=Employees)) +
geom_boxplot() +
ggtitle("Box-Plot by Industry and Number of Employees") +
coord_flip()
# I am re-using sqldf to group by industry to get mean employment in NY (without outliers)
query<-sqldf("select
Industry, avg(Employees) as avg_emp
from third_wo
group by Industry")
# Using ggplot to re-graph (without outliers)
ggplot(query, aes(x=reorder(Industry, avg_emp), y=round(avg_emp,0))) +
geom_bar(stat="identity", fill="skyblue") +
geom_text(aes(label=round(avg_emp,0))) +
ylab("Mean Number Of Employees") +
xlab("Industry") +
ggtitle("Mean Number of Employees by Industry in New York") +
coord_flip() +
theme_bw()
# Use query again
query <- sqldf("select
Industry, avg(Employees) as avg_emp
from third
group by Industry")
# We can also remove outliers in mean level
outlier_avg <- outlier(query$avg_emp)
# remove a row with outlier
query <- subset(query, avg_emp != outlier_avg)
# Graph again
ggplot(query, aes(x=reorder(Industry, avg_emp), y=round(avg_emp,0))) +
geom_bar(stat="identity", fill="skyblue") +
geom_text(aes(label=round(avg_emp,0))) +
ylab("Mean Number Of Employees") +
xlab("Industry") +
ggtitle("Mean Number of Employees by Industry in New York") +
coord_flip() +
theme_bw()
Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear. Once again, the distribution per industry should be shown.
# Answer Question 3 here
# Use query again
query<-sqldf(
"SELECT a.Industry, (tot_rev / emp_cnt) as rev_emp
FROM
(select
Industry, SUM(Revenue) as tot_rev, SUM(Employees) as emp_cnt
from full
group by Industry) as a"
)
#Set K = 4 for K-mean clustering: Very high, high, medium, low by Revenue Per Employee
cl=(kmeans(query$rev_emp,4))
#set cluster as a factor and add it to our dataset
query$cluster=factor(cl$cluster)
centers=as.data.frame(cl$centers)
#Plot cluster points
ggplot(query, aes(x=reorder(Industry, rev_emp), y=round(rev_emp,2), color=cluster)) +
geom_point() +
geom_text(aes(label=round(rev_emp/1000,2)), hjust=1, size=4) +
ylab("Revenue Per Employee (1000's)") +
xlab("Industry") +
ggtitle("K-Mean Clustering : Revenue Per Employee") +
coord_flip() +
theme_bw() +
scale_y_continuous(labels = function(x){paste0(x/1000, 'K')})