Principles of Data Visualization and Introduction to ggplot2

I have provided you with data about the 5,000 fastest growing companies in the US, as compiled by Inc. magazine. Let's read this in:

library(ggpubr)
## Loading required package: ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(RColorBrewer)
# Load the Inc. 5000 data set straight from the course repository.
inc <- read.csv(
  "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv",
  header = TRUE
)

And let's preview this data:

# First six rows, to see what a record looks like.
head(inc)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
# Container type of the loaded object.
class(inc)
## [1] "data.frame"
# Number of rows and columns.
dim(inc)
## [1] 5001    8
# Column names available for the analysis below.
colnames(inc)
## [1] "Rank"        "Name"        "Growth_Rate" "Revenue"     "Industry"   
## [6] "Employees"   "City"        "State"
# Per-column summary: quantiles for numeric columns, length/class for text.
summary(inc)
##       Rank          Name            Growth_Rate         Revenue         
##  Min.   :   1   Length:5001        Min.   :  0.340   Min.   :2.000e+06  
##  1st Qu.:1252   Class :character   1st Qu.:  0.770   1st Qu.:5.100e+06  
##  Median :2502   Mode  :character   Median :  1.420   Median :1.090e+07  
##  Mean   :2502                      Mean   :  4.612   Mean   :4.822e+07  
##  3rd Qu.:3751                      3rd Qu.:  3.290   3rd Qu.:2.860e+07  
##  Max.   :5000                      Max.   :421.480   Max.   :1.010e+10  
##                                                                         
##    Industry           Employees           City              State          
##  Length:5001        Min.   :    1.0   Length:5001        Length:5001       
##  Class :character   1st Qu.:   25.0   Class :character   Class :character  
##  Mode  :character   Median :   53.0   Mode  :character   Mode  :character  
##                     Mean   :  232.7                                        
##                     3rd Qu.:  132.0                                        
##                     Max.   :66803.0                                        
##                     NA's   :12

Think a bit on what these summaries mean. Use the space below to add some more relevant non-visual exploratory information you think helps you understand this data:

# Row and column counts (same call as in the preview chunk).
dim(inc)
## [1] 5001    8
# Compact listing of every column's type and its first few values.
str(inc)
## 'data.frame':    5001 obs. of  8 variables:
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name       : chr  "Fuhu" "FederalConference.com" "The HCI Group" "Bridger" ...
##  $ Growth_Rate: num  421 248 245 233 213 ...
##  $ Revenue    : num  1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
##  $ Industry   : chr  "Consumer Products & Services" "Government Services" "Health" "Energy" ...
##  $ Employees  : int  104 51 132 50 220 63 27 75 97 15 ...
##  $ City       : chr  "El Segundo" "Dumfries" "Jacksonville" "Addison" ...
##  $ State      : chr  "CA" "VA" "FL" "TX" ...
# Companies per industry, largest industry first.
tabind <- setNames(
  as.data.frame(table(inc$Industry)),
  c("Industry", "Count")
)
tabind <- tabind[order(-tabind$Count), ]
rownames(tabind) <- NULL  # renumber rows 1..n after sorting
tabind
##                        Industry Count
## 1                   IT Services   733
## 2  Business Products & Services   482
## 3       Advertising & Marketing   471
## 4                        Health   355
## 5                      Software   342
## 6            Financial Services   260
## 7                 Manufacturing   256
## 8  Consumer Products & Services   203
## 9                        Retail   203
## 10          Government Services   202
## 11              Human Resources   196
## 12                 Construction   187
## 13   Logistics & Transportation   155
## 14              Food & Beverage   131
## 15           Telecommunications   129
## 16                       Energy   109
## 17                  Real Estate    96
## 18                    Education    83
## 19                  Engineering    74
## 20                     Security    73
## 21         Travel & Hospitality    62
## 22                        Media    54
## 23       Environmental Services    51
## 24                    Insurance    50
## 25            Computer Hardware    44
# Companies per state, most companies first. The column names are supplied
# while building the frequency table instead of being renamed afterwards.
tabst <- as.data.frame(table(State = inc$State), responseName = "Count")
tabst <- tabst[with(tabst, order(-Count)), ]
row.names(tabst) <- NULL  # renumber rows 1..n after sorting
tabst
##    State Count
## 1     CA   701
## 2     TX   387
## 3     NY   311
## 4     VA   283
## 5     FL   282
## 6     IL   273
## 7     GA   212
## 8     OH   186
## 9     MA   182
## 10    PA   164
## 11    NJ   158
## 12    NC   137
## 13    CO   134
## 14    MD   131
## 15    WA   130
## 16    MI   126
## 17    AZ   100
## 18    UT    95
## 19    MN    88
## 20    TN    82
## 21    WI    79
## 22    IN    69
## 23    MO    59
## 24    AL    51
## 25    CT    50
## 26    OR    49
## 27    SC    48
## 28    OK    46
## 29    DC    43
## 30    KY    40
## 31    KS    38
## 32    LA    37
## 33    IA    28
## 34    NE    27
## 35    NV    26
## 36    NH    24
## 37    ID    17
## 38    DE    16
## 39    RI    16
## 40    ME    13
## 41    MS    12
## 42    ND    10
## 43    AR     9
## 44    HI     7
## 45    VT     6
## 46    NM     5
## 47    MT     4
## 48    SD     3
## 49    AK     2
## 50    WV     2
## 51    WY     2
## 52    PR     1

Question 1

Create a graph that shows the distribution of companies in the dataset by State (i.e., how many are in each state). There are a lot of States, so consider which axis you should use. This visualization is ultimately going to be consumed on a ‘portrait’ oriented screen (i.e., taller than wide), which should further guide your layout choices.

# Answer Question 1 here

# Horizontal bar chart of company counts per state, ordered by count, so the
# state labels run down the tall axis of a portrait layout.
# The bar colour is set as a constant on the geom: the previous
# scale_fill_manual() call had no effect because no `fill` aesthetic was
# mapped, so the bars were drawn in the default colour.
ggplot(tabst, aes(x = reorder(State, Count), y = Count)) +
  geom_col(fill = "#669933") +
  coord_flip() +
  labs(title = "Companies by State") +
  xlab("State") +
  theme(axis.text = element_text(size = 4))

# For next question, we see from the plot that NY has the 3rd most companies

Question 2

Let's dig in on the state with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries. Create a plot that shows the average and/or median employment by industry for companies in this state (only use cases with full data; use R’s complete.cases() function). In addition to this, your graph should show how variable the ranges are, and you should deal with outliers.

# Answer Question 2 here
# Restrict the data to New York companies and inspect the result.
NY_only <- subset(inc, State == "NY")
str(NY_only)
## 'data.frame':    311 obs. of  8 variables:
##  $ Rank       : int  26 30 37 38 48 70 71 124 126 153 ...
##  $ Name       : chr  "BeenVerified" "Sailthru" "YellowHammer" "Conductor" ...
##  $ Growth_Rate: num  84.4 73.2 67.4 67 53.6 ...
##  $ Revenue    : num  13700000 8100000 18000000 7100000 5900000 27900000 6900000 11500000 9800000 15400000 ...
##  $ Industry   : chr  "Consumer Products & Services" "Advertising & Marketing" "Advertising & Marketing" "Advertising & Marketing" ...
##  $ Employees  : int  17 79 27 89 32 75 42 28 17 42 ...
##  $ City       : chr  "New York" "New York" "New York" "New York" ...
##  $ State      : chr  "NY" "NY" "NY" "NY" ...
# How many NY companies are there?
dim(NY_only)
## [1] 311   8
# Keep only the rows with no missing value in any column.
Complete_NY <- subset(NY_only, complete.cases(NY_only))

# How many complete records remain?
dim(Complete_NY)
## [1] 311   8
# Quick peek for outliers: employee counts by industry for all complete NY rows.
# ggplot() + geom_boxplot() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0; mapping columns by name also gives readable axis titles
# ("Industry", "Employees") instead of "Complete_NY$Industry".
ggplot(Complete_NY, aes(x = Industry, y = Employees)) +
  geom_boxplot()

# Keep only companies with fewer than 5000 employees.
Complete_NY2 <- subset(Complete_NY, Employees < 5000)

# Companies < 5000, quick look
ggplot(Complete_NY2, aes(x = Industry, y = Employees)) +
  geom_boxplot()

# Narrow further to companies with fewer than 500 employees.
Complete_NY3 <- subset(Complete_NY2, Employees < 500)

dim(Complete_NY3)
## [1] 293   8
# Look before final refinement
ggplot(Complete_NY3, aes(x = Industry, y = Employees)) +
  geom_boxplot()

# Final chart: one horizontal box per industry showing the spread of employee
# counts among the filtered NY companies, with the industry mean overlaid as a
# red point on top of the box's median line.
ggplot(Complete_NY3, aes(x = Industry, y = Employees)) +
  geom_boxplot() +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 20,
    size = 2,
    color = "red",
    fill = "red"
  ) +
  coord_flip() +
  labs(
    title = "Distribution of Employees by Industry",
    subtitle = "NY companies with < 500 employees",
    caption = "Mean=red marker"
  ) +
  theme(axis.text = element_text(size = 4))

Question 3

Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear. Once again, the distribution per industry should be shown.

# Revenue per employee, expressed in thousands, for each complete NY record.
# NOTE(review): this analysis is limited to NY companies (as the plot title
# says); confirm whether Question 3 is meant to cover all states.
Complete_NY$rev_per_EE <- with(Complete_NY, Revenue / Employees / 1000)

# Exclude records at or above 4000 (thousand) per employee before plotting.
Complete_NYR <- subset(Complete_NY, rev_per_EE < 4000)

# One horizontal box per industry with the industry mean marked in red.
ggplot(Complete_NYR, aes(x = Industry, y = rev_per_EE)) +
  geom_boxplot() +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 20,
    size = 2,
    color = "red",
    fill = "red"
  ) +
  coord_flip() +
  labs(
    title = "Distribution of Revenue/Employee by Industry in NY",
    caption = "Mean=red marker, outliers >4000 excluded",
    x = NULL,
    y = "Revenue per Employee (000's)"
  ) +
  theme(axis.text = element_text(size = 4))

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.