Principles of Data Visualization and Introduction to ggplot2
I have provided you with data about the 5,000 fastest growing companies in the US, as compiled by Inc. magazine. Let's read this in:
library(ggpubr)
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(RColorBrewer)
# Read the Inc. 5000 CSV from the course repository; the first row holds the column names.
inc <- read.csv(
  file = "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv",
  header = TRUE
)
And let's preview this data:
# Print the first rows of the data to check that the columns were parsed as expected
head(inc)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
# Confirm the type of object returned by read.csv()
class(inc)
## [1] "data.frame"
# Number of rows and columns
dim(inc)
## [1] 5001 8
# Column names available for the analysis below
colnames(inc)
## [1] "Rank" "Name" "Growth_Rate" "Revenue" "Industry"
## [6] "Employees" "City" "State"
# Per-column summary: quartiles for numeric columns, length/class for character columns
summary(inc)
## Rank Name Growth_Rate Revenue
## Min. : 1 Length:5001 Min. : 0.340 Min. :2.000e+06
## 1st Qu.:1252 Class :character 1st Qu.: 0.770 1st Qu.:5.100e+06
## Median :2502 Mode :character Median : 1.420 Median :1.090e+07
## Mean :2502 Mean : 4.612 Mean :4.822e+07
## 3rd Qu.:3751 3rd Qu.: 3.290 3rd Qu.:2.860e+07
## Max. :5000 Max. :421.480 Max. :1.010e+10
##
## Industry Employees City State
## Length:5001 Min. : 1.0 Length:5001 Length:5001
## Class :character 1st Qu.: 25.0 Class :character Class :character
## Mode :character Median : 53.0 Mode :character Mode :character
## Mean : 232.7
## 3rd Qu.: 132.0
## Max. :66803.0
## NA's :12
Think a bit on what these summaries mean. Use the space below to add some more relevant non-visual exploratory information you think helps you understand this data:
# Number of rows and columns in the data set
dim(inc)
## [1] 5001 8
# Compact listing of each column's type together with its first few values
str(inc)
## 'data.frame': 5001 obs. of 8 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : chr "Fuhu" "FederalConference.com" "The HCI Group" "Bridger" ...
## $ Growth_Rate: num 421 248 245 233 213 ...
## $ Revenue : num 1.18e+08 4.96e+07 2.55e+07 1.90e+09 8.70e+07 ...
## $ Industry : chr "Consumer Products & Services" "Government Services" "Health" "Energy" ...
## $ Employees : int 104 51 132 50 220 63 27 75 97 15 ...
## $ City : chr "El Segundo" "Dumfries" "Jacksonville" "Addison" ...
## $ State : chr "CA" "VA" "FL" "TX" ...
# Tally companies per industry and list the industries from most to fewest companies.
# Naming the table dimension and the response column up front gives the
# "Industry" / "Count" headers without a separate renaming step.
tabind <- as.data.frame(table(Industry = inc$Industry), responseName = "Count")
tabind <- tabind[order(-tabind$Count), ]
# Reset the row labels to 1..n so they follow the sorted order.
row.names(tabind) <- NULL
print(tabind)
## Industry Count
## 1 IT Services 733
## 2 Business Products & Services 482
## 3 Advertising & Marketing 471
## 4 Health 355
## 5 Software 342
## 6 Financial Services 260
## 7 Manufacturing 256
## 8 Consumer Products & Services 203
## 9 Retail 203
## 10 Government Services 202
## 11 Human Resources 196
## 12 Construction 187
## 13 Logistics & Transportation 155
## 14 Food & Beverage 131
## 15 Telecommunications 129
## 16 Energy 109
## 17 Real Estate 96
## 18 Education 83
## 19 Engineering 74
## 20 Security 73
## 21 Travel & Hospitality 62
## 22 Media 54
## 23 Environmental Services 51
## 24 Insurance 50
## 25 Computer Hardware 44
# Tally companies per state and list the states from most to fewest companies.
# Naming the table dimension and the response column up front gives the
# "State" / "Count" headers without a separate renaming step.
tabst <- as.data.frame(table(State = inc$State), responseName = "Count")
tabst <- tabst[order(-tabst$Count), ]
# Reset the row labels to 1..n so they follow the sorted order.
row.names(tabst) <- NULL
print(tabst)
## State Count
## 1 CA 701
## 2 TX 387
## 3 NY 311
## 4 VA 283
## 5 FL 282
## 6 IL 273
## 7 GA 212
## 8 OH 186
## 9 MA 182
## 10 PA 164
## 11 NJ 158
## 12 NC 137
## 13 CO 134
## 14 MD 131
## 15 WA 130
## 16 MI 126
## 17 AZ 100
## 18 UT 95
## 19 MN 88
## 20 TN 82
## 21 WI 79
## 22 IN 69
## 23 MO 59
## 24 AL 51
## 25 CT 50
## 26 OR 49
## 27 SC 48
## 28 OK 46
## 29 DC 43
## 30 KY 40
## 31 KS 38
## 32 LA 37
## 33 IA 28
## 34 NE 27
## 35 NV 26
## 36 NH 24
## 37 ID 17
## 38 DE 16
## 39 RI 16
## 40 ME 13
## 41 MS 12
## 42 ND 10
## 43 AR 9
## 44 HI 7
## 45 VT 6
## 46 NM 5
## 47 MT 4
## 48 SD 3
## 49 AK 2
## 50 WV 2
## 51 WY 2
## 52 PR 1
Create a graph that shows the distribution of companies in the dataset by State (i.e., how many are in each state). There are a lot of States, so consider which axis you should use. This visualization is ultimately going to be consumed on a ‘portrait’ oriented screen (i.e., taller than wide), which should further guide your layout choices.
# Answer Question 1 here
# Horizontal bar chart with one bar per state, sorted by company count, so the
# long list of states runs down the page on a portrait-oriented screen.
ggplot(tabst, aes(x = reorder(State, Count), y = Count)) +
  # The bar colour is set directly on the geom: no variable is mapped to the
  # `fill` aesthetic, so a manual fill scale would leave the bars unchanged.
  geom_col(fill = "#669933") +
  coord_flip() +
  labs(title = "Companies by State", x = "State") +
  # Small axis text so that all state labels fit without overlapping.
  theme(axis.text = element_text(size = 4))
# For next question, we see from the plot that NY has the 3rd most companies
Let's dig in on the state with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries. Create a plot that shows the average and/or median employment by industry for companies in this state (only use cases with full data; use R’s complete.cases() function). In addition to this, your graph should show how variable the ranges are, and you should deal with outliers.
# Answer Question 2 here
# Keep only the rows whose State is "NY", then inspect the structure of the subset.
NY_only <- subset(inc, State == "NY")
str(NY_only)
## 'data.frame': 311 obs. of 8 variables:
## $ Rank : int 26 30 37 38 48 70 71 124 126 153 ...
## $ Name : chr "BeenVerified" "Sailthru" "YellowHammer" "Conductor" ...
## $ Growth_Rate: num 84.4 73.2 67.4 67 53.6 ...
## $ Revenue : num 13700000 8100000 18000000 7100000 5900000 27900000 6900000 11500000 9800000 15400000 ...
## $ Industry : chr "Consumer Products & Services" "Advertising & Marketing" "Advertising & Marketing" "Advertising & Marketing" ...
## $ Employees : int 17 79 27 89 32 75 42 28 17 42 ...
## $ City : chr "New York" "New York" "New York" "New York" ...
## $ State : chr "NY" "NY" "NY" "NY" ...
# How many NY companies are there?
dim(NY_only)
## [1] 311 8
# Keep only the rows that have no missing value in any column.
Complete_NY <- subset(NY_only, complete.cases(NY_only))
# How many records remain after the completeness filter?
dim(Complete_NY)
## [1] 311 8
# Quick peek for outliers: employees per industry across all complete NY records.
# The columns are mapped through aes() so the axes are labelled "Industry" and
# "Employees"; qplot() is deprecated in current ggplot2 releases.
ggplot(Complete_NY, aes(x = Industry, y = Employees)) +
  geom_boxplot()
# Very large companies dominate the scale, so keep only those with < 5000 employees.
Complete_NY2 <- subset(Complete_NY, Employees < 5000)
# Companies with < 5000 employees, quick look.
ggplot(Complete_NY2, aes(x = Industry, y = Employees)) +
  geom_boxplot()
# Tighten the cutoff further: keep only companies with < 500 employees.
Complete_NY3 <- subset(Complete_NY2, Employees < 500)
dim(Complete_NY3)
## [1] 293 8
# Look before final refinement (ggplot() replaces the deprecated qplot()).
ggplot(Complete_NY3, aes(x = Industry, y = Employees)) +
  geom_boxplot()
# Refined plot: one horizontal box per industry showing the median and spread of
# employee counts, with the industry mean overlaid as a red point.
ggplot(Complete_NY3, aes(x = Industry, y = Employees)) +
  geom_boxplot() +
  # Shape 20 is drawn with `color` only, so no `fill` is passed.
  stat_summary(fun = mean, geom = "point", shape = 20, size = 2, color = "red") +
  coord_flip() +
  labs(
    title = "Distribution of Employees by Industry",
    subtitle = "NY companies with < 500 employees",
    caption = "Mean=red marker"
  ) +
  theme(axis.text = element_text(size = 4))
Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear. Once again, the distribution per industry should be shown.
# Revenue per employee for each complete NY record, scaled down by 1000.
Complete_NY[["rev_per_EE"]] <- with(Complete_NY, Revenue / Employees / 1000)
# Drop the extreme values (4000 or more) so they do not stretch the scale.
Complete_NYR <- subset(Complete_NY, rev_per_EE < 4000)
# One horizontal box per industry, with the industry mean overlaid as a red point.
ggplot(Complete_NYR, aes(x = Industry, y = rev_per_EE)) +
  geom_boxplot() +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 20,
    size = 2,
    color = "red",
    fill = "red"
  ) +
  coord_flip() +
  labs(
    title = "Distribution of Revenue/Employee by Industry in NY",
    caption = "Mean=red marker, outliers >4000 excluded",
    x = NULL,
    y = "Revenue per Employee (000's)"
  ) +
  theme(axis.text = element_text(size = 4))
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.