Principles of Data Visualization and Introduction to ggplot2
I have provided you with data about the 5,000 fastest growing companies in the US, as compiled by Inc. magazine. Let's read this in:
# Load the Inc. 5000 data set from GitHub. stringsAsFactors is set explicitly
# because its default changed to FALSE in R 4.0.0; the factor-based summaries
# shown in this document rely on text columns being read as factors.
inc <- read.csv(
  "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
And let's preview this data:
# Show the first six rows of the data set.
head(inc, n = 6L)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
# Per-column summary of the data set.
summary(object = inc)
## Rank Name Growth_Rate
## Min. : 1 (Add)ventures : 1 Min. : 0.340
## 1st Qu.:1252 @Properties : 1 1st Qu.: 0.770
## Median :2502 1-Stop Translation USA: 1 Median : 1.420
## Mean :2502 110 Consulting : 1 Mean : 4.612
## 3rd Qu.:3751 11thStreetCoffee.com : 1 3rd Qu.: 3.290
## Max. :5000 123 Exteriors : 1 Max. :421.480
## (Other) :4995
## Revenue Industry Employees
## Min. :2.000e+06 IT Services : 733 Min. : 1.0
## 1st Qu.:5.100e+06 Business Products & Services: 482 1st Qu.: 25.0
## Median :1.090e+07 Advertising & Marketing : 471 Median : 53.0
## Mean :4.822e+07 Health : 355 Mean : 232.7
## 3rd Qu.:2.860e+07 Software : 342 3rd Qu.: 132.0
## Max. :1.010e+10 Financial Services : 260 Max. :66803.0
## (Other) :2358 NA's :12
## City State
## New York : 160 CA : 701
## Chicago : 90 TX : 387
## Austin : 88 NY : 311
## Houston : 76 VA : 283
## San Francisco: 75 FL : 282
## Atlanta : 74 IL : 273
## (Other) :4438 (Other):2764
#str(inc)
require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Horizontal bar chart of the number of companies in each state.
p <- ggplot(data = inc, mapping = aes(x = factor(State))) +
  geom_bar(fill = "purple") +
  coord_flip() +
  labs(title = "Counts of Companies by State", x = "State", y = "Count") +
  theme(
    text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 18)
  )
p
2. For the state with the 3rd most companies, create a plot of average employment by industry for companies in this state (only use cases with full data). Your graph should show how variable the ranges are, and exclude outliers.
# Tabulate the number of companies per state as a two-column data frame
# (State, Count), naming the columns while building the table.
counts <- as.data.frame(table(State = inc$State), responseName = "Count")
head(counts)
## State Count
## 1 AK 2
## 2 AL 51
## 3 AR 9
## 4 AZ 100
## 5 CA 701
## 6 CO 134
Find the state with the 3rd most companies:
# Third-largest company count, then the row(s) of counts holding that value.
x <- sort(counts$Count, decreasing = TRUE)[3]
counts %>% filter(Count == x)
## State Count
## 1 NY 311
Filter to New York (NY) companies and remove incomplete cases:
# Keep only the New York rows, drop any row containing a missing value,
# then inspect the structure of the result.
ny_inc <- inc %>% filter(State == "NY")
ny_inc <- ny_inc[complete.cases(ny_inc), , drop = FALSE]
glimpse(ny_inc)
## Observations: 311
## Variables: 8
## $ Rank <int> 26, 30, 37, 38, 48, 70, 71, 124, 126, 153, 174, 21...
## $ Name <fct> BeenVerified, Sailthru, YellowHammer, Conductor, C...
## $ Growth_Rate <dbl> 84.43, 73.22, 67.40, 67.02, 53.65, 44.99, 44.85, 2...
## $ Revenue <dbl> 13700000, 8100000, 18000000, 7100000, 5900000, 279...
## $ Industry <fct> Consumer Products & Services, Advertising & Market...
## $ Employees <int> 17, 79, 27, 89, 32, 75, 42, 28, 17, 42, 99, 119, 2...
## $ City <fct> New York, New York, New York, New York, Rock Hill,...
## $ State <fct> NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, NY...
Try a box plot:
# Keep only the two columns needed for the per-industry employment plot.
ny_inc <- ny_inc[c("Industry", "Employees")]

# Mean employee count per industry; drawn as points on the box plot below.
IM <- aggregate(ny_inc$Employees, by = list(ny_inc$Industry),
                FUN = mean, na.rm = TRUE)
colnames(IM) <- c("Industry", "EmployeeMean")

# Box plots show how variable employment is within each industry, on a log
# scale. Columns are mapped by bare name (not `ny_inc$...`) so the mapping is
# evaluated against each layer's data. Outlier points are hidden with
# outlier.shape = NA, and the industry means from IM are overlaid as points.
p <- ggplot(ny_inc, aes(Industry, Employees)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(data = IM, aes(Industry, EmployeeMean),
             colour = "purple", size = 2) +
  theme_classic() +
  scale_y_log10() +
  labs(title = "Employees by Industry in NY", x = "Industry",
       y = "Employee Counts log scale")
p + coord_flip()
# Drop every row that has a missing value in any column.
inc_rev <- inc[complete.cases(inc), ]
# Add rev_per_em: each company's Revenue divided by its Employees.
inc_rev <- mutate(inc_rev, rev_per_em = Revenue / Employees)
glimpse(inc_rev)
## Observations: 4,989
## Variables: 9
## $ Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Name <fct> Fuhu, FederalConference.com, The HCI Group, Bridge...
## $ Growth_Rate <dbl> 421.48, 248.31, 245.45, 233.08, 213.37, 179.38, 17...
## $ Revenue <dbl> 1.179e+08, 4.960e+07, 2.550e+07, 1.900e+09, 8.700e...
## $ Industry <fct> Consumer Products & Services, Government Services,...
## $ Employees <int> 104, 51, 132, 50, 220, 63, 27, 75, 97, 15, 149, 16...
## $ City <fct> El Segundo, Dumfries, Jacksonville, Addison, Bosto...
## $ State <fct> CA, VA, FL, TX, MA, TX, TN, CA, UT, RI, VA, CA, FL...
## $ rev_per_em <dbl> 1133653.8, 972549.0, 193181.8, 38000000.0, 395454....
Make a plot:
# Mean revenue per employee for each industry, computed up front so the chart
# does not depend on ggplot2's summary-stat arguments (`fun.y` is deprecated
# in newer ggplot2 releases).
rev_by_industry <- aggregate(rev_per_em ~ Industry, data = inc_rev, FUN = mean)

# Horizontal bars, one per industry, in a single fixed colour. The original
# mapped fill to Industry and then overrode it with a constant fill, so the
# mapping (and the legend suppression it required) has been removed.
p2 <- ggplot(rev_by_industry, aes(Industry, rev_per_em)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(title = "Average Revenue per Employees by Industry", x = "Industry",
       y = "Average Revenue per Employees") +
  theme(
    text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 18)
  )
p2