suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
# Read inc5000_data.csv (first line is the header) straight from the course
# repository on GitHub, then preview the first two rows.
inc <- read.csv(
  file = "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA608/master/lecture1/Data/inc5000_data.csv",
  header = TRUE
)
head(inc, n = 2)
#> Rank Name Growth_Rate Revenue
#> 1 1 Fuhu 421.48 117900000
#> 2 2 FederalConference.com 248.31 49600000
#> Industry Employees City State
#> 1 Consumer Products & Services 104 El Segundo CA
#> 2 Government Services 51 Dumfries VA
# Summarise columns 3, 4, 5, 6 and 8 of `inc`; per the preview of `inc`
# these are Growth_Rate, Revenue, Industry, Employees and State.
summary(inc[c(3, 4, 5, 6, 8)])
#> Growth_Rate Revenue Industry
#> Min. : 0.340 Min. :2.000e+06 IT Services : 733
#> 1st Qu.: 0.770 1st Qu.:5.100e+06 Business Products & Services: 482
#> Median : 1.420 Median :1.090e+07 Advertising & Marketing : 471
#> Mean : 4.612 Mean :4.822e+07 Health : 355
#> 3rd Qu.: 3.290 3rd Qu.:2.860e+07 Software : 342
#> Max. :421.480 Max. :1.010e+10 Financial Services : 260
#> (Other) :2358
#> Employees State
#> Min. : 1.0 CA : 701
#> 1st Qu.: 25.0 TX : 387
#> Median : 53.0 NY : 311
#> Mean : 232.7 VA : 283
#> 3rd Qu.: 132.0 FL : 282
#> Max. :66803.0 IL : 273
#> NA's :12 (Other):2764
# Keep only the rows of `inc` that have no missing value in any column.
all_inc <- inc[complete.cases(inc), ]

# One row per State, with the number of rows for that state in `cnt`.
cnt <- ddply(
  .data = all_inc,
  .variables = .(State),
  .fun = summarize,
  cnt = length(State)
)

# Bar chart of the per-state counts, states along the x axis.
p3 <- ggplot(data = cnt, mapping = aes(x = State, y = cnt)) +
  geom_bar(stat = "identity")
p3
# coord_flip(): the same per-state bar chart with the x and y axes swapped,
# so the states run along the vertical axis.
p4 <- ggplot(cnt, aes(x = State, y = cnt)) +
  geom_bar(stat = "identity")
p4 + coord_flip()

# reorder(): order the State levels by their count so the bars are sorted.
# The plot is stored as `p_states`, the name that is printed on the next line.
p_states <- ggplot(cnt, aes(x = reorder(State, cnt), y = cnt)) +
  geom_bar(stat = "identity")
p_states + coord_flip()
# Restrict the complete rows to State "NY" and plot Employees against
# Industry as points, with the axes flipped so industries are on the
# vertical axis.
ny <- subset(all_inc, subset = State == "NY")
p5 <- ggplot(data = ny, mapping = aes(x = Industry, y = Employees)) +
  geom_point()
p5 + coord_flip()
# Clamp values to the interval [bot, top].
#
# x:   numeric vector (or a single value) to clamp.
# bot: lower bound; anything below it is replaced by `bot`.
# top: upper bound; anything above it is replaced by `top`.
#
# Returns the clamped values, one per element of `x` when `bot` and `top` are
# single values. pmin()/pmax() apply the clamp element-wise; for a single
# value the result equals min(top, max(x, bot)), so callers that apply this
# function one element at a time keep working unchanged.
winsor <- function(x, bot, top) {
  pmin(pmax(x, bot), top)
}
# Clamp every NY employee count to the range [0, 2500]. vapply() is used
# instead of sapply() so the result is always a numeric vector, including
# when `ny` has no rows.
ny$clip_employ <- vapply(
  ny$Employees,
  winsor,
  FUN.VALUE = numeric(1),
  bot = 0,
  top = 2500
)

# Clamped employee counts by industry: first as points, then as boxplots with
# the (flipped) value axis limited to 0-2500.
p5 <- ggplot(ny, aes(x = Industry, y = clip_employ))
p5 + geom_point() + coord_flip()
p5 + geom_boxplot() + coord_flip(ylim = c(0, 2500))

# Same boxplot with hand-placed text labels near the upper end of the value
# axis. NOTE(review): the x positions look like industry indices and the
# numbers look like the unclamped values of the clipped points - confirm.
p5 + geom_boxplot() + coord_flip(ylim = c(0, 2500)) +
  annotate(
    "text",
    label = c("outliers", "3,000", "10,000", "32,000"),
    x = c(18, 16, 5, 2),
    y = c(2300, 2400, 2400, 2400),
    size = c(4, 3, 3, 3)
  )
# Per-industry summary of the NY rows: mean and standard deviation of the raw
# Employees column, plus the median and the 25% / 75% quantiles of the
# clamped counts.
ny_ave <- summarize(
  group_by(ny, Industry),
  mean = mean(Employees),
  sd = sd(Employees),
  median = median(clip_employ),
  lower = quantile(clip_employ, probs = 0.25),
  upper = quantile(clip_employ, probs = 0.75)
)
head(ny_ave, n = 2)
#> # A tibble: 2 × 6
#> Industry mean sd median lower upper
#> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Advertising & Marketing 58.4386 62.22971 38.0 21.0 65.00
#> 2 Business Products & Services 1492.4615 6240.70574 70.5 30.5 332.75
# Median per industry as a point, with a range drawn from `lower` to `upper`.
# ymin/ymax are supplied as fixed vectors taken from ny_ave rather than as
# aes() mappings.
p6 <- ggplot(data = ny_ave, mapping = aes(x = Industry, y = median)) +
  geom_point() +
  geom_pointrange(ymin = ny_ave$lower, ymax = ny_ave$upper)

# Limit the value scale to 0-750 and flip the axes.
p6 + ylim(c(0, 750)) + coord_flip()
# Median per industry as a bar, with narrow error bars spanning `lower` to
# `upper` (passed as fixed vectors taken from ny_ave).
p7 <- ggplot(data = ny_ave, mapping = aes(x = Industry, y = median)) +
  geom_bar(stat = "identity") +
  geom_errorbar(ymin = ny_ave$lower, ymax = ny_ave$upper, width = .1)

# Limit the value scale to 0-750 and flip the axes.
p7 + ylim(c(0, 750)) + coord_flip()
# Add column `i`: Industry with its levels reordered by the median column.
ny_ave[["i"]] <- reorder(ny_ave$Industry, ny_ave$median)

# Coral bars of the median per (reordered) industry with blue error bars from
# `lower` to `upper`.
p8 <- ggplot(data = ny_ave, mapping = aes(x = i, y = median)) +
  geom_bar(stat = "identity", fill = "coral") +
  geom_errorbar(
    ymin = ny_ave$lower,
    ymax = ny_ave$upper,
    width = .1,
    color = "blue"
  )

# Limit the value scale to 0-750, flip the axes and remove all grid lines.
p8 +
  ylim(c(0, 750)) +
  coord_flip() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
# Revenue divided by Employees for every complete row, stored as a new
# column, then shown as one boxplot per industry with the axes flipped.
all_inc[["rev_per_employ"]] <- all_inc[["Revenue"]] / all_inc[["Employees"]]
p9 <- ggplot(data = all_inc, mapping = aes(x = Industry, y = rev_per_employ))
p9 +
  geom_boxplot() +
  coord_flip()
# (Re)compute Revenue / Employees per row and draw the per-industry boxplots
# with flipped axes.
all_inc$rev_per_employ <- with(all_inc, Revenue / Employees)
p10 <- ggplot(data = all_inc, mapping = aes(x = Industry, y = rev_per_employ))
p10 +
  geom_boxplot() +
  coord_flip()
# Density of rev_per_employ, one facet per industry.
p11 <- ggplot(data = all_inc, mapping = aes(x = rev_per_employ)) +
  geom_density() +
  facet_wrap(~ Industry)

# Show the x axis on a log10 scale with breaks at 1e4, 1e5, 1e6 and 1e7.
p11 + scale_x_log10(breaks = c(10000, 100000, 1000000, 10000000))