# Install the packages this analysis depends on. Only packages that are not
# already available are installed, so re-running the script does not
# re-download everything each time.
required_packages <- c("dplyr", "ggplot2", "RCurl")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "https://CRAN.R-project.org")
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(RCurl)
## Loading required package: bitops
# Load the inc5000 data set directly from the course repository on GitHub.
# The first row of the CSV holds the column names.
inc <- read.csv(
  file = "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv",
  header = TRUE
)
# Preview the first rows to confirm the columns were parsed as expected.
head(inc)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
# Print per-column summaries of the data set: quantiles and NA counts for
# numeric columns, and counts per value for the categorical ones.
summary(inc)
## Rank Name Growth_Rate
## Min. : 1 (Add)ventures : 1 Min. : 0.340
## 1st Qu.:1252 @Properties : 1 1st Qu.: 0.770
## Median :2502 1-Stop Translation USA: 1 Median : 1.420
## Mean :2502 110 Consulting : 1 Mean : 4.612
## 3rd Qu.:3751 11thStreetCoffee.com : 1 3rd Qu.: 3.290
## Max. :5000 123 Exteriors : 1 Max. :421.480
## (Other) :4995
## Revenue Industry Employees
## Min. :2.000e+06 IT Services : 733 Min. : 1.0
## 1st Qu.:5.100e+06 Business Products & Services: 482 1st Qu.: 25.0
## Median :1.090e+07 Advertising & Marketing : 471 Median : 53.0
## Mean :4.822e+07 Health : 355 Mean : 232.7
## 3rd Qu.:2.860e+07 Software : 342 3rd Qu.: 132.0
## Max. :1.010e+10 Financial Services : 260 Max. :66803.0
## (Other) :2358 NA's :12
## City State
## New York : 160 CA : 701
## Chicago : 90 TX : 387
## Austin : 88 NY : 311
## Houston : 76 VA : 283
## San Francisco: 75 FL : 282
## Atlanta : 74 IL : 273
## (Other) :4438 (Other):2764
# Count how many companies each state has. Every row gets a Count of 1 so the
# per-state total is simply the sum of that column.
subset1 <- inc[, c("Name", "State")]
subset1[["Count"]] <- 1
# View(subset1)
Subset1_state <- summarise(group_by(subset1, State), Number = sum(Count))
# View(Subset1_state)

# Horizontal bar chart: one bar per state, bar length = number of companies.
g <- ggplot(data = Subset1_state, mapping = aes(x = State, y = Number))
g +
  geom_bar(
    stat = "identity",
    width = 0.5,
    fill = "tomato2",
    position = position_stack(reverse = TRUE)
  ) +
  coord_flip() +
  labs(
    title = "Bar Chart",
    subtitle = "Number of companies in each state",
    caption = "Source: inc5000_data.csv"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

# Order the states from most to fewest companies and keep the row in third
# place.
Rank <- Subset1_state %>%
  arrange(desc(Number))
Top_Number_3 <- Rank[3, ]
# View(Top_Number_3)
# Keep only the New York companies, with the columns needed to compare
# employment across industries.
Subset2 <- subset(
  inc,
  subset = State == "NY",
  select = c(Industry, Employees, State)
)
#View(Subset2)
# Use complete.cases() to check that the subset has no missing values.
# all() collapses the per-row result into a single TRUE/FALSE answer instead
# of printing one logical value for each of the rows.
all(complete.cases(Subset2))
## [1] TRUE
# Summarise the number of employees per industry for the New York companies.
# na.rm = TRUE keeps a single missing employee count from turning a whole
# industry's statistics into NA; with complete data the results are unchanged.
Subset2_Employment <- Subset2 %>%
  group_by(Industry) %>%
  summarise(
    mean = mean(Employees, na.rm = TRUE),
    median = median(Employees, na.rm = TRUE),
    sd = sd(Employees, na.rm = TRUE)
  )
#View(Subset2_Employment)

# Bar chart of the mean number of employees in each industry.
p <- ggplot(Subset2_Employment, aes(Industry, mean))
p +
  geom_bar(stat = "identity", width = 0.5, fill = "tomato2") +
  labs(
    title = "Bar Chart",
    subtitle = "Average employment in different industries in New York",
    caption = "Source: inc5000_data.csv"
  ) +
  # Tilt the industry names so the long labels do not overlap.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
# Work out revenue per employee for each industry, using only the rows where
# industry, employee count and revenue are all present.
Subset3 <- inc[, c("Industry", "Employees", "Revenue")]
Subset3 <- Subset3[complete.cases(Subset3), ]

# Total revenue and total head count per industry.
Subset_Revenue_Employee <- summarise(
  group_by(Subset3, Industry),
  Total_Revenue = sum(Revenue),
  Total_Number_Employee = sum(Employees)
)
# View(Subset_Revenue_Employee)

# Revenue per employee = industry revenue divided by industry head count.
Subset_Revenue_Per_Employee <- transform(
  Subset_Revenue_Employee,
  Revenue_per_Employee = Total_Revenue / Total_Number_Employee
)
# View(Subset_Revenue_Per_Employee)

# Horizontal bar chart: one bar per industry, bar length = revenue per
# employee.
r <- ggplot(
  data = Subset_Revenue_Per_Employee,
  mapping = aes(x = Industry, y = Revenue_per_Employee)
)
r +
  geom_bar(
    stat = "identity",
    width = 0.5,
    fill = "tomato2",
    position = position_stack(reverse = TRUE)
  ) +
  coord_flip() +
  labs(
    title = "Bar Chart",
    subtitle = "The revenue per employee in different industries",
    caption = "Source: inc5000_data.csv"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))