\(~\)
\(~\)
# Load the inc5000 data set from the course GitHub repository.
# The first row of the CSV holds the column names.
inc <- read.csv(
  "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv",
  header = TRUE
)
\(~\)
# Preview the first rows to check that the columns were parsed as expected.
head(inc)
## Rank Name Growth_Rate Revenue
## 1 1 Fuhu 421.48 1.179e+08
## 2 2 FederalConference.com 248.31 4.960e+07
## 3 3 The HCI Group 245.45 2.550e+07
## 4 4 Bridger 233.08 1.900e+09
## 5 5 DataXu 213.37 8.700e+07
## 6 6 MileStone Community Builders 179.38 4.570e+07
## Industry Employees City State
## 1 Consumer Products & Services 104 El Segundo CA
## 2 Government Services 51 Dumfries VA
## 3 Health 132 Jacksonville FL
## 4 Energy 50 Addison TX
## 5 Advertising & Marketing 220 Boston MA
## 6 Real Estate 63 Austin TX
# Per-column summary statistics; Employees is the only column reporting NA's.
summary(inc)
## Rank Name Growth_Rate Revenue
## Min. : 1 Length:5001 Min. : 0.340 Min. :2.000e+06
## 1st Qu.:1252 Class :character 1st Qu.: 0.770 1st Qu.:5.100e+06
## Median :2502 Mode :character Median : 1.420 Median :1.090e+07
## Mean :2502 Mean : 4.612 Mean :4.822e+07
## 3rd Qu.:3751 3rd Qu.: 3.290 3rd Qu.:2.860e+07
## Max. :5000 Max. :421.480 Max. :1.010e+10
##
## Industry Employees City State
## Length:5001 Min. : 1.0 Length:5001 Length:5001
## Class :character 1st Qu.: 25.0 Class :character Class :character
## Mode :character Median : 53.0 Mode :character Mode :character
## Mean : 232.7
## 3rd Qu.: 132.0
## Max. :66803.0
## NA's :12
\(~\)
# Loading libraries
library(tidyverse)
library(ggplot2)
library(psych)
# Insert your code here, create more chunks as necessary
# glimpse() gives a compact overview of the data: 5,001 rows and 8 columns,
# listing each column's name, type, and first few values.
glimpse(inc)
## Rows: 5,001
## Columns: 8
## $ Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Name <chr> "Fuhu", "FederalConference.com", "The HCI Group", "Bridger…
## $ Growth_Rate <dbl> 421.48, 248.31, 245.45, 233.08, 213.37, 179.38, 174.04, 17…
## $ Revenue <dbl> 1.179e+08, 4.960e+07, 2.550e+07, 1.900e+09, 8.700e+07, 4.5…
## $ Industry <chr> "Consumer Products & Services", "Government Services", "He…
## $ Employees <int> 104, 51, 132, 50, 220, 63, 27, 75, 97, 15, 149, 165, 250, …
## $ City <chr> "El Segundo", "Dumfries", "Jacksonville", "Addison", "Bost…
## $ State <chr> "CA", "VA", "FL", "TX", "MA", "TX", "TN", "CA", "UT", "RI"…
# Look deeper into the data set with describe(), which reports n, mean, sd,
# median, trimmed, mad, min, max, range, skew, kurtosis, and se per column.
describe(inc)
## vars n mean sd median trimmed
## Rank 1 5001 2501.64 1443.51 2.502e+03 2501.73
## Name* 2 5001 2501.00 1443.81 2.501e+03 2501.00
## Growth_Rate 3 5001 4.61 14.12 1.420e+00 2.14
## Revenue 4 5001 48222535.49 240542281.14 1.090e+07 17334966.26
## Industry* 5 5001 12.10 7.33 1.300e+01 12.05
## Employees 6 4989 232.72 1353.13 5.300e+01 81.78
## City* 7 5001 732.00 441.12 7.610e+02 731.74
## State* 8 5001 24.80 15.64 2.300e+01 24.44
## mad min max range skew kurtosis se
## Rank 1853.25 1.0e+00 5.0000e+03 4.9990e+03 0.00 -1.20 20.41
## Name* 1853.25 1.0e+00 5.0010e+03 5.0000e+03 0.00 -1.20 20.42
## Growth_Rate 1.22 3.4e-01 4.2148e+02 4.2114e+02 12.55 242.34 0.20
## Revenue 10674720.00 2.0e+06 1.0100e+10 1.0098e+10 22.17 722.66 3401441.44
## Industry* 8.90 1.0e+00 2.5000e+01 2.4000e+01 -0.10 -1.18 0.10
## Employees 53.37 1.0e+00 6.6803e+04 6.6802e+04 29.81 1268.67 19.16
## City* 604.90 1.0e+00 1.5190e+03 1.5180e+03 -0.04 -1.26 6.24
## State* 19.27 1.0e+00 5.2000e+01 5.1000e+01 0.12 -1.46 0.22
\(~\)
# Answer Question 1 here
# Count the companies in each state and sort the counts in descending order.
# count() groups by State itself, so no separate group_by() is needed, and
# as_tibble() is called without extra arguments (ques_1 does not exist yet
# while the pipeline that creates it is running).
ques_1 <- inc %>%
  count(State) %>%
  arrange(desc(n)) %>%
  as_tibble()
# Horizontal bar chart of the number of companies per state. States are
# ordered by their count and each bar carries its count as a white label.
ggplot(data = ques_1, mapping = aes(x = reorder(State, n), y = n)) +
  geom_col() +
  geom_text(
    mapping = aes(label = n),
    colour = "white",
    size = 2,
    hjust = 1.2,
    vjust = 0.6
  ) +
  coord_flip() +
  labs(
    title = "Number of Companies by State",
    x = "State",
    y = "Number of Companies"
  ) +
  theme_classic()
\(~\)
(Use R's `complete.cases()` function.) In addition to this, your graph
should show how variable the ranges are, and you should deal with outliers.
# Answer Question 2 here
# Question 1 shows that NY is the state with the third-most companies, so
# keep only the NY rows for this question.
ny_state <- filter(inc, State == "NY")
# Summary statistics for the NY subset.
summary(ny_state)
## Rank Name Growth_Rate Revenue
## Min. : 26 Length:311 Min. : 0.350 Min. :2.000e+06
## 1st Qu.:1186 Class :character 1st Qu.: 0.670 1st Qu.:4.300e+06
## Median :2702 Mode :character Median : 1.310 Median :8.800e+06
## Mean :2612 Mean : 4.371 Mean :5.872e+07
## 3rd Qu.:4005 3rd Qu.: 3.580 3rd Qu.:2.570e+07
## Max. :4981 Max. :84.430 Max. :4.600e+09
## Industry Employees City State
## Length:311 Min. : 1.0 Length:311 Length:311
## Class :character 1st Qu.: 21.0 Class :character Class :character
## Mode :character Median : 45.0 Mode :character Mode :character
## Mean : 271.3
## 3rd Qu.: 105.5
## Max. :32000.0
# Summarise the whole data set again as a baseline to compare NY against.
summary(inc)
## Rank Name Growth_Rate Revenue
## Min. : 1 Length:5001 Min. : 0.340 Min. :2.000e+06
## 1st Qu.:1252 Class :character 1st Qu.: 0.770 1st Qu.:5.100e+06
## Median :2502 Mode :character Median : 1.420 Median :1.090e+07
## Mean :2502 Mean : 4.612 Mean :4.822e+07
## 3rd Qu.:3751 3rd Qu.: 3.290 3rd Qu.:2.860e+07
## Max. :5000 Max. :421.480 Max. :1.010e+10
##
## Industry Employees City State
## Length:5001 Min. : 1.0 Length:5001 Length:5001
## Class :character 1st Qu.: 25.0 Class :character Class :character
## Mode :character Median : 53.0 Mode :character Mode :character
## Mean : 232.7
## 3rd Qu.: 132.0
## Max. :66803.0
## NA's :12
# Data for the NY plot: keep only rows without any missing value, then keep
# the two columns the boxplot uses. No group_by() is applied because ggplot
# groups by the x aesthetic itself and a lingering grouping would only leak
# into later dplyr verbs.
ques_2a <- ny_state %>%
  filter(complete.cases(.)) %>% # complete cases only
  select(Industry, Employees)
# Boxplot of employees per company for each industry in NY. The visible
# y-range is limited to 0-1500 so the boxes stay readable despite the very
# large outliers; x-axis labels are rotated so industry names do not overlap.
ggplot(data = ques_2a, aes(x = Industry, y = Employees)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1500)) +
  ggtitle("Distribution of Employment by Industry in NY") +
  xlab("Industry") +
  ylab("Number of Employees") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Data for the country-wide comparison plot: keep only rows without any
# missing value, then keep the two columns the boxplot uses. No group_by()
# is applied because ggplot groups by the x aesthetic itself.
ques_2b <- inc %>%
  filter(complete.cases(.)) %>% # complete cases only
  select(Industry, Employees)
# Boxplot of employees per company for each industry across the whole data
# set, using the same 0-1500 y-range as the NY plot so the two are comparable.
ggplot(data = ques_2b, aes(x = Industry, y = Employees)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1500)) +
  ggtitle("Distribution of Employment by Industry in the Country") +
  xlab("Industry") +
  ylab("Number of Employees") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
\(~\)
# turning off scientific notation
options(scipen = 999)
# Revenue per employee by industry, NY only.
# Companies with a missing Revenue or Employees value are removed BEFORE
# summing, so each industry total is built from the same set of companies and
# one missing head-count cannot turn a whole industry's total into NA.
ques_3a <- ny_state %>%
  filter(!is.na(Revenue), !is.na(Employees)) %>%
  group_by(Industry) %>%
  summarize(
    total_rev = sum(Revenue),
    total_emp = sum(Employees),
    rev_per_emp = total_rev / total_emp
  ) %>%
  arrange(desc(rev_per_emp))
# Horizontal bar chart of revenue per employee for each NY industry, with
# industries ordered by that ratio.
ggplot(data = ques_3a, mapping = aes(x = reorder(Industry, rev_per_emp), y = rev_per_emp)) +
  geom_col() +
  coord_flip() +
  ggtitle("Revenue per Employee by Industry in NY") +
  xlab("Industry") +
  ylab("Revenue per Employee") +
  theme_classic()
# Answer Question 3 here
# Revenue per employee by industry for the country as a whole.
# Employees is missing for 12 companies. Summing first and calling na.omit()
# afterwards would drop every industry that contains one of them, so the
# incomplete rows are removed BEFORE aggregating instead.
ques_3b <- inc %>%
  filter(!is.na(Revenue), !is.na(Employees)) %>%
  group_by(Industry) %>%
  summarize(
    total_rev = sum(Revenue),
    total_emp = sum(Employees),
    rev_per_emp = total_rev / total_emp
  ) %>%
  arrange(desc(rev_per_emp))
# Horizontal bar chart of revenue per employee for each industry across the
# whole data set, with industries ordered by that ratio.
ggplot(data = ques_3b, mapping = aes(x = reorder(Industry, rev_per_emp), y = rev_per_emp)) +
  geom_col() +
  coord_flip() +
  ggtitle("Revenue per Employee by Industry in the Country") +
  xlab("Industry") +
  ylab("Revenue per Employee") +
  theme_classic()