\(~\)
\(~\)
# Load the inc5000 data set straight from the course repository on GitHub.
# The first row of the CSV holds the column names.
inc <- read.csv(
  "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv",
  header = TRUE
)

# Preview the first six rows (console output follows as `##` comments).
head(inc)
##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX
# Column-by-column summary of the full data set (console output follows).
# Note in the output that Employees is the only column reporting NA's (12).
summary(inc)
##       Rank          Name            Growth_Rate         Revenue         
##  Min.   :   1   Length:5001        Min.   :  0.340   Min.   :2.000e+06  
##  1st Qu.:1252   Class :character   1st Qu.:  0.770   1st Qu.:5.100e+06  
##  Median :2502   Mode  :character   Median :  1.420   Median :1.090e+07  
##  Mean   :2502                      Mean   :  4.612   Mean   :4.822e+07  
##  3rd Qu.:3751                      3rd Qu.:  3.290   3rd Qu.:2.860e+07  
##  Max.   :5000                      Max.   :421.480   Max.   :1.010e+10  
##                                                                         
##    Industry           Employees           City              State          
##  Length:5001        Min.   :    1.0   Length:5001        Length:5001       
##  Class :character   1st Qu.:   25.0   Class :character   Class :character  
##  Mode  :character   Median :   53.0   Mode  :character   Mode  :character  
##                     Mean   :  232.7                                        
##                     3rd Qu.:  132.0                                        
##                     Max.   :66803.0                                        
##                     NA's   :12
\(~\)
# Loading libraries
library(tidyverse)
library(ggplot2)
library(psych)# Insert your code here, create more chunks as necessary
# Overview of what the data looks like: 5,001 rows and 8 columns, along with
# each column's name, type, and first few values (console output follows).
glimpse(inc)
## Rows: 5,001
## Columns: 8
## $ Rank        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Name        <chr> "Fuhu", "FederalConference.com", "The HCI Group", "Bridger…
## $ Growth_Rate <dbl> 421.48, 248.31, 245.45, 233.08, 213.37, 179.38, 174.04, 17…
## $ Revenue     <dbl> 1.179e+08, 4.960e+07, 2.550e+07, 1.900e+09, 8.700e+07, 4.5…
## $ Industry    <chr> "Consumer Products & Services", "Government Services", "He…
## $ Employees   <int> 104, 51, 132, 50, 220, 63, 27, 75, 97, 15, 149, 165, 250, …
## $ City        <chr> "El Segundo", "Dumfries", "Jacksonville", "Addison", "Bost…
## $ State       <chr> "CA", "VA", "FL", "TX", "MA", "TX", "TN", "CA", "UT", "RI"…
# Look deeper into the data set with describe() from psych (console output
# follows). The starred rows in the output (Name*, Industry*, City*, State*)
# are the character columns reported by glimpse().
describe(inc)
##             vars    n        mean           sd    median     trimmed
## Rank           1 5001     2501.64      1443.51 2.502e+03     2501.73
## Name*          2 5001     2501.00      1443.81 2.501e+03     2501.00
## Growth_Rate    3 5001        4.61        14.12 1.420e+00        2.14
## Revenue        4 5001 48222535.49 240542281.14 1.090e+07 17334966.26
## Industry*      5 5001       12.10         7.33 1.300e+01       12.05
## Employees      6 4989      232.72      1353.13 5.300e+01       81.78
## City*          7 5001      732.00       441.12 7.610e+02      731.74
## State*         8 5001       24.80        15.64 2.300e+01       24.44
##                     mad     min        max      range  skew kurtosis         se
## Rank            1853.25 1.0e+00 5.0000e+03 4.9990e+03  0.00    -1.20      20.41
## Name*           1853.25 1.0e+00 5.0010e+03 5.0000e+03  0.00    -1.20      20.42
## Growth_Rate        1.22 3.4e-01 4.2148e+02 4.2114e+02 12.55   242.34       0.20
## Revenue     10674720.00 2.0e+06 1.0100e+10 1.0098e+10 22.17   722.66 3401441.44
## Industry*          8.90 1.0e+00 2.5000e+01 2.4000e+01 -0.10    -1.18       0.10
## Employees         53.37 1.0e+00 6.6803e+04 6.6802e+04 29.81  1268.67      19.16
## City*            604.90 1.0e+00 1.5190e+03 1.5180e+03 -0.04    -1.26       6.24
## State*            19.27 1.0e+00 5.2000e+01 5.1000e+01  0.12    -1.46       0.22
\(~\)
# Answer Question 1 here
# Number of companies per state, sorted from most to fewest.
# count() groups by State on its own, so no separate group_by() is needed, and
# sort = TRUE orders the result by descending n. The result has two columns:
# State and n.
ques_1 <- inc %>%
  count(State, sort = TRUE) %>%
  as_tibble() # no argument: the piped-in data frame is the only input
  
# Horizontal bar chart of the number of companies per state.
# reorder(State, n) sorts the bars by count; coord_flip() puts the states on
# the vertical axis so all the labels stay readable.
ggplot(ques_1, aes(x = reorder(State, n), y = n)) +
  geom_col() + # bar height is taken directly from n
  theme_classic() +
  coord_flip() +
  xlab("State") +
  ylab("Number of Companies") +
  ggtitle("Number of Companies by State") +
  # Print each count in white just inside the end of its bar.
  geom_text(
    aes(label = n),
    vjust = 0.6,
    hjust = 1.2,
    size = 2,
    color = "white"
  )
# complete.cases() function.) In addition to this, your graph
# should show how variable the ranges are, and you should deal with
# outliers.
# Answer Question 2 here
# Question 1 showed that NY is the state with the third-most companies, so
# keep only the NY rows for the rest of this question.
ny_state <- inc %>%
  filter(State == "NY")

# Summary of the NY subset (console output follows).
summary(ny_state)
##       Rank          Name            Growth_Rate        Revenue         
##  Min.   :  26   Length:311         Min.   : 0.350   Min.   :2.000e+06  
##  1st Qu.:1186   Class :character   1st Qu.: 0.670   1st Qu.:4.300e+06  
##  Median :2702   Mode  :character   Median : 1.310   Median :8.800e+06  
##  Mean   :2612                      Mean   : 4.371   Mean   :5.872e+07  
##  3rd Qu.:4005                      3rd Qu.: 3.580   3rd Qu.:2.570e+07  
##  Max.   :4981                      Max.   :84.430   Max.   :4.600e+09  
##    Industry           Employees           City              State          
##  Length:311         Min.   :    1.0   Length:311         Length:311        
##  Class :character   1st Qu.:   21.0   Class :character   Class :character  
##  Mode  :character   Median :   45.0   Mode  :character   Mode  :character  
##                     Mean   :  271.3                                        
##                     3rd Qu.:  105.5                                        
##                     Max.   :32000.0
# Summary of the whole data set, used as the baseline to compare NY against
# (console output follows).
summary(inc)
##       Rank          Name            Growth_Rate         Revenue         
##  Min.   :   1   Length:5001        Min.   :  0.340   Min.   :2.000e+06  
##  1st Qu.:1252   Class :character   1st Qu.:  0.770   1st Qu.:5.100e+06  
##  Median :2502   Mode  :character   Median :  1.420   Median :1.090e+07  
##  Mean   :2502                      Mean   :  4.612   Mean   :4.822e+07  
##  3rd Qu.:3751                      3rd Qu.:  3.290   3rd Qu.:2.860e+07  
##  Max.   :5000                      Max.   :421.480   Max.   :1.010e+10  
##                                                                         
##    Industry           Employees           City              State          
##  Length:5001        Min.   :    1.0   Length:5001        Length:5001       
##  Class :character   1st Qu.:   25.0   Class :character   Class :character  
##  Mode  :character   Median :   53.0   Mode  :character   Mode  :character  
##                     Mean   :  232.7                                        
##                     3rd Qu.:  132.0                                        
##                     Max.   :66803.0                                        
##                     NA's   :12
# Employment by industry in NY.
# Keep only rows with no missing values (via complete.cases()) and the two
# columns the plot needs. No group_by() here: nothing below summarises by
# group, and geom_boxplot() already draws one box per value of the x aesthetic.
ques_2a <- ny_state %>%
  filter(complete.cases(.)) %>%
  select(Industry, Employees)

# Boxplot of employee counts per industry in NY.
# coord_cartesian() only zooms the y-axis to 0-1500; it does not drop rows, so
# the boxes are still computed from all companies while the extreme outliers
# fall outside the visible range.
ggplot(ques_2a, mapping = aes(x = Industry, y = Employees)) +
  geom_boxplot() +
  theme_classic() +
  labs(
    title = "Distribution of Employment by Industry in NY",
    x = "Industry",
    y = "Number of Employees"
  ) +
  coord_cartesian(ylim = c(0, 1500)) +
  # Rotate the industry names so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# comparison plot of the country
# Employment by industry for the whole country, for comparison with NY.
# Keep only rows with no missing values (via complete.cases()) and the two
# columns the plot needs. No group_by() here: nothing below summarises by
# group, and geom_boxplot() already draws one box per value of the x aesthetic.
ques_2b <- inc %>%
  filter(complete.cases(.)) %>%
  select(Industry, Employees)

# Boxplot of employee counts per industry across the country.
# coord_cartesian() only zooms the y-axis to 0-1500; it does not drop rows, so
# the boxes are still computed from all companies while the extreme outliers
# fall outside the visible range.
ggplot(ques_2b, mapping = aes(x = Industry, y = Employees)) +
  geom_boxplot() +
  theme_classic() +
  labs(
    title = "Distribution of Employment by Industry in the Country",
    x = "Industry",
    y = "Number of Employees"
  ) +
  coord_cartesian(ylim = c(0, 1500)) +
  # Rotate the industry names so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Answer Question 3 here
# Turn off scientific notation so revenue figures are printed in full.
options(scipen = 999)

# Revenue per employee by industry, NY only.
# Rows with a missing Industry, Revenue, or Employees value are dropped BEFORE
# aggregating. Summing first and calling na.omit() afterwards would let a
# single missing employee count turn the whole industry's total into NA and
# remove that industry from the chart.
ques_3a <- ny_state %>%
  drop_na(Industry, Revenue, Employees) %>%
  group_by(Industry) %>%
  summarize(
    total_rev = sum(Revenue),
    total_emp = sum(Employees),
    rev_per_emp = total_rev / total_emp # industry total, not a mean of ratios
  ) %>%
  arrange(desc(rev_per_emp))

# Horizontal bar chart, industries ordered by revenue per employee.
ggplot(ques_3a, aes(x = reorder(Industry, rev_per_emp), y = rev_per_emp)) +
  geom_col() + # bar length is taken directly from rev_per_emp
  labs(
    title = "Revenue per Employee by Industry in NY",
    x = "Industry",
    y = "Revenue per Employee"
  ) +
  theme_classic() +
  coord_flip()
# Revenue per employee by industry for the country as a whole.
# Employees has missing values in the full data set (summary(inc) reports 12
# NA's). Rows with a missing Industry, Revenue, or Employees value are dropped
# BEFORE aggregating: summing first and calling na.omit() afterwards turns the
# total of every industry containing a missing count into NA and silently
# removes that whole industry from the chart.
ques_3b <- inc %>%
  drop_na(Industry, Revenue, Employees) %>%
  group_by(Industry) %>%
  summarize(
    total_rev = sum(Revenue),
    total_emp = sum(Employees),
    rev_per_emp = total_rev / total_emp # industry total, not a mean of ratios
  ) %>%
  arrange(desc(rev_per_emp))

# Horizontal bar chart, industries ordered by revenue per employee.
ggplot(ques_3b, aes(x = reorder(Industry, rev_per_emp), y = rev_per_emp)) +
  geom_col() + # bar length is taken directly from rev_per_emp
  labs(
    title = "Revenue per Employee by Industry in the Country",
    x = "Industry",
    y = "Revenue per Employee"
  ) +
  theme_classic() +
  coord_flip()