R Notebook Homework 1

Principles of Data Visualization and Introduction to ggplot2

I have provided you with data about the 5,000 fastest growing companies in the US, as compiled by Inc. magazine. Let's read this in:

# Download the Inc. 5000 company data from the course repository;
# the first row of the CSV holds the column names.
inc <- read.csv(
  file = "https://raw.githubusercontent.com/charleyferrari/CUNY_DATA_608/master/module1/Data/inc5000_data.csv",
  header = TRUE
)

And let's preview this data:

# Show the first six rows to check the column names and value formats.
head(inc, n = 6L)

##   Rank                         Name Growth_Rate   Revenue
## 1    1                         Fuhu      421.48 1.179e+08
## 2    2        FederalConference.com      248.31 4.960e+07
## 3    3                The HCI Group      245.45 2.550e+07
## 4    4                      Bridger      233.08 1.900e+09
## 5    5                       DataXu      213.37 8.700e+07
## 6    6 MileStone Community Builders      179.38 4.570e+07
##                       Industry Employees         City State
## 1 Consumer Products & Services       104   El Segundo    CA
## 2          Government Services        51     Dumfries    VA
## 3                       Health       132 Jacksonville    FL
## 4                       Energy        50      Addison    TX
## 5      Advertising & Marketing       220       Boston    MA
## 6                  Real Estate        63       Austin    TX

# Column-by-column summary of the data set (quantiles for numeric columns,
# length and class for character columns).
summary(object = inc)

##       Rank          Name            Growth_Rate         Revenue         
##  Min.   :   1   Length:5001        Min.   :  0.340   Min.   :2.000e+06  
##  1st Qu.:1252   Class :character   1st Qu.:  0.770   1st Qu.:5.100e+06  
##  Median :2502   Mode  :character   Median :  1.420   Median :1.090e+07  
##  Mean   :2502                      Mean   :  4.612   Mean   :4.822e+07  
##  3rd Qu.:3751                      3rd Qu.:  3.290   3rd Qu.:2.860e+07  
##  Max.   :5000                      Max.   :421.480   Max.   :1.010e+10  
##                                                                         
##    Industry           Employees           City              State          
##  Length:5001        Min.   :    1.0   Length:5001        Length:5001       
##  Class :character   1st Qu.:   25.0   Class :character   Class :character  
##  Mode  :character   Median :   53.0   Mode  :character   Mode  :character  
##                     Mean   :  232.7                                        
##                     3rd Qu.:  132.0                                        
##                     Max.   :66803.0                                        
##                     NA's   :12

Think a bit on what these summaries mean. Use the space below to add some more relevant non-visual exploratory information you think helps you understand this data:

Industries with the most fast-growing companies (in descending order)

# Insert your code here, create more chunks as necessary

# Descriptive statistics for the employee head count of each company.
describe(inc[["Employees"]])

##    vars    n   mean      sd median trimmed   mad min   max range  skew kurtosis
## X1    1 4989 232.72 1353.13     53   81.78 53.37   1 66803 66802 29.81  1268.67
##       se
## X1 19.16

# Number of companies per industry, largest industry first.
I <- inc %>%
  count(Industry) %>%
  arrange(desc(n)) %>%
  as.data.frame()

# Render the counts as a scrollable table.
I %>%
  kable() %>%
  scroll_box(height = "300px")

Industry	n
IT Services	733
Business Products & Services	482
Advertising & Marketing	471
Health	355
Software	342
Financial Services	260
Manufacturing	256
Consumer Products & Services	203
Retail	203
Government Services	202
Human Resources	196
Construction	187
Logistics & Transportation	155
Food & Beverage	131
Telecommunications	129
Energy	109
Real Estate	96
Education	83
Engineering	74
Security	73
Travel & Hospitality	62
Media	54
Environmental Services	51
Insurance	50
Computer Hardware	44

# Share of all companies that falls into each industry.
prop.table(
  table(inc[["Industry"]])
)

## 
##      Advertising & Marketing Business Products & Services 
##                   0.09418116                   0.09638072 
##            Computer Hardware                 Construction 
##                   0.00879824                   0.03739252 
## Consumer Products & Services                    Education 
##                   0.04059188                   0.01659668 
##                       Energy                  Engineering 
##                   0.02179564                   0.01479704 
##       Environmental Services           Financial Services 
##                   0.01019796                   0.05198960 
##              Food & Beverage          Government Services 
##                   0.02619476                   0.04039192 
##                       Health              Human Resources 
##                   0.07098580                   0.03919216 
##                    Insurance                  IT Services 
##                   0.00999800                   0.14657069 
##   Logistics & Transportation                Manufacturing 
##                   0.03099380                   0.05118976 
##                        Media                  Real Estate 
##                   0.01079784                   0.01919616 
##                       Retail                     Security 
##                   0.04059188                   0.01459708 
##                     Software           Telecommunications 
##                   0.06838632                   0.02579484 
##         Travel & Hospitality 
##                   0.01239752

States with the most fast-growing companies (in descending order)

# Number of companies per state, largest state first.
S <- inc %>%
  count(State) %>%
  arrange(desc(n)) %>%
  as.data.frame()

# Render the counts as a scrollable table.
S %>%
  kable() %>%
  scroll_box(height = "300px")

State	n
CA	701
TX	387
NY	311
VA	283
FL	282
IL	273
GA	212
OH	186
MA	182
PA	164
NJ	158
NC	137
CO	134
MD	131
WA	130
MI	126
AZ	100
UT	95
MN	88
TN	82
WI	79
IN	69
MO	59
AL	51
CT	50
OR	49
SC	48
OK	46
DC	43
KY	40
KS	38
LA	37
IA	28
NE	27
NV	26
NH	24
ID	17
DE	16
RI	16
ME	13
MS	12
ND	10
AR	9
HI	7
VT	6
NM	5
MT	4
SD	3
AK	2
WV	2
WY	2
PR	1

# Share of all companies that is located in each state.
prop.table(
  table(inc[["State"]])
)

## 
##         AK         AL         AR         AZ         CA         CO         CT 
## 0.00039992 0.01019796 0.00179964 0.01999600 0.14017197 0.02679464 0.00999800 
##         DC         DE         FL         GA         HI         IA         ID 
## 0.00859828 0.00319936 0.05638872 0.04239152 0.00139972 0.00559888 0.00339932 
##         IL         IN         KS         KY         LA         MA         MD 
## 0.05458908 0.01379724 0.00759848 0.00799840 0.00739852 0.03639272 0.02619476 
##         ME         MI         MN         MO         MS         MT         NC 
## 0.00259948 0.02519496 0.01759648 0.01179764 0.00239952 0.00079984 0.02739452 
##         ND         NE         NH         NJ         NM         NV         NY 
## 0.00199960 0.00539892 0.00479904 0.03159368 0.00099980 0.00519896 0.06218756 
##         OH         OK         OR         PA         PR         RI         SC 
## 0.03719256 0.00919816 0.00979804 0.03279344 0.00019996 0.00319936 0.00959808 
##         SD         TN         TX         UT         VA         VT         WA 
## 0.00059988 0.01639672 0.07738452 0.01899620 0.05658868 0.00119976 0.02599480 
##         WI         WV         WY 
## 0.01579684 0.00039992 0.00039992

Question 1

Create a graph that shows the distribution of companies in the dataset by State (i.e., how many are in each state). There are a lot of States, so consider which axis you should use. This visualization is ultimately going to be consumed on a ‘portrait’ oriented screen (i.e., taller than wide), which should further guide your layout choices.

# Answer Question 1 here

# Number of companies per state, ordered from fewest to most.
X <- inc %>%
  group_by(State) %>%
  summarise(counts = n()) %>%
  arrange(counts)

# Horizontal bars (states listed down the vertical axis) suit a portrait
# screen; every bar is labelled with its company count.
ggplot(X, aes(x = reorder(State, counts), y = counts)) +
  geom_col(fill = 'light blue', color = 'blue') +
  geom_text(aes(label = counts), vjust = 0.5, hjust = -0.1, size = 2.75) +
  coord_flip() +
  labs(
    title = "Distribution of companies in the dataset by State",
    x = "State",
    y = "Company Count"
  )

Question 2

Let's dig in on the state with the 3rd most companies in the data set. Imagine you work for the state and are interested in how many people are employed by companies in different industries. Create a plot that shows the average and/or median employment by industry for companies in this state (only use cases with full data, use R’s complete.cases() function.) In addition to this, your graph should show how variable the ranges are, and you should deal with outliers.

# Answer Question 2 here

# Mean (A) and median (M) employee count per industry for New York
# companies, using only rows that have no missing values.
NY <- inc %>%
  filter(complete.cases(.), State == "NY") %>%
  group_by(Industry) %>%
  summarise(A = mean(Employees), M = median(Employees))

# Render the per-industry statistics as a scrollable table.
NY %>%
  kable() %>%
  scroll_box(height = "300px")

Industry	A	M
Advertising & Marketing	58.43860	38.0
Business Products & Services	1492.46154	70.5
Computer Hardware	44.00000	44.0
Construction	61.00000	24.5
Consumer Products & Services	626.29412	25.0
Education	59.85714	50.5
Energy	129.20000	120.0
Engineering	53.50000	54.5
Environmental Services	155.00000	155.0
Financial Services	144.30769	81.0
Food & Beverage	76.44444	41.0
Government Services	17.00000	17.0
Health	81.84615	45.0
Human Resources	437.54545	56.0
Insurance	32.50000	32.5
IT Services	204.09302	54.0
Logistics & Transportation	29.50000	23.5
Manufacturing	73.30769	30.0
Media	108.00000	45.0
Real Estate	18.25000	18.0
Retail	24.78571	13.5
Security	135.00000	32.5
Software	245.92308	80.0
Telecommunications	95.35294	31.0
Travel & Hospitality	547.71429	61.0

# Compare mean (A) and median (M) employment per industry for New York.
# The two statistics are drawn side by side: the default stacking would
# place one on top of the other, so each bar's total length would be the
# meaningless sum mean + median and the upper segment could not be read
# against the axis.
ggplot(
  melt(NY, id.vars = "Industry"),
  aes(x = Industry, y = value, fill = variable)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  ylab("Employees") +
  ggtitle("New York Employees per Industry")

To see the outliers properly, a boxplot can be used.

# New York companies with complete records, grouped by industry.
NY_X <- inc %>%
  filter(complete.cases(.), State == "NY") %>%
  group_by(Industry)

# Box plot of employee counts per industry. The Employees axis is limited
# to 0-1500 so the boxes stay readable, and outlier points are drawn in red.
ggplot(NY_X, aes(x = Industry, y = Employees)) +
  geom_boxplot(width = 0.5, fill = 'blue', outlier.colour = "red") +
  coord_flip(ylim = c(0, 1500), expand = TRUE)

Question 3

Now imagine you work for an investor and want to see which industries generate the most revenue per employee. Create a chart that makes this information clear. Once again, the distribution per industry should be shown.

# Answer Question 3 here

# Revenue per employee (R) for each New York company, then the mean (A)
# and median (M) of that ratio within each industry.
NY_R <- NY_X %>%
  mutate(R = Revenue / Employees) %>%
  group_by(Industry) %>%
  summarise(A = mean(R), M = median(R))

# Render the per-industry statistics as a scrollable table.
NY_R %>%
  kable() %>%
  scroll_box(height = "300px")

Industry	A	M
Advertising & Marketing	373403.5	255555.6
Business Products & Services	527816.9	203148.1
Computer Hardware	520454.5	520454.5
Construction	238694.5	238613.0
Consumer Products & Services	382942.6	222222.2
Education	112060.6	114848.5
Energy	8472533.5	283211.7
Engineering	215744.7	202998.1
Environmental Services	134366.7	134366.7
Financial Services	400174.4	219607.8
Food & Beverage	174630.9	120238.1
Government Services	158823.5	158823.5
Health	532491.0	155000.0
Human Resources	337366.3	175000.0
Insurance	371000.0	371000.0
IT Services	228816.1	164285.7
Logistics & Transportation	1245870.1	996285.7
Manufacturing	665818.6	217500.0
Media	333549.6	262500.0
Real Estate	383809.5	330952.4
Retail	520790.3	305000.0
Security	153277.8	149000.0
Software	143749.0	133333.3
Telecommunications	408143.4	410714.3
Travel & Hospitality	282089.8	223333.3

# Compare mean (A) and median (M) revenue per employee by industry for
# New York. Bars are dodged rather than stacked: a stacked bar's length
# would be mean + median, which is not a meaningful quantity. The value
# axis is labelled as revenue per employee, since that ratio (not raw
# revenue) is what is plotted.
ggplot(
  melt(NY_R, id.vars = "Industry"),
  aes(x = Industry, y = value, fill = variable)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  ylab("Revenue per Employee") +
  ggtitle("New York Revenue per Employees per Industry")

State	n
CA	701
TX	387
NY	311
VA	283
FL	282
IL	273
GA	212
OH	186
MA	182
PA	164
NJ	158
NC	137
CO	134
MD	131
WA	130
MI	126
AZ	100
UT	95
MN	88
TN	82
WI	79
IN	69
MO	59
AL	51
CT	50
OR	49
SC	48
OK	46
DC	43
KY	40
KS	38
LA	37
IA	28
NE	27
NV	26
NH	24
ID	17
DE	16
RI	16
ME	13
MS	12
ND	10
AR	9
HI	7
VT	6
NM	5
MT	4
SD	3
AK	2
WV	2
WY	2
PR	1

State	n
CA	701
TX	387
NY	311
VA	283
FL	282
IL	273
GA	212
OH	186
MA	182
PA	164
NJ	158
NC	137
CO	134
MD	131
WA	130
MI	126
AZ	100
UT	95
MN	88
TN	82
WI	79
IN	69
MO	59
AL	51
CT	50
OR	49
SC	48
OK	46
DC	43
KY	40
KS	38
LA	37
IA	28
NE	27
NV	26
NH	24
ID	17
DE	16
RI	16
ME	13
MS	12
ND	10
AR	9
HI	7
VT	6
NM	5
MT	4
SD	3
AK	2
WV	2
WY	2
PR	1

State	n
CA	701
TX	387
NY	311
VA	283
FL	282
IL	273
GA	212
OH	186
MA	182
PA	164
NJ	158
NC	137
CO	134
MD	131
WA	130
MI	126
AZ	100
UT	95
MN	88
TN	82
WI	79
IN	69
MO	59
AL	51
CT	50
OR	49
SC	48
OK	46
DC	43
KY	40
KS	38
LA	37
IA	28
NE	27
NV	26
NH	24
ID	17
DE	16
RI	16
ME	13
MS	12
ND	10
AR	9
HI	7
VT	6
NM	5
MT	4
SD	3
AK	2
WV	2
WY	2
PR	1