# Load the cleaned Glassdoor employee-review data.
# `stringsAsFactors = TRUE` is spelled out because R >= 4.0.0 no longer
# converts strings to factors by default, while the recorded str()/summary()
# output in this script shows the text columns as factors.
df <- read.csv("clean_employee_reviews.csv", stringsAsFactors = TRUE)
# Print the first row to check the column layout.
head(df, 1)
## X company location dates summary
## 1 1 google none 2018-12-11 Best Company to work for
## pros cons
## 1 People are smart and friendly Bureaucracy is slowing things down
## advice.to.mgmt overall.ratings work.balance.stars culture.values.stars
## 1 none 5 4 5
## carrer.opportunities.stars comp.benefit.stars senior.mangemnet.stars
## 1 5 4 5
## helpful.count
## 1 0
## link
## 1 https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm
## status position
## 1 Current Employee Anonymous Employee
# Compact overview: type and first few values of every column.
str(object = df)
## 'data.frame': 67529 obs. of 18 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ company : Factor w/ 6 levels "amazon","apple",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ location : Factor w/ 2044 levels "Aberdeen, MD",..: 1300 1231 1279 1231 1061 1231 1279 1300 1279 1300 ...
## $ dates : Factor w/ 3822 levels "2008-01-31","2008-02-05",..: 3821 1822 2145 2419 3676 3819 3821 3821 3820 3819 ...
## $ summary : Factor w/ 42652 levels "","-","-----",..: 6040 28376 16532 37964 39675 29231 35728 19977 16363 11511 ...
## $ pros : Factor w/ 66085 levels "_ Customer focused company, where you can see the impact of your day to day job on customers - makes very happy"| __truncated__,..: 49518 4365 3363 64849 27504 49503 36286 62492 34877 39522 ...
## $ cons : Factor w/ 66049 levels "_ Feels that getting promoted to the \"next level\" is unnecessary over-complicated - especially when change role.",..: 8986 4345 3412 21185 22133 16181 59557 34591 65801 46681 ...
## $ advice.to.mgmt : Factor w/ 3 levels "1","3","none": 3 3 3 3 3 3 3 3 3 3 ...
## $ overall.ratings : int 5 4 5 5 5 5 5 5 5 5 ...
## $ work.balance.stars : num 4 2 5 2 5 4 5 5 5 5 ...
## $ culture.values.stars : int 5 3 4 5 5 4 4 5 5 5 ...
## $ carrer.opportunities.stars: num 5 3 5 5 5 4 4 5 5 5 ...
## $ comp.benefit.stars : num 4 5 5 4 5 5 5 5 5 5 ...
## $ senior.mangemnet.stars : num 5 3 4 5 5 4 4 5 5 5 ...
## $ helpful.count : int 0 2094 949 498 49 1 0 0 0 0 ...
## $ link : Factor w/ 6754 levels "https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1.htm",..: 4098 4098 4098 4098 4098 4098 4098 4098 4098 4098 ...
## $ status : Factor w/ 2 levels "Current Employee",..: 1 2 1 1 2 2 1 2 1 1 ...
## $ position : Factor w/ 6821 levels "-",".NET Developer",..: 284 4211 5727 284 5718 4715 5718 284 284 284 ...
# Per-column summary: quantiles for numeric columns, top level counts for
# factors, and NA counts where present.
summary(object = df)
## X company location
## Min. : 1 amazon :26430 none :25085
## 1st Qu.:16883 apple :12950 Redmond, WA : 5613
## Median :33765 facebook : 1590 Seattle, WA : 4963
## Mean :33765 google : 7819 Hyderabad (India): 2150
## 3rd Qu.:50647 microsoft:17930 Mountain View, CA: 1706
## Max. :67529 netflix : 810 Bengaluru (India): 1607
## (Other) :26405
## dates summary
## 2017-09-19: 272 Great place to work: 725
## 2008-06-11: 126 Great company : 456
## 2017-09-18: 106 Software Engineer : 452
## 2018-08-29: 97 Specialist : 412
## 2017-09-25: 96 Good : 389
## 2017-09-22: 92 Great Company : 383
## (Other) :66740 (Other) :64712
## pros cons
## Great company to work for : 42 None that I can think of : 40
## Great people to work with : 25 None that I can think of. : 37
## Great company to work for.: 16 None I can think of : 22
## great company to work for : 13 No cons as of now : 13
## Great place to work at : 13 Nothing I can think of : 11
## Good company to work for : 10 Nothing that I can think of: 11
## (Other) :67410 (Other) :67395
## advice.to.mgmt overall.ratings work.balance.stars culture.values.stars
## 1 : 2 Min. :1.000 Min. :1.000 Min. :1.000
## 3 : 1 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:3.000
## none:67526 Median :4.000 Median :3.500 Median :4.000
## Mean :3.826 Mean :3.373 Mean :3.784
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000
## NA's :7160 NA's :13546
## carrer.opportunities.stars comp.benefit.stars senior.mangemnet.stars
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000
## Median :4.000 Median :4.000 Median :3.000
## Mean :3.634 Mean :3.943 Mean :3.323
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## NA's :7108 NA's :7161 NA's :7775
## helpful.count
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 1.268
## 3rd Qu.: 1.000
## Max. :2094.000
##
## link
## https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1.htm : 10
## https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P10.htm : 10
## https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P100.htm : 10
## https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1000.htm: 10
## https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1001.htm: 10
## https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1002.htm: 10
## (Other) :67469
## status position
## Current Employee:42540 Anonymous Employee :27002
## Former Employee :24989 Software Engineer : 1768
## Software Development Engineer: 1244
## Specialist : 1060
## Program Manager : 826
## Warehouse Associate : 681
## (Other) :34948
# Number of reviews per company (there are six companies in this dataset).
as.data.frame(table(df[["company"]]))
## Var1 Freq
## 1 amazon 26430
## 2 apple 12950
## 3 facebook 1590
## 4 google 7819
## 5 microsoft 17930
## 6 netflix 810
# Split the reviews into one data frame per company.
df_gg  <- df[df[["company"]] == "google", ]
df_amz <- df[df[["company"]] == "amazon", ]
df_apl <- df[df[["company"]] == "apple", ]
df_fb  <- df[df[["company"]] == "facebook", ]
df_ms  <- df[df[["company"]] == "microsoft", ]
df_nf  <- df[df[["company"]] == "netflix", ]
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Mean overall rating per company, drawn as horizontal bars.
df %>%
  group_by(company) %>%
  summarise(avg_rating = mean(overall.ratings)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "", title = "Overall Rating")

# Mean work-balance rating per company, drawn as horizontal bars.
df %>%
  # Drop reviews that did not rate this aspect before averaging.
  filter(!is.na(work.balance.stars)) %>%
  group_by(company) %>%
  summarise(avg_rating_work_balance_stars = mean(work.balance.stars)) %>%
  ggplot(
    aes(x = company, y = avg_rating_work_balance_stars, fill = company)
  ) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "", title = "Work Balance Rating")

# Mean culture-values rating per company, drawn as horizontal bars.
df %>%
  group_by(company) %>%
  # Drop reviews that did not rate this aspect before averaging.
  filter(!is.na(culture.values.stars)) %>%
  summarise(avg_rating = mean(culture.values.stars)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "", title = "Culture Values Rating")

# Mean career-opportunities rating per company, drawn as horizontal bars.
# The column name keeps the data set's own spelling ("carrer"); only the
# plot title shown to readers is spelled correctly.
df %>%
  group_by(company) %>%
  # Drop reviews that did not rate this aspect before averaging.
  filter(!is.na(carrer.opportunities.stars)) %>%
  summarise(avg_rating = mean(carrer.opportunities.stars)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "", title = "Career Opportunities Rating")

# Mean compensation/benefits rating per company, drawn as horizontal bars.
df %>%
  group_by(company) %>%
  # Drop reviews that did not rate this aspect before averaging.
  filter(!is.na(comp.benefit.stars)) %>%
  summarise(avg_rating = mean(comp.benefit.stars)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "", title = "Company Benefit Rating")

# Mean senior-management rating per company, drawn as horizontal bars.
# The column name keeps the data set's own spelling ("mangemnet"); only the
# plot title shown to readers is spelled correctly.
df %>%
  group_by(company) %>%
  # Drop reviews that did not rate this aspect before averaging.
  filter(!is.na(senior.mangemnet.stars)) %>%
  summarise(avg_rating = mean(senior.mangemnet.stars)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "", title = "Senior Management Rating")

# boxplots comparing employees' opinions on the 6 rating aspects for each company
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# One horizontal box plot per rating column, grouped by company. Each panel
# gets its own fill colour so the panels are easy to tell apart in the grid.
p1 <- ggplot(df, aes(x = company, y = overall.ratings)) +
  geom_boxplot(fill = "blue2", alpha = 0.7) +
  coord_flip()
p2 <- ggplot(df, aes(x = company, y = work.balance.stars)) +
  geom_boxplot(fill = "green4", alpha = 0.7) +
  coord_flip()
p3 <- ggplot(df, aes(x = company, y = culture.values.stars)) +
  geom_boxplot(fill = "gold2", alpha = 0.7) +
  coord_flip()
p4 <- ggplot(df, aes(x = company, y = carrer.opportunities.stars)) +
  geom_boxplot(fill = "slategray4", alpha = 0.7) +
  coord_flip()
p5 <- ggplot(df, aes(x = company, y = comp.benefit.stars)) +
  geom_boxplot(fill = "purple4", alpha = 0.7) +
  coord_flip()
p6 <- ggplot(df, aes(x = company, y = senior.mangemnet.stars)) +
  geom_boxplot(fill = "red4", alpha = 0.7) +
  coord_flip()
# Arrange the six panels in a grid with three columns.
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)
## Warning: Removed 7160 rows containing non-finite values (stat_boxplot).
## Warning: Removed 13546 rows containing non-finite values (stat_boxplot).
## Warning: Removed 7108 rows containing non-finite values (stat_boxplot).
## Warning: Removed 7161 rows containing non-finite values (stat_boxplot).
## Warning: Removed 7775 rows containing non-finite values (stat_boxplot).

#to be continued