project_script-new.R

# Load the cleaned employee-review data set and show its first row.
df <- read.csv("clean_employee_reviews.csv")
head(df, n = 1)

##   X company location      dates                  summary
## 1 1  google     none 2018-12-11 Best Company to work for
##                            pros                               cons
## 1 People are smart and friendly Bureaucracy is slowing things down
##   advice.to.mgmt overall.ratings work.balance.stars culture.values.stars
## 1           none               5                  4                    5
##   carrer.opportunities.stars comp.benefit.stars senior.mangemnet.stars
## 1                          5                  4                      5
##   helpful.count
## 1             0
##                                                            link
## 1 https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm
##             status           position
## 1 Current Employee Anonymous Employee

# Print the structure of df: the type of each column and its first values.
str(df)

## 'data.frame':    67529 obs. of  18 variables:
##  $ X                         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ company                   : Factor w/ 6 levels "amazon","apple",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ location                  : Factor w/ 2044 levels "Aberdeen, MD",..: 1300 1231 1279 1231 1061 1231 1279 1300 1279 1300 ...
##  $ dates                     : Factor w/ 3822 levels "2008-01-31","2008-02-05",..: 3821 1822 2145 2419 3676 3819 3821 3821 3820 3819 ...
##  $ summary                   : Factor w/ 42652 levels "","-","-----",..: 6040 28376 16532 37964 39675 29231 35728 19977 16363 11511 ...
##  $ pros                      : Factor w/ 66085 levels "_ Customer focused company, where you can see the impact of your day to day job on customers - makes very happy"| __truncated__,..: 49518 4365 3363 64849 27504 49503 36286 62492 34877 39522 ...
##  $ cons                      : Factor w/ 66049 levels "_ Feels that getting promoted to the \"next level\" is unnecessary over-complicated - especially when change role.",..: 8986 4345 3412 21185 22133 16181 59557 34591 65801 46681 ...
##  $ advice.to.mgmt            : Factor w/ 3 levels "1","3","none": 3 3 3 3 3 3 3 3 3 3 ...
##  $ overall.ratings           : int  5 4 5 5 5 5 5 5 5 5 ...
##  $ work.balance.stars        : num  4 2 5 2 5 4 5 5 5 5 ...
##  $ culture.values.stars      : int  5 3 4 5 5 4 4 5 5 5 ...
##  $ carrer.opportunities.stars: num  5 3 5 5 5 4 4 5 5 5 ...
##  $ comp.benefit.stars        : num  4 5 5 4 5 5 5 5 5 5 ...
##  $ senior.mangemnet.stars    : num  5 3 4 5 5 4 4 5 5 5 ...
##  $ helpful.count             : int  0 2094 949 498 49 1 0 0 0 0 ...
##  $ link                      : Factor w/ 6754 levels "https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1.htm",..: 4098 4098 4098 4098 4098 4098 4098 4098 4098 4098 ...
##  $ status                    : Factor w/ 2 levels "Current Employee",..: 1 2 1 1 2 2 1 2 1 1 ...
##  $ position                  : Factor w/ 6821 levels "-",".NET Developer",..: 284 4211 5727 284 5718 4715 5718 284 284 284 ...

# Print per-column summaries: quartiles for numeric columns, the most
# frequent levels for factors, and NA counts where present.
summary(df)

##        X              company                   location    
##  Min.   :    1   amazon   :26430   none             :25085  
##  1st Qu.:16883   apple    :12950   Redmond, WA      : 5613  
##  Median :33765   facebook : 1590   Seattle, WA      : 4963  
##  Mean   :33765   google   : 7819   Hyderabad (India): 2150  
##  3rd Qu.:50647   microsoft:17930   Mountain View, CA: 1706  
##  Max.   :67529   netflix  :  810   Bengaluru (India): 1607  
##                                    (Other)          :26405  
##         dates                      summary     
##  2017-09-19:  272   Great place to work:  725  
##  2008-06-11:  126   Great company      :  456  
##  2017-09-18:  106   Software Engineer  :  452  
##  2018-08-29:   97   Specialist         :  412  
##  2017-09-25:   96   Good               :  389  
##  2017-09-22:   92   Great Company      :  383  
##  (Other)   :66740   (Other)            :64712  
##                          pros                                cons      
##  Great company to work for :   42   None that I can think of   :   40  
##  Great people to work with :   25   None that I can think of.  :   37  
##  Great company to work for.:   16   None I can think of        :   22  
##  great company to work for :   13   No cons as of now          :   13  
##  Great place to work at    :   13   Nothing I can think of     :   11  
##  Good company to work for  :   10   Nothing that I can think of:   11  
##  (Other)                   :67410   (Other)                    :67395  
##  advice.to.mgmt overall.ratings work.balance.stars culture.values.stars
##  1   :    2     Min.   :1.000   Min.   :1.000      Min.   :1.000       
##  3   :    1     1st Qu.:3.000   1st Qu.:2.000      1st Qu.:3.000       
##  none:67526     Median :4.000   Median :3.500      Median :4.000       
##                 Mean   :3.826   Mean   :3.373      Mean   :3.784       
##                 3rd Qu.:5.000   3rd Qu.:4.000      3rd Qu.:5.000       
##                 Max.   :5.000   Max.   :5.000      Max.   :5.000       
##                                 NA's   :7160       NA's   :13546       
##  carrer.opportunities.stars comp.benefit.stars senior.mangemnet.stars
##  Min.   :1.000              Min.   :1.000      Min.   :1.000         
##  1st Qu.:3.000              1st Qu.:3.000      1st Qu.:2.000         
##  Median :4.000              Median :4.000      Median :3.000         
##  Mean   :3.634              Mean   :3.943      Mean   :3.323         
##  3rd Qu.:5.000              3rd Qu.:5.000      3rd Qu.:4.000         
##  Max.   :5.000              Max.   :5.000      Max.   :5.000         
##  NA's   :7108               NA's   :7161       NA's   :7775          
##  helpful.count     
##  Min.   :   0.000  
##  1st Qu.:   0.000  
##  Median :   0.000  
##  Mean   :   1.268  
##  3rd Qu.:   1.000  
##  Max.   :2094.000  
##                    
##                                                                link      
##  https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1.htm   :   10  
##  https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P10.htm  :   10  
##  https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P100.htm :   10  
##  https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1000.htm:   10  
##  https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1001.htm:   10  
##  https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036_P1002.htm:   10  
##  (Other)                                                         :67469  
##               status                               position    
##  Current Employee:42540   Anonymous Employee           :27002  
##  Former Employee :24989   Software Engineer            : 1768  
##                           Software Development Engineer: 1244  
##                           Specialist                   : 1060  
##                           Program Manager              :  826  
##                           Warehouse Associate          :  681  
##                           (Other)                      :34948

# Number of reviews per company, returned as a data frame (Var1, Freq).
as.data.frame(table(df$company))   # there are six companies in this dataset

##        Var1  Freq
## 1    amazon 26430
## 2     apple 12950
## 3  facebook  1590
## 4    google  7819
## 5 microsoft 17930
## 6   netflix   810

# Split the reviews into one data frame per company.
df_amz <- df[df$company == "amazon", ]
df_apl <- df[df$company == "apple", ]
df_fb  <- df[df$company == "facebook", ]
df_gg  <- df[df$company == "google", ]
df_ms  <- df[df$company == "microsoft", ]
df_nf  <- df[df$company == "netflix", ]

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Overall rating comparison: mean overall rating per company, drawn as a
# horizontal bar chart with one bar (and fill colour) per company.
df %>%
  group_by(company) %>%
  # na.rm = TRUE keeps a single missing rating from turning a company's
  # mean into NA; the other rating plots in this script also exclude NAs.
  summarize(avg_rating = mean(overall.ratings, na.rm = TRUE)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "", title = "Overall Rating")

# Work-balance comparison: mean work.balance.stars per company,
# drawn as a horizontal bar chart.
df %>%
  # Keep only the reviews that rated this aspect.
  filter(!is.na(work.balance.stars)) %>%
  group_by(company) %>%
  summarize(avg_rating_work_balance_stars = mean(work.balance.stars)) %>%
  ggplot(
    aes(x = company, y = avg_rating_work_balance_stars, fill = company)
  ) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "", title = "Work Balance Rating")

# Culture-values comparison: mean culture.values.stars per company,
# drawn as a horizontal bar chart.
df %>%
  group_by(company) %>%
  # Keep only the reviews that rated this aspect.
  filter(!is.na(culture.values.stars)) %>%
  summarize(avg_rating = mean(culture.values.stars)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "", title = "Culture Values Rating")

# Career-opportunities comparison: mean rating per company, drawn as a
# horizontal bar chart.
# Note: the column is spelled "carrer.opportunities.stars" in the data set,
# so that spelling must be kept when referring to it.
df %>%
  group_by(company) %>%
  # Keep only the reviews that rated this aspect.
  filter(!is.na(carrer.opportunities.stars)) %>%
  summarize(avg_rating = mean(carrer.opportunities.stars)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "", title = "Career Opportunities Rating")

# Company-benefits comparison: mean comp.benefit.stars per company,
# drawn as a horizontal bar chart.
df %>%
  group_by(company) %>%
  # Keep only the reviews that rated this aspect.
  filter(!is.na(comp.benefit.stars)) %>%
  summarize(avg_rating = mean(comp.benefit.stars)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "", title = "Company Benefit Rating")

# Senior-management comparison: mean rating per company, drawn as a
# horizontal bar chart.
# Note: the column is spelled "senior.mangemnet.stars" in the data set,
# so that spelling must be kept when referring to it.
df %>%
  group_by(company) %>%
  # Keep only the reviews that rated this aspect.
  filter(!is.na(senior.mangemnet.stars)) %>%
  summarize(avg_rating = mean(senior.mangemnet.stars)) %>%
  ggplot(aes(x = company, y = avg_rating, fill = company)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "", title = "Senior Management Rating")

# boxplots to compare employees' opinions about the 6 aspects for each company

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# One horizontal boxplot per rating aspect, with one box per company.
# Each panel has its own fill colour so the aspects are easy to tell apart.
p1 <- ggplot(df, aes(x = company, y = overall.ratings)) +
  geom_boxplot(fill = "blue2", alpha = 0.7) +
  coord_flip()

p2 <- ggplot(df, aes(x = company, y = work.balance.stars)) +
  geom_boxplot(fill = "green4", alpha = 0.7) +
  coord_flip()

p3 <- ggplot(df, aes(x = company, y = culture.values.stars)) +
  geom_boxplot(fill = "gold2", alpha = 0.7) +
  coord_flip()

p4 <- ggplot(df, aes(x = company, y = carrer.opportunities.stars)) +
  geom_boxplot(fill = "slategray4", alpha = 0.7) +
  coord_flip()

p5 <- ggplot(df, aes(x = company, y = comp.benefit.stars)) +
  geom_boxplot(fill = "purple4", alpha = 0.7) +
  coord_flip()

p6 <- ggplot(df, aes(x = company, y = senior.mangemnet.stars)) +
  geom_boxplot(fill = "red4", alpha = 0.7) +
  coord_flip()

# Arrange the six panels on a single page, three per row.
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)

## Warning: Removed 7160 rows containing non-finite values (stat_boxplot).

## Warning: Removed 13546 rows containing non-finite values (stat_boxplot).

## Warning: Removed 7108 rows containing non-finite values (stat_boxplot).

## Warning: Removed 7161 rows containing non-finite values (stat_boxplot).

## Warning: Removed 7775 rows containing non-finite values (stat_boxplot).

#to be continued

project_script-new.R

li

Tue Apr 16 23:05:58 2019