#Load Data

# Read salaries_clean.csv into a data frame.
df <- read.csv("salaries_clean.csv")
# Display the loaded data once for a first look.
df

Structure of the data

# Show each column's type together with its first few values.
str(df)
## 'data.frame':    1655 obs. of  19 variables:
##  $ index                    : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ salary_id                : int  1 3 4 6 12 14 16 17 21 23 ...
##  $ employer_name            : chr  "opower" "walmart" "vertical knowledge" "netapp" ...
##  $ location_name            : chr  "san francisco, ca" "bentonville, ar" "cleveland, oh" "waltham" ...
##  $ location_state           : chr  "CA" "AR" "OH" "" ...
##  $ location_country         : chr  "US" "US" "US" "" ...
##  $ location_latitude        : num  37.8 36.4 41.5 NA NA ...
##  $ location_longitude       : num  -122.4 -94.2 -81.7 NA NA ...
##  $ job_title                : chr  "systems engineer" "senior developer" "software engineer" "mts" ...
##  $ job_title_category       : chr  "Engineering" "Software" "Software" "Other" ...
##  $ job_title_rank           : chr  "" "Senior" "" "" ...
##  $ total_experience_years   : num  13 15 4 4 4 5 4 8 2 1 ...
##  $ employer_experience_years: num  2 8 1 0 3 1.5 2.5 2 1 1 ...
##  $ annual_base_pay          : num  125000 65000 86000 105000 110000 40000 45000 135000 105000 80000 ...
##  $ signing_bonus            : num  5000 NA 5000 5000 5000 0 0 0 0 0 ...
##  $ annual_bonus             : num  0 5000 6000 8500 7000 500 1500 0 47000 0 ...
##  $ stock_value_bonus        : chr  "5000 shares" "3,000" "0" "0" ...
##  $ comments                 : chr  "Don't work here." "" "" "" ...
##  $ submitted_at             : chr  "3/21/16 12:58" "3/21/16 12:58" "3/21/16 12:59" "3/21/16 13:00" ...

Columns in the dataset

# List the column names of df.
colnames(df)
##  [1] "index"                     "salary_id"                
##  [3] "employer_name"             "location_name"            
##  [5] "location_state"            "location_country"         
##  [7] "location_latitude"         "location_longitude"       
##  [9] "job_title"                 "job_title_category"       
## [11] "job_title_rank"            "total_experience_years"   
## [13] "employer_experience_years" "annual_base_pay"          
## [15] "signing_bonus"             "annual_bonus"             
## [17] "stock_value_bonus"         "comments"                 
## [19] "submitted_at"

Summary of data

# Per-column summary: quartiles, mean and NA count for numeric columns;
# length, class and mode for character columns.
summary(df)
##      index          salary_id    employer_name      location_name     
##  Min.   :   0.0   Min.   :   1   Length:1655        Length:1655       
##  1st Qu.: 413.5   1st Qu.: 897   Class :character   Class :character  
##  Median : 827.0   Median :1711   Mode  :character   Mode  :character  
##  Mean   : 827.0   Mean   :1684                                        
##  3rd Qu.:1240.5   3rd Qu.:2477                                        
##  Max.   :1654.0   Max.   :3298                                        
##                                                                       
##  location_state     location_country   location_latitude location_longitude
##  Length:1655        Length:1655        Min.   :-41.00    Min.   :-123.27   
##  Class :character   Class :character   1st Qu.: 37.41    1st Qu.:-102.70   
##  Mode  :character   Mode  :character   Median : 38.58    Median : -95.00   
##                                        Mean   : 37.75    Mean   : -64.71   
##                                        3rd Qu.: 45.44    3rd Qu.: -64.56   
##                                        Max.   : 65.00    Max.   : 174.00   
##                                        NA's   :863       NA's   :863       
##   job_title         job_title_category job_title_rank    
##  Length:1655        Length:1655        Length:1655       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  total_experience_years employer_experience_years annual_base_pay    
##  Min.   : 0.000         Min.   : 0.000            Min.   :0.000e+00  
##  1st Qu.: 3.000         1st Qu.: 1.000            1st Qu.:6.100e+04  
##  Median : 5.000         Median : 2.000            Median :9.948e+04  
##  Mean   : 6.756         Mean   : 2.656            Mean   :6.325e+06  
##  3rd Qu.:10.000         3rd Qu.: 3.000            3rd Qu.:1.300e+05  
##  Max.   :56.000         Max.   :58.000            Max.   :1.000e+10  
##  NA's   :47             NA's   :47                NA's   :4          
##  signing_bonus      annual_bonus     stock_value_bonus    comments        
##  Min.   :      0   Min.   :      0   Length:1655        Length:1655       
##  1st Qu.:      0   1st Qu.:      0   Class :character   Class :character  
##  Median :      0   Median :    500   Mode  :character   Mode  :character  
##  Mean   :  22406   Mean   :  11776                                        
##  3rd Qu.:   5000   3rd Qu.:  10000                                        
##  Max.   :8999999   Max.   :2000000                                        
##  NA's   :323       NA's   :319                                            
##  submitted_at      
##  Length:1655       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Plotting some of the features to see inconsistent data

  1. Location names
# Plotting Location names
# Attach the packages before their functions are used.
library(ggplot2)
library(dplyr)

# One bar per distinct location_name; bar height is the number of rows
# carrying that value. (The empty `size =` aesthetic was dropped: it mapped
# nothing.)
ggplot(df, aes(x = location_name)) +
  geom_bar()

# Tabulate how many rows share each location_name spelling.
count(df, location_name)

Cleaning Data

1. Location_name

# Number of location_name entries (one per row of df).
length(df$location_name)
## [1] 1655

Filtering rows that have an invalid location name

Remove all the rows where location name is a number

library(dplyr)
library(stringr)

# Drop rows whose location_name is purely numeric.
# is.numeric() tests the type of the whole column (character here), so it
# yields a single FALSE and `!is.numeric(location_name)` kept every row.
# Match on the content of each value instead: optional sign, at least one
# digit, then only digits / decimal or thousands separators.
numeric_location_pattern <- "^[[:space:]]*[-+]?[0-9][0-9.,]*[[:space:]]*$"
x <- filter(df, !grepl(numeric_location_pattern, location_name))
count(x, location_name)

# Keep only the location_name column and drop rows where it is NA. The
# condition refers to the piped column rather than df$location_name so it
# always lines up with the rows actually being filtered.
df %>%
  select(location_name) %>%
  filter(!is.na(location_name))

Counting missing values in the dataset

# Number of NA entries in each column of df.
na_counts <- df %>%
  is.na() %>%
  colSums()
print(na_counts)
##                     index                 salary_id             employer_name 
##                         0                         0                         0 
##             location_name            location_state          location_country 
##                         0                         0                         0 
##         location_latitude        location_longitude                 job_title 
##                       863                       863                         0 
##        job_title_category            job_title_rank    total_experience_years 
##                         0                         0                        47 
## employer_experience_years           annual_base_pay             signing_bonus 
##                        47                         4                       323 
##              annual_bonus         stock_value_bonus                  comments 
##                       319                         0                         0 
##              submitted_at 
##                         0
# Count the empty-string values ("") in every column of df.
# nchar() yields NA for missing values, so na.rm = TRUE is needed: without
# it, every column that contains an NA reports NA instead of a count.
empty_string_counts <- lapply(
  df,
  function(x) sum(nchar(x) == 0, na.rm = TRUE)
)
empty_string_counts
## $index
## [1] 0
## 
## $salary_id
## [1] 0
## 
## $employer_name
## [1] 4
## 
## $location_name
## [1] 0
## 
## $location_state
## [1] 1097
## 
## $location_country
## [1] 863
## 
## $location_latitude
## [1] 0
## 
## $location_longitude
## [1] 0
## 
## $job_title
## [1] 0
## 
## $job_title_category
## [1] 0
## 
## $job_title_rank
## [1] 1230
## 
## $total_experience_years
## [1] 0
## 
## $employer_experience_years
## [1] 0
## 
## $annual_base_pay
## [1] 0
## 
## $signing_bonus
## [1] 0
## 
## $annual_bonus
## [1] 0
## 
## $stock_value_bonus
## [1] 402
## 
## $comments
## [1] 1363
## 
## $submitted_at
## [1] 0
# Per-column count of values that are either NA or an empty string.
total_missing_counts <- lapply(
  df,
  function(column) sum(is.na(column) | nchar(column) == 0)
)

# One row per feature: the raw count plus the share of rows it represents.
missing_counts_df <- data.frame(
  feature = names(total_missing_counts),
  missing_count = as.numeric(total_missing_counts)
)
missing_counts_df <- mutate(
  missing_counts_df,
  missing_percent = (missing_count / nrow(df)) * 100
)

# Horizontal bar per feature; bar length is the percent missing, with the
# axis pinned to the full 0-100 range.
ggplot(missing_counts_df, aes(x = missing_percent, y = feature)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Missing Value Percent by Feature",
    x = "Missing Value Percent",
    y = "Feature"
  ) +
  xlim(c(0, 100))

# Using features:
#   PAY        : annual_base_pay
#   EXPERIENCE : total_experience_years, employer_experience_years
#   LOCATION   : (not specified)
#   EMPLOYER   : employer_name
#   JOB INFO   : job_title_rank, job_title

library(stringdist)

# Normalise location_name: coerce to character, lower-case it, then strip
# leading and trailing whitespace.
location_name_cleaned <- trimws(tolower(as.character(df$location_name)))

# remove numbers and special characters
# NOTE(review): whitespace is trimmed before this removal, so deleting
# characters here can leave stray spaces behind; confirm that is intended.
location_name_cleaned <- gsub(
  pattern = "[0-9!@#$%^&*()_+=-]",
  replacement = "",
  x = location_name_cleaned
)
print(location_name_cleaned)
##    [1] "san francisco, ca"                       
##    [2] "bentonville, ar"                         
##    [3] "cleveland, oh"                           
##    [4] "waltham"                                 
##    [5] "cupertino"                               
##    [6] "eastern oregon"                          
##    [7] "madison, wi"                             
##    [8] "new york city"                           
##    [9] "nyc"                                     
##   [10] "san francisco"                           
##   [11] "denver, co"                              
##   [12] "chicago"                                 
##   [13] "west lafayette, in"                      
##   [14] "minneapolis"                             
##   [15] "minneapolis"                             
##   [16] "bordeaux"                                
##   [17] "seattle"                                 
##   [18] "charleston, sc"                          
##   [19] "bala cynwyd, pa"                         
##   [20] "toronto, canada"                         
##   [21] "durham, nc"                              
##   [22] "seattle"                                 
##   [23] "new york"                                
##   [24] "san francisco"                           
##   [25] "santa clara"                             
##   [26] "chicago"                                 
##   [27] "boulder co"                              
##   [28] "bucharest, romania"                      
##   [29] "stockholm, sweden"                       
##   [30] "sterling, va"                            
##   [31] "orlando, florida"                        
##   [32] "detroit, mi"                             
##   [33] "san francisco, ca"                       
##   [34] "hillsboro, oregon"                       
##   [35] "austin, tx"                              
##   [36] "denver, co"                              
##   [37] "washington dc"                           
##   [38] "wellington, new zealand"                 
##   [39] "denver, co"                              
##   [40] "albany, ny"                              
##   [41] "ct"                                      
##   [42] "san francisco"                           
##   [43] "joplin mo"                               
##   [44] "jersey city, nj"                         
##   [45] "phils"                                   
##   [46] "bristol, uk"                             
##   [47] "philadelphia, pa"                        
##   [48] "san jose, ca"                            
##   [49] "cambridge, ma"                           
##   [50] "san francisco"                           
##   [51] "san francisco"                           
##   [52] "barcelona, spain"                        
##   [53] "redmond, wa"                             
##   [54] "new york city"                           
##   [55] "san francisco, ca"                       
##   [56] "ithaca, ny"                              
##   [57] "redmond, wa"                             
##   [58] "amsterdam"                               
##   [59] "seattle"                                 
##   [60] "oklahoma city, ok"                       
##   [61] "chicago"                                 
##   [62] "san francisco"                           
##   [63] "raleigh, nc"                             
##   [64] "verona, wisconsin"                       
##   [65] "karlstad, sweden"                        
##   [66] "nj"                                      
##   [67] "lubbock, tx"                             
##   [68] "raleigh, nc"                             
##   [69] "new york, ny"                            
##   [70] "london, uk"                              
##   [71] "sunnyvale"                               
##   [72] "poland"                                  
##   [73] "new york city, ny"                       
##   [74] "dallas"                                  
##   [75] "san francisco, ca"                       
##   [76] "seattel, wa"                             
##   [77] ""                                        
##   [78] ""                                        
##   [79] ""                                        
##   [80] ""                                        
##   [81] ""                                        
##   [82] ""                                        
##   [83] ""                                        
##   [84] ""                                        
##   [85] ""                                        
##   [86] ""                                        
##   [87] ""                                        
##   [88] ""                                        
##   [89] "london"                                  
##   [90] "hillsboro, oregon"                       
##   [91] "mountain view"                           
##   [92] "bothell"                                 
##   [93] ""                                        
##   [94] ""                                        
##   [95] "israel"                                  
##   [96] "bremerhaven, germany"                    
##   [97] ""                                        
##   [98] "berlin"                                  
##   [99] "palo alto"                               
##  [100] "dallas tx"                               
##  [101] "austin"                                  
##  [102] "seattle"                                 
##  [103] "san francisco"                           
##  [104] "berlin"                                  
##  [105] "san francisco"                           
##  [106] "seattle"                                 
##  [107] "johnston, iowa"                          
##  [108] "california"                              
##  [109] "omaha, ne"                               
##  [110] "plymouth, mn"                            
##  [111] "boston metro"                            
##  [112] "phoenix, arizona"                        
##  [113] "sunnyvale"                               
##  [114] "seattle, wa"                             
##  [115] "san francisco"                           
##  [116] "culver city, ca"                         
##  [117] "menlo park"                              
##  [118] "seattle"                                 
##  [119] "richmond, ca"                            
##  [120] "solon, oh"                               
##  [121] "seattle"                                 
##  [122] "ìngelholm, sweden"                       
##  [123] "remote us"                               
##  [124] "redmond"                                 
##  [125] "stuttgart"                               
##  [126] "san francisco"                           
##  [127] "helsinki"                                
##  [128] "albuquerque, nm"                         
##  [129] "mountain view"                           
##  [130] "cupertino"                               
##  [131] "dallas"                                  
##  [132] "washington d.c."                         
##  [133] "palo alto"                               
##  [134] "rennes, france"                          
##  [135] "regina, sk canada"                       
##  [136] "buffalo, ny"                             
##  [137] "atlanta"                                 
##  [138] "sf"                                      
##  [139] "san francisco, ca"                       
##  [140] "olathe, ks"                              
##  [141] "norman, ok"                              
##  [142] "wilton, ct usa"                          
##  [143] "philadelphia"                            
##  [144] "united states"                           
##  [145] "seattle, wa"                             
##  [146] "columbus, oh"                            
##  [147] "palo alto"                               
##  [148] "denver, co"                              
##  [149] "sf"                                      
##  [150] "austin, tx"                              
##  [151] "oakland, ca"                             
##  [152] "cincinatti"                              
##  [153] "san diego, ca"                           
##  [154] "detroit, mi"                             
##  [155] "stockholm, sweden"                       
##  [156] "southampton, uk"                         
##  [157] "london"                                  
##  [158] "seattle, wa"                             
##  [159] "tampa"                                   
##  [160] "oregon"                                  
##  [161] "puerto rico"                             
##  [162] "gaithersburg, md"                        
##  [163] "seattle"                                 
##  [164] "warsaw, poland"                          
##  [165] "nyc"                                     
##  [166] "philadelphia"                            
##  [167] "redmond"                                 
##  [168] "asunciìn, paraguay"                      
##  [169] "dublin, ireland"                         
##  [170] "nyc"                                     
##  [171] "new york, new york"                      
##  [172] "northern california"                     
##  [173] "toronto, canada"                         
##  [174] "seattle"                                 
##  [175] "vancouver, canada"                       
##  [176] "canada"                                  
##  [177] "denver,  co"                             
##  [178] "kingston, canada"                        
##  [179] "boston"                                  
##  [180] "rolla, mo, usa"                          
##  [181] "boston"                                  
##  [182] "remote"                                  
##  [183] "boulder, co"                             
##  [184] "san francisco"                           
##  [185] "san francisco"                           
##  [186] "redmond"                                 
##  [187] "mountain view"                           
##  [188] "seattle"                                 
##  [189] "menlo park"                              
##  [190] "chicago"                                 
##  [191] "seattle, wa"                             
##  [192] "san francisco"                           
##  [193] "seattle, wa"                             
##  [194] "bangalore, india"                        
##  [195] "salt lake city"                          
##  [196] "san francisco, ca"                       
##  [197] "menlo park"                              
##  [198] "cape town"                               
##  [199] "bristol, uk"                             
##  [200] "london"                                  
##  [201] "wellington"                              
##  [202] "mountain view"                           
##  [203] "houston"                                 
##  [204] "mountain view"                           
##  [205] "boston"                                  
##  [206] "stuart, fl"                              
##  [207] "boston"                                  
##  [208] "menlo park"                              
##  [209] "nc"                                      
##  [210] "north carolina"                          
##  [211] "los angeles"                             
##  [212] "bengaluru"                               
##  [213] "atlanta, georgia"                        
##  [214] "seattle, wa usa"                         
##  [215] "chicago"                                 
##  [216] "redmon"                                  
##  [217] "oslo, norway"                            
##  [218] "los gatos"                               
##  [219] "new york city"                           
##  [220] "san francisco"                           
##  [221] "owego, ny"                               
##  [222] "anaheim, ca"                             
##  [223] "boston, ma"                              
##  [224] "vancouver"                               
##  [225] "santa clara, ca"                         
##  [226] "san francisco"                           
##  [227] "new york city"                           
##  [228] "florida"                                 
##  [229] "los gatos"                               
##  [230] "london"                                  
##  [231] "san francisco"                           
##  [232] "frankfurt"                               
##  [233] "rochester, ny"                           
##  [234] "san francisco, ca"                       
##  [235] "redmond, wa"                             
##  [236] "canada"                                  
##  [237] "san francisco"                           
##  [238] "college station, tx"                     
##  [239] "san francisco"                           
##  [240] "chinz"                                   
##  [241] "sf"                                      
##  [242] "cambridge, ma"                           
##  [243] "washington dc"                           
##  [244] "mountain view"                           
##  [245] "new york"                                
##  [246] "new york city"                           
##  [247] "san francisco"                           
##  [248] "chico ca"                                
##  [249] "ca"                                      
##  [250] "oklahoma city, ok"                       
##  [251] "san francisco"                           
##  [252] "stockholm, sweden"                       
##  [253] "nordrheinwestfalen"                      
##  [254] "paris"                                   
##  [255] "san francisco"                           
##  [256] "austin"                                  
##  [257] "radnor, pa"                              
##  [258] "san francisco"                           
##  [259] "nyc"                                     
##  [260] "kansas city, mo"                         
##  [261] "redmond, wa"                             
##  [262] "monaco"                                  
##  [263] "calgary, canada"                         
##  [264] "yorktown,ny"                             
##  [265] "seattle, wa"                             
##  [266] "rochester ny"                            
##  [267] "dortmund, germany"                       
##  [268] "w"                                       
##  [269] "austin"                                  
##  [270] "sf"                                      
##  [271] "gainesville, fl"                         
##  [272] "vancouver"                               
##  [273] "san francisco"                           
##  [274] "hillsboro"                               
##  [275] "vancouver bc"                            
##  [276] "sweden"                                  
##  [277] "lexington park, md"                      
##  [278] "washington, dc"                          
##  [279] "austin, tx"                              
##  [280] "halifax, ns, canada"                     
##  [281] "schenectady, ny"                         
##  [282] "yeranus"                                 
##  [283] "washington, dc"                          
##  [284] "london"                                  
##  [285] "arizona"                                 
##  [286] "woodland hills"                          
##  [287] "pasadena"                                
##  [288] "nyc"                                     
##  [289] "berlin"                                  
##  [290] "san francisco"                           
##  [291] "richmond, ky"                            
##  [292] "new york, ny"                            
##  [293] "seattle"                                 
##  [294] "compton ca"                              
##  [295] "palo alto"                               
##  [296] "washington, dc"                          
##  [297] "canada"                                  
##  [298] "chicago, il"                             
##  [299] "your momma's buttcrack"                  
##  [300] "seattle"                                 
##  [301] "amsterdam"                               
##  [302] "menlo park"                              
##  [303] "new york"                                
##  [304] "new york city"                           
##  [305] "houston, tx"                             
##  [306] "schenectady, ny"                         
##  [307] "warsaw, poland"                          
##  [308] "redmond"                                 
##  [309] "toronto"                                 
##  [310] "portland, or"                            
##  [311] "ann arbor, mi"                           
##  [312] "nyc"                                     
##  [313] "clemson, sc"                             
##  [314] "uk"                                      
##  [315] "cambridge, ma"                           
##  [316] "south san francisco"                     
##  [317] "san francisco palo alto, ca"             
##  [318] "new york"                                
##  [319] "san francisco"                           
##  [320] "amsterdam"                               
##  [321] "san jose"                                
##  [322] "south africa"                            
##  [323] "fayetteville, ar"                        
##  [324] "seattle"                                 
##  [325] "london"                                  
##  [326] "germany"                                 
##  [327] "pleasanton, ca"                          
##  [328] "san jose, ca"                            
##  [329] "lincoln, ne"                             
##  [330] "sterling, va"                            
##  [331] "san francisco"                           
##  [332] "san francisco"                           
##  [333] "p"                                       
##  [334] "chicago"                                 
##  [335] "madison, wi"                             
##  [336] "dsvao"                                   
##  [337] "bellevue, wa"                            
##  [338] "minneapolis, mn"                         
##  [339] "toronto"                                 
##  [340] "san francisco"                           
##  [341] "seattle"                                 
##  [342] "yonkers, ny"                             
##  [343] "austin"                                  
##  [344] "redmond, wa"                             
##  [345] "new york city"                           
##  [346] "san francisco"                           
##  [347] "munich"                                  
##  [348] "seattle"                                 
##  [349] "san diego"                               
##  [350] "natick"                                  
##  [351] "berlin"                                  
##  [352] "sydney, australia"                       
##  [353] "san francisco"                           
##  [354] "boise, id remote, job is based in nyc"   
##  [355] "mountain view"                           
##  [356] "sao paulo"                               
##  [357] "bay area"                                
##  [358] "sf"                                      
##  [359] "toronto"                                 
##  [360] "austin, tx"                              
##  [361] "brazil"                                  
##  [362] "new york"                                
##  [363] "san francisco, ca"                       
##  [364] "london"                                  
##  [365] "london"                                  
##  [366] "san francisco"                           
##  [367] "dublin ireland"                          
##  [368] "pune"                                    
##  [369] "berlin"                                  
##  [370] "cambridge, ma"                           
##  [371] "seattle"                                 
##  [372] "london"                                  
##  [373] "san francisco"                           
##  [374] "ireland"                                 
##  [375] "menlo park"                              
##  [376] "test"                                    
##  [377] "chicago"                                 
##  [378] "seattle"                                 
##  [379] "montreal, quebec, canada"                
##  [380] "seattle"                                 
##  [381] "mississippi"                             
##  [382] "foobar"                                  
##  [383] "san francisco, ca"                       
##  [384] "mountain view"                           
##  [385] "stanford, ca"                            
##  [386] "albuquerque, new mexico"                 
##  [387] "bosnia"                                  
##  [388] "vancouver"                               
##  [389] "mountain west"                           
##  [390] "columbus, oh"                            
##  [391] "sacramento, ca"                          
##  [392] "cologne, germany"                        
##  [393] "san francisco"                           
##  [394] "durham, nc"                              
##  [395] "austin, tx"                              
##  [396] "bavaria"                                 
##  [397] "seattle"                                 
##  [398] "los angeles"                             
##  [399] "new york"                                
##  [400] "sunnyvale"                               
##  [401] "london"                                  
##  [402] "manchester, nh"                          
##  [403] "nyc"                                     
##  [404] "tacoma"                                  
##  [405] "waltham, ma"                             
##  [406] "salt lake city"                          
##  [407] "american fork, ut"                       
##  [408] "bulgaria"                                
##  [409] "radnor, pa"                              
##  [410] "chicago"                                 
##  [411] "san francisco"                           
##  [412] "berlin"                                  
##  [413] "menlo park"                              
##  [414] "los angeles"                             
##  [415] "chicago"                                 
##  [416] "san francisco"                           
##  [417] "redmond, wa"                             
##  [418] "new york, ny"                            
##  [419] "berlin"                                  
##  [420] "san francisco"                           
##  [421] "sunnyvale"                               
##  [422] "new york city"                           
##  [423] "seattle"                                 
##  [424] "midwest"                                 
##  [425] "issaquah"                                
##  [426] "southern california"                     
##  [427] "lahore, pakistan"                        
##  [428] "beaverton, or"                           
##  [429] "salt lake city, utah"                    
##  [430] "ireland"                                 
##  [431] "sydney"                                  
##  [432] "new york"                                
##  [433] "dcvamd area"                             
##  [434] "peru"                                    
##  [435] "san francisco"                           
##  [436] "burlington, vt"                          
##  [437] "san francisco"                           
##  [438] "toronto"                                 
##  [439] "seattle"                                 
##  [440] "chicago"                                 
##  [441] "phoenix, az"                             
##  [442] "iceland"                                 
##  [443] "los angeles"                             
##  [444] "new york"                                
##  [445] "remote us"                               
##  [446] "london, uk working remote"               
##  [447] "milwaukee"                               
##  [448] "baltimore"                               
##  [449] "san francisco"                           
##  [450] "new york city"                           
##  [451] "nyc"                                     
##  [452] "san francisco, ca"                       
##  [453] "usa"                                     
##  [454] "newark"                                  
##  [455] "paris"                                   
##  [456] "north carolina"                          
##  [457] "san francisco"                           
##  [458] "brisbane, australia"                     
##  [459] "pleasanton"                              
##  [460] "memphis"                                 
##  [461] "sulzbach an der murr, germany"           
##  [462] "nijmegen"                                
##  [463] "folsom, ca"                              
##  [464] "boulder, co"                             
##  [465] "interior bc, canada"                     
##  [466] "midwest"                                 
##  [467] "london"                                  
##  [468] "englewood, co"                           
##  [469] "midwest"                                 
##  [470] "santa monica, ca"                        
##  [471] "virginia"                                
##  [472] "halifax"                                 
##  [473] "santa clara"                             
##  [474] "berlin"                                  
##  [475] "mountain view, ca"                       
##  [476] "san mateo, ca"                           
##  [477] "bedford ma"                              
##  [478] "south carolina"                          
##  [479] "mtv"                                     
##  [480] "san francisco, ca"                       
##  [481] "san juan, pr"                            
##  [482] "seattle"                                 
##  [483] "minneapolis, mn"                         
##  [484] "seattle"                                 
##  [485] "uk"                                      
##  [486] "boston, ma"                              
##  [487] "canada"                                  
##  [488] "salt lake city, ut"                      
##  [489] "sacramento, ca"                          
##  [490] "adelaide, australia"                     
##  [491] "denver, co"                              
##  [492] "sweden"                                  
##  [493] "atlanta"                                 
##  [494] "mountain view"                           
##  [495] "alexandria, va"                          
##  [496] "plano tx"                                
##  [497] "san diego"                               
##  [498] "tampa, fl"                               
##  [499] "san francisco"                           
##  [500] "san francisco"                           
##  [501] "san francisco"                           
##  [502] "redmond, wa"                             
##  [503] "austin, tx"                              
##  [504] "washington, dc"                          
##  [505] "south of france"                         
##  [506] "springsfield, co"                        
##  [507] "south west uk"                           
##  [508] "palo alto"                               
##  [509] "vancouver"                               
##  [510] "salt lake city, ut"                      
##  [511] "arlington, va"                           
##  [512] "washington dc"                           
##  [513] "france  lyon"                            
##  [514] "new york, ny"                            
##  [515] "toulouse, france"                        
##  [516] "seattle, wa"                             
##  [517] "mountain view, ca"                       
##  [518] "redmond, wa"                             
##  [519] "boston, ma"                              
##  [520] "san francisco"                           
##  [521] "australia"                               
##  [522] "sunnyvale"                               
##  [523] "did"                                     
##  [524] "austin"                                  
##  [525] "seattle"                                 
##  [526] "belgium"                                 
##  [527] "sf bay area"                             
##  [528] "san francisco, ca"                       
##  [529] "verona, wisconsin"                       
##  [530] "hursley, uk"                             
##  [531] "menlo park"                              
##  [532] "toulouse fr"                             
##  [533] "maryland"                                
##  [534] "portland, or"                            
##  [535] "redmond, wa"                             
##  [536] "redmond, wa"                             
##  [537] "omaha, ne"                               
##  [538] "ms"                                      
##  [539] "louisville, co"                          
##  [540] "brisbane, australia"                     
##  [541] "ottawa, canada"                          
##  [542] ""                                        
##  [543] "germany"                                 
##  [544] "clujnapoca, romania"                     
##  [545] "san francisco"                           
##  [546] "los angeles"                             
##  [547] "toronto, on"                             
##  [548] "bay area"                                
##  [549] "istanbul"                                
##  [550] "kingsport, tn"                           
##  [551] "nyc"                                     
##  [552] "denver, co"                              
##  [553] "redmond, wa"                             
##  [554] "austin"                                  
##  [555] "zurich, switzerland"                     
##  [556] "san ramon"                               
##  [557] "austin, tx"                              
##  [558] "london"                                  
##  [559] "lowell, ma"                              
##  [560] "minnesota"                               
##  [561] "champaign, il"                           
##  [562] "mountain view, ca"                       
##  [563] "chicago"                                 
##  [564] "maryland"                                
##  [565] "salt lake city"                          
##  [566] "portland, or"                            
##  [567] "remote midwest"                          
##  [568] "indianapolis, in"                        
##  [569] "clujnapoca"                              
##  [570] "london, uk"                              
##  [571] "san francisco"                           
##  [572] "irvine, ca"                              
##  [573] "san francisco"                           
##  [574] "new york city"                           
##  [575] "eindhoven, netherlands"                  
##  [576] "san diego"                               
##  [577] "clujnapoca"                              
##  [578] "hungary"                                 
##  [579] "cupertino, ca"                           
##  [580] "remote"                                  
##  [581] "new york"                                
##  [582] "houston"                                 
##  [583] "san diego"                               
##  [584] "oslo"                                    
##  [585] "seattle"                                 
##  [586] "costa mesa, california"                  
##  [587] "aaaa"                                    
##  [588] "fairfax, va"                             
##  [589] "frankfurt, germany"                      
##  [590] "seattle"                                 
##  [591] "salamanca, spain"                        
##  [592] "colorado"                                
##  [593] "honolulu"                                
##  [594] "sunnyvale"                               
##  [595] "philadelphia, pa"                        
##  [596] "clujnapoca"                              
##  [597] "los angles"                              
##  [598] "cambridge"                               
##  [599] "new york"                                
##  [600] "helsinki, finland"                       
##  [601] "melbourne, australia"                    
##  [602] "brooklyn"                                
##  [603] "los angeles"                             
##  [604] "west yorkshire, uk"                      
##  [605] "india"                                   
##  [606] "las juntas"                              
##  [607] "provo"                                   
##  [608] "clujnapoca"                              
##  [609] "berkeley"                                
##  [610] "san francisco"                           
##  [611] "dayton, oh"                              
##  [612] "san jose"                                
##  [613] "germany"                                 
##  [614] "menlo park"                              
##  [615] "bucharest"                               
##  [616] "silicon valley"                          
##  [617] "california"                              
##  [618] "new york city"                           
##  [619] "nsw"                                     
##  [620] "raleigh"                                 
##  [621] "atlanta"                                 
##  [622] "san francisco"                           
##  [623] "brazil"                                  
##  [624] "tel aviv, israel"                        
##  [625] "seattle"                                 
##  [626] "mumbai"                                  
##  [627] "warsaw, poland"                          
##  [628] "san diego"                               
##  [629] "seattle, wa"                             
##  [630] "san francisco"                           
##  [631] "zìrich"                                  
##  [632] "reading, uk"                             
##  [633] "melbourne, australia"                    
##  [634] "denver, co"                              
##  [635] "san francisco"                           
##  [636] "new york, ny"                            
##  [637] "uk"                                      
##  [638] "venezuela"                               
##  [639] "dfw"                                     
##  [640] "buch"                                    
##  [641] "st. george, ut"                          
##  [642] "new york, ny"                            
##  [643] "baltimore, md"                           
##  [644] "lisbon, portugal"                        
##  [645] "san francisco"                           
##  [646] "new york"                                
##  [647] "a"                                       
##  [648] "bay area"                                
##  [649] "new york"                                
##  [650] "new york city"                           
##  [651] "new york city"                           
##  [652] "new york, ny"                            
##  [653] "san francisco, ca"                       
##  [654] "los angeles"                             
##  [655] "palo alto, ca"                           
##  [656] "new zealand"                             
##  [657] "centennial, co"                          
##  [658] "new york city"                           
##  [659] "santa clara"                             
##  [660] "san fransico"                            
##  [661] "poland"                                  
##  [662] "seattle"                                 
##  [663] "nyc"                                     
##  [664] "amsterdam"                               
##  [665] "greater los angeles area"                
##  [666] "san francisco"                           
##  [667] "portland"                                
##  [668] "austin"                                  
##  [669] "st. paul"                                
##  [670] "berlin, germany"                         
##  [671] "los angeles"                             
##  [672] "austin tx"                               
##  [673] "sao paulo"                               
##  [674] "usa"                                     
##  [675] "buenos aires"                            
##  [676] "redwood city"                            
##  [677] "redmond"                                 
##  [678] "edegem"                                  
##  [679] "new york, ny"                            
##  [680] "boston"                                  
##  [681] "india"                                   
##  [682] "remote"                                  
##  [683] "london"                                  
##  [684] "new york"                                
##  [685] "cambridge, ma"                           
##  [686] "cambridge"                               
##  [687] "seattle, wa"                             
##  [688] "huntsville, al"                          
##  [689] "carson city, nv"                         
##  [690] "uk"                                      
##  [691] "seattle"                                 
##  [692] "folsom, ca"                              
##  [693] "stockholm"                               
##  [694] "cupertino"                               
##  [695] "new york"                                
##  [696] "roseville, ca"                           
##  [697] "oakland"                                 
##  [698] "belgium"                                 
##  [699] "folsom, ca"                              
##  [700] "livermore, ca"                           
##  [701] "sf"                                      
##  [702] "mtv"                                     
##  [703] "palo alto"                               
##  [704] "austin, tx"                              
##  [705] "chicago il"                              
##  [706] "england  manchester"                     
##  [707] "copenhagen, denmark"                     
##  [708] "kirkland wa usa"                         
##  [709] "los angeles"                             
##  [710] "austin"                                  
##  [711] "kitchener, canada"                       
##  [712] "pittsburgh, pa"                          
##  [713] "sydney, australia"                       
##  [714] "kirkland, wa"                            
##  [715] "ireland"                                 
##  [716] "olathe, ks"                              
##  [717] "remote"                                  
##  [718] "luxembourg, luxembourg"                  
##  [719] "washington dc"                           
##  [720] "sweden"                                  
##  [721] "san francisco"                           
##  [722] "los angeles"                             
##  [723] "greenville, sc"                          
##  [724] "seattle, wa"                             
##  [725] "san franciaco"                           
##  [726] "brighton, uk"                            
##  [727] "berlin"                                  
##  [728] "chicago, il"                             
##  [729] "redwood city, ca"                        
##  [730] "sweden"                                  
##  [731] "palo alto"                               
##  [732] "san francisco"                           
##  [733] "remote"                                  
##  [734] "austin"                                  
##  [735] "palo alto"                               
##  [736] "amsterdam"                               
##  [737] "easton, pa"                              
##  [738] "frankfurt"                               
##  [739] "mountain view, ca"                       
##  [740] "chicago"                                 
##  [741] "london"                                  
##  [742] "san francisco"                           
##  [743] "menlo park"                              
##  [744] "san francisco"                           
##  [745] "maryland"                                
##  [746] "orange county, ca"                       
##  [747] "seattle"                                 
##  [748] "new york city"                           
##  [749] "london"                                  
##  [750] "boulder"                                 
##  [751] "nyc"                                     
##  [752] "bc, canada"                              
##  [753] "dallas"                                  
##  [754] "menlo park"                              
##  [755] "seattle, wa"                             
##  [756] "mill valley, ca"                         
##  [757] "san francisco, ca"                       
##  [758] "austin, tx"                              
##  [759] "baltimore, md"                           
##  [760] "redmond"                                 
##  [761] "san jose"                                
##  [762] "portland, or"                            
##  [763] "mìnchen"                                 
##  [764] "zurich"                                  
##  [765] "san francisco"                           
##  [766] "toronto, on"                             
##  [767] "toronto, on"                             
##  [768] "madrid"                                  
##  [769] "portland"                                
##  [770] "seattle, wa"                             
##  [771] "bay area"                                
##  [772] "san jose, ca"                            
##  [773] "pleasanton"                              
##  [774] "san francisco"                           
##  [775] "budapest, hungary"                       
##  [776] "new york city"                           
##  [777] "vancouver"                               
##  [778] "san francisco"                           
##  [779] "chicago"                                 
##  [780] "san francisco"                           
##  [781] "scotland"                                
##  [782] "mountain view"                           
##  [783] "sfo"                                     
##  [784] "pasadena, ca"                            
##  [785] "san francisco"                           
##  [786] "mexico"                                  
##  [787] "redwood city"                            
##  [788] "austin, tx"                              
##  [789] "seattle"                                 
##  [790] "ur moms house"                           
##  [791] "dc"                                      
##  [792] "san francisco"                           
##  [793] "location"                                
##  [794] "san francisco"                           
##  [795] "berkeley"                                
##  [796] "nyc"                                     
##  [797] "london"                                  
##  [798] "columbus, oh"                            
##  [799] "austin tx"                               
##  [800] "edmonton, ab"                            
##  [801] "denver"                                  
##  [802] "reno nevafa"                             
##  [803] "bay area"                                
##  [804] "cupertino"                               
##  [805] "austin"                                  
##  [806] "washington, dc"                          
##  [807] "ann arbor, mi"                           
##  [808] "rennes, fr"                              
##  [809] "washington d.c."                         
##  [810] "warren, mi"                              
##  [811] "seattle, wa"                             
##  [812] "chicago"                                 
##  [813] "germany"                                 
##  [814] "zurich"                                  
##  [815] "boston"                                  
##  [816] "san francisco"                           
##  [817] "tampa, fl"                               
##  [818] "redmond"                                 
##  [819] "london"                                  
##  [820] "berlin, germany"                         
##  [821] "baltimore, md"                           
##  [822] "london, uk"                              
##  [823] "seattle"                                 
##  [824] "boston, ma"                              
##  [825] "san jose, ca"                            
##  [826] "nj"                                      
##  [827] "san francisco"                           
##  [828] "boulder, co"                             
##  [829] "madison, wi"                             
##  [830] "san francisco"                           
##  [831] "new york"                                
##  [832] "sf"                                      
##  [833] "virginiawashington area"                 
##  [834] "seattle"                                 
##  [835] "ireland"                                 
##  [836] "seattle"                                 
##  [837] "san francisco"                           
##  [838] "palo alto"                               
##  [839] "redwood city, ca"                        
##  [840] "chandler, az"                            
##  [841] "moscow"                                  
##  [842] "seattle"                                 
##  [843] "seattle"                                 
##  [844] "tarzana, ca"                             
##  [845] "irvine, ca"                              
##  [846] "santa barbara"                           
##  [847] "spain"                                   
##  [848] "san francisco"                           
##  [849] "california"                              
##  [850] "san francisco"                           
##  [851] "toronto, canada"                         
##  [852] "redwood city"                            
##  [853] "chicago"                                 
##  [854] "san jose"                                
##  [855] "san francisco"                           
##  [856] "oxford, uk"                              
##  [857] "london"                                  
##  [858] "sf"                                      
##  [859] "san francisco"                           
##  [860] "norway"                                  
##  [861] "warwickshire"                            
##  [862] "san francisco"                           
##  [863] "uk"                                      
##  [864] "san francisco"                           
##  [865] "redmond, washington"                     
##  [866] "redwood city"                            
##  [867] "princeton, nj"                           
##  [868] "seattle"                                 
##  [869] "copenhagen, denmark"                     
##  [870] "columbia mo"                             
##  [871] "belgrade"                                
##  [872] "seattle, wa"                             
##  [873] "london"                                  
##  [874] "boston"                                  
##  [875] "nyc"                                     
##  [876] "san francisco"                           
##  [877] "palo alto"                               
##  [878] "redmond"                                 
##  [879] "spain"                                   
##  [880] "austin"                                  
##  [881] "san francisco"                           
##  [882] "redwood city, ca"                        
##  [883] "uk, outside london"                      
##  [884] "vancouver"                               
##  [885] "melbourne, australia"                    
##  [886] "uk"                                      
##  [887] "san francisco"                           
##  [888] "not sf"                                  
##  [889] "seattle"                                 
##  [890] "lyon, france"                            
##  [891] "vancouver, bc"                           
##  [892] "new hampshire"                           
##  [893] "ottawa, canada"                          
##  [894] "mountain view, ca"                       
##  [895] "santa clara"                             
##  [896] "mountain view, california"               
##  [897] "mountain view"                           
##  [898] "canada"                                  
##  [899] "austin, tx"                              
##  [900] "san francisco"                           
##  [901] "san diego"                               
##  [902] "remote"                                  
##  [903] "bournemouth, uk"                         
##  [904] "glendale, ca"                            
##  [905] "san diego"                               
##  [906] "michigan"                                
##  [907] "washington, dc"                          
##  [908] "redmond, wa"                             
##  [909] "atlanta, ga"                             
##  [910] "chicago"                                 
##  [911] "new york city"                           
##  [912] "san francisco"                           
##  [913] "houston, tx"                             
##  [914] "austin, tx"                              
##  [915] "san jose, ca"                            
##  [916] "germany"                                 
##  [917] "san francisco"                           
##  [918] "seattle"                                 
##  [919] "toronto, on"                             
##  [920] "toronto"                                 
##  [921] "new york"                                
##  [922] "pittsburgh, pa"                          
##  [923] "seattle, washington"                     
##  [924] "london, uk"                              
##  [925] "sunnyvale, ca"                           
##  [926] "boston, ma"                              
##  [927] "cambridge ma"                            
##  [928] "san jose"                                
##  [929] "atlanta, ga"                             
##  [930] "menlo park"                              
##  [931] "poland"                                  
##  [932] "davis"                                   
##  [933] "frankfurt am main, germany"              
##  [934] "new york, ny"                            
##  [935] "san francisco"                           
##  [936] "oakville, on., canada"                   
##  [937] "limerick, ireland"                       
##  [938] "toronto"                                 
##  [939] "chicago"                                 
##  [940] "toronto"                                 
##  [941] "salt lake city, ut"                      
##  [942] "los angeles, ca"                         
##  [943] "redmond, wa"                             
##  [944] "new york"                                
##  [945] "verona, wi"                              
##  [946] "san francisco"                           
##  [947] "nyc"                                     
##  [948] "pittsburgh"                              
##  [949] "new york"                                
##  [950] "new york"                                
##  [951] "san francisco"                           
##  [952] "los angeles, ca"                         
##  [953] "seattle"                                 
##  [954] "cambridge, ma"                           
##  [955] "sam diego"                               
##  [956] "irvine, ca"                              
##  [957] "new york"                                
##  [958] "cupertino ca"                            
##  [959] "needham, massachusetts"                  
##  [960] "seattle"                                 
##  [961] "seattle"                                 
##  [962] "san francisco"                           
##  [963] "boston, ma"                              
##  [964] "redmond"                                 
##  [965] "burnaby bc"                              
##  [966] "mountain view"                           
##  [967] "san francisco"                           
##  [968] "londnon"                                 
##  [969] "cupertino"                               
##  [970] "san fransisco"                           
##  [971] "columbus, oh"                            
##  [972] "puerto rico"                             
##  [973] "sunnyvale, california"                   
##  [974] "remote"                                  
##  [975] "adelaide, au"                            
##  [976] "dallas, tx"                              
##  [977] "seattle"                                 
##  [978] "bay area"                                
##  [979] "san francisco, ca"                       
##  [980] "nyc"                                     
##  [981] "nebraska"                                
##  [982] "austin, texas"                           
##  [983] "cambridge, ma"                           
##  [984] "kuala lumpur"                            
##  [985] "atlanta"                                 
##  [986] "san francisco"                           
##  [987] "zìrich"                                  
##  [988] "iowa"                                    
##  [989] "mountain view, ca"                       
##  [990] "seattle"                                 
##  [991] "bay area"                                
##  [992] "portland, or"                            
##  [993] "san francisco"                           
##  [994] "milwaukee"                               
##  [995] "redmond"                                 
##  [996] "il"                                      
##  [997] "dc"                                      
##  [998] "orange county"                           
##  [999] "nyc"                                     
## [1000] "guadalajara"                             
## [1001] "new york,  ny"                           
## [1002] "nashville"                               
## [1003] "redmond"                                 
## [1004] "australia"                               
## [1005] "vancouver"                               
## [1006] "new york city"                           
## [1007] "baltimore"                               
## [1008] "dc area"                                 
## [1009] "philippines"                             
## [1010] "minneapolis"                             
## [1011] "boston"                                  
## [1012] "phoenix az"                              
## [1013] "argentina"                               
## [1014] "vancouver bc"                            
## [1015] "brooklyn"                                
## [1016] "vi\u0087èàt nam"                         
## [1017] "mountain view"                           
## [1018] "remote"                                  
## [1019] "kansas city"                             
## [1020] "madrid, spain"                           
## [1021] "bangalore"                               
## [1022] "new york city"                           
## [1023] "redmond, wa"                             
## [1024] "los angeles"                             
## [1025] "scottsdale, az"                          
## [1026] "new zealand"                             
## [1027] "st petersburg, fl"                       
## [1028] "seattle"                                 
## [1029] "boston"                                  
## [1030] "mountain view, ca"                       
## [1031] "sf"                                      
## [1032] "redmond"                                 
## [1033] "houston, tx"                             
## [1034] "colorado"                                
## [1035] "cupertino"                               
## [1036] "bangalore"                               
## [1037] "atlanta, ga"                             
## [1038] "washington, dc"                          
## [1039] "campo grande, mato grosso do sul, brazil"
## [1040] "orlando"                                 
## [1041] "cambridge ma"                            
## [1042] "mountain view, ca"                       
## [1043] "cleveland, oh"                           
## [1044] "singapore"                               
## [1045] "nyc"                                     
## [1046] "columbia, md"                            
## [1047] "san jose"                                
## [1048] "bangalore"                               
## [1049] "porto, portugal"                         
## [1050] "seattle"                                 
## [1051] "des moines, ia"                          
## [1052] "austin, tx"                              
## [1053] "brisbane, australia"                     
## [1054] "san francisco, ca"                       
## [1055] "remote usa"                              
## [1056] "lenexa, ks"                              
## [1057] "philadelphia"                            
## [1058] "mountain view"                           
## [1059] "toronto, canada"                         
## [1060] "beijing"                                 
## [1061] "nyc"                                     
## [1062] "new york"                                
## [1063] "los angeles"                             
## [1064] "san francisco, ca"                       
## [1065] "pittsburgh, pa"                          
## [1066] "los angeles, ca"                         
## [1067] "montreal"                                
## [1068] "new york city"                           
## [1069] "san jose"                                
## [1070] "australia"                               
## [1071] "sydney, au"                              
## [1072] "menlo park, ca"                          
## [1073] "sydney"                                  
## [1074] "atlanta, ga"                             
## [1075] "new york, new york"                      
## [1076] "salt lake city, ut"                      
## [1077] "seattle"                                 
## [1078] "seattle"                                 
## [1079] "ann arbor, mi"                           
## [1080] "remote"                                  
## [1081] "seattle"                                 
## [1082] "new york"                                
## [1083] "mountain view"                           
## [1084] "san jose, ca"                            
## [1085] "new york"                                
## [1086] "san francisco, ca"                       
## [1087] "singapore"                               
## [1088] "shanghai"                                
## [1089] "palo alto california"                    
## [1090] "boston ma"                               
## [1091] "kansas"                                  
## [1092] "redmond"                                 
## [1093] "mclean, va"                              
## [1094] "denver, co"                              
## [1095] "argentina"                               
## [1096] "vancouver, bc"                           
## [1097] "los angeles"                             
## [1098] "shanghai, china"                         
## [1099] "madison, wi"                             
## [1100] "redmond, washington"                     
## [1101] "san francisco"                           
## [1102] "melbourne"                               
## [1103] "seattle, wa"                             
## [1104] "washington dc"                           
## [1105] "vancouver"                               
## [1106] "san francisco, ca"                       
## [1107] "rochester"                               
## [1108] "san francisco"                           
## [1109] "montreal, qc, canada"                    
## [1110] "washington dc"                           
## [1111] "seattle"                                 
## [1112] "bangalore"                               
## [1113] "los angeles"                             
## [1114] "madison, wi"                             
## [1115] "new york, ny"                            
## [1116] "new zealand"                             
## [1117] "nyc"                                     
## [1118] "grand prairie, tx"                       
## [1119] "norwalk, ct"                             
## [1120] "vancouver"                               
## [1121] "nyc"                                     
## [1122] "edmonton, alberta, canada"               
## [1123] "singapore"                               
## [1124] "austin"                                  
## [1125] "canada"                                  
## [1126] "new york"                                
## [1127] "chicago, il"                             
## [1128] "san jose"                                
## [1129] "ohio"                                    
## [1130] "rio de janeiro"                          
## [1131] "lancaster, pa"                           
## [1132] "seattle, wa"                             
## [1133] "toronto"                                 
## [1134] "washington dc"                           
## [1135] "mountain view"                           
## [1136] "jakarta"                                 
## [1137] "foster city, ca"                         
## [1138] "knoxviile, tn"                           
## [1139] "oregon"                                  
## [1140] "new york, ny"                            
## [1141] "anacortes, wa"                           
## [1142] "laurel, md, usa"                         
## [1143] "chicago"                                 
## [1144] "williamsburg"                            
## [1145] "australia"                               
## [1146] "riverside"                               
## [1147] "minneapolis"                             
## [1148] "oakland, ca"                             
## [1149] "san francisco"                           
## [1150] "san francisco"                           
## [1151] "minneapolis, minnesota"                  
## [1152] "los angeles"                             
## [1153] "nyc"                                     
## [1154] "pune"                                    
## [1155] "portland, or"                            
## [1156] "sydney, australia"                       
## [1157] "falls church, va, usa"                   
## [1158] "falls church, va, usa"                   
## [1159] "mountain view"                           
## [1160] "columbus, ohio"                          
## [1161] "omaha"                                   
## [1162] "sf"                                      
## [1163] "san francisco"                           
## [1164] "nyc"                                     
## [1165] "redmond"                                 
## [1166] "norfolk"                                 
## [1167] "bellingham, wa"                          
## [1168] "san francisco"                           
## [1169] "los angeles, ca"                         
## [1170] "san francisco"                           
## [1171] "san francisco"                           
## [1172] "denver"                                  
## [1173] "corvallis, or"                           
## [1174] "seattle"                                 
## [1175] "redwood shores"                          
## [1176] "boulder, colorado"                       
## [1177] "india"                                   
## [1178] "kansas city, mo"                         
## [1179] "cupertino, ca"                           
## [1180] "hyderabad, india"                        
## [1181] "sf"                                      
## [1182] "redmond, wa"                             
## [1183] "foster city"                             
## [1184] "washington dc"                           
## [1185] "auckland"                                
## [1186] "southern california"                     
## [1187] "hanoi, vietnam"                          
## [1188] "new york"                                
## [1189] "bloomington, il"                         
## [1190] "new york"                                
## [1191] "nyc"                                     
## [1192] "jakarta"                                 
## [1193] "detroit, mi"                             
## [1194] "san francisco"                           
## [1195] "alameda"                                 
## [1196] "san francisco"                           
## [1197] "san francisco"                           
## [1198] "hillsboro, oregon"                       
## [1199] "houston"                                 
## [1200] "seattle"                                 
## [1201] "minneapolis mn"                          
## [1202] "chicago, il"                             
## [1203] "chicago, il"                             
## [1204] "greater boston area"                     
## [1205] "silicon valley, ca"                      
## [1206] "boston, massachusetts"                   
## [1207] "bangalore"                               
## [1208] "irvine"                                  
## [1209] "mountain view"                           
## [1210] "las vegas"                               
## [1211] "redmond, wa"                             
## [1212] "new york"                                
## [1213] "nyc"                                     
## [1214] "mountain view"                           
## [1215] "redmond, wa, usa"                        
## [1216] "redmond, wa"                             
## [1217] "san bruno"                               
## [1218] "sydney"                                  
## [1219] "mtv"                                     
## [1220] "new york"                                
## [1221] "seattle"                                 
## [1222] "bay area"                                
## [1223] "san francisco"                           
## [1224] "pune, india"                             
## [1225] "korea"                                   
## [1226] "palo alto"                               
## [1227] "san francisco"                           
## [1228] "nyc"                                     
## [1229] "san diego"                               
## [1230] "san francisco"                           
## [1231] "seattle"                                 
## [1232] "new york"                                
## [1233] "austin, tx"                              
## [1234] "san francisco"                           
## [1235] "london"                                  
## [1236] "cyprus"                                  
## [1237] "remote/everywhere/usa"                   
## [1238] ""                                        
## [1239] "san francisco, ca"                       
## [1240] "cyprus"                                  
## [1241] "boston"                                  
## [1242] "bellevue wa"                             
## [1243] "nyc"                                     
## [1244] "san francisco"                           
## [1245] "mountain view"                           
## [1246] "redwood"                                 
## [1247] "pittsburgh, pa"                          
## [1248] "seattle"                                 
## [1249] "mumbai"                                  
## [1250] "silicon valley"                          
## [1251] "mumbai, india"                           
## [1252] "mountain view"                           
## [1253] "seattle"                                 
## [1254] "los gatos, ca"                           
## [1255] "mountain view, ca"                       
## [1256] "washington d.c."                         
## [1257] "new york city"                           
## [1258] "shanghai"                                
## [1259] "san bruno"                               
## [1260] "auckland, new zealand"                   
## [1261] "san francisco, ca"                       
## [1262] "everett"                                 
## [1263] "los gatos"                               
## [1264] "ridgecrest"                              
## [1265] "chennai"                                 
## [1266] "bangalore, india"                        
## [1267] "dallas texas usa"                        
## [1268] "sf"                                      
## [1269] "san francisco"                           
## [1270] "sunnyvale"                               
## [1271] "russia, spb"                             
## [1272] "bangalore, india"                        
## [1273] "singapore"                               
## [1274] "bangalore india"                         
## [1275] "bangalore, india"                        
## [1276] "san francisco"                           
## [1277] "mountain view"                           
## [1278] "seattle"                                 
## [1279] "seattle"                                 
## [1280] "san francisco"                           
## [1281] "san jose"                                
## [1282] "san francisco"                           
## [1283] "singapore"                               
## [1284] "sunnyvale"                               
## [1285] "redwood city, ca"                        
## [1286] "san francisco"                           
## [1287] "pune, india"                             
## [1288] "california"                              
## [1289] "boulder"                                 
## [1290] "san francisco"                           
## [1291] "asia"                                    
## [1292] "san francisco"                           
## [1293] "livermore"                               
## [1294] "not a coast"                             
## [1295] "mountain view"                           
## [1296] "sunnyvale"                               
## [1297] "france"                                  
## [1298] "denver"                                  
## [1299] "sacramento, ca"                          
## [1300] "germany"                                 
## [1301] "san francisco"                           
## [1302] "sunnyvale"                               
## [1303] "mountain view"                           
## [1304] "\u008aü¾µá"                              
## [1305] "sf"                                      
## [1306] "cupertino"                               
## [1307] "livermore, california"                   
## [1308] "san francisco"                           
## [1309] "mountain view"                           
## [1310] "seattle, wa"                             
## [1311] "boston"                                  
## [1312] "san francisco, ca"                       
## [1313] "manila, philippines"                     
## [1314] "san ramon, ca"                           
## [1315] "seattle, wa"                             
## [1316] "athens, greece"                          
## [1317] "san jose"                                
## [1318] "karachi, pakistan"                       
## [1319] "menlo park"                              
## [1320] "secret"                                  
## [1321] "san jose"                                
## [1322] "eindhoven, the netherlands"              
## [1323] "seattle"                                 
## [1324] "here"                                    
## [1325] "chennai"                                 
## [1326] "abu dhabi"                               
## [1327] "sunnyvale, ca"                           
## [1328] "kenya"                                   
## [1329] "chennai"                                 
## [1330] "bern"                                    
## [1331] "hanoi, vietnam"                          
## [1332] "mountain view"                           
## [1333] "singapore"                               
## [1334] "dubin"                                   
## [1335] "san francisco"                           
## [1336] "cupertino"                               
## [1337] "san francisco"                           
## [1338] "richmond, surrey"                        
## [1339] "palo alto"                               
## [1340] "location"                                
## [1341] "seattle, wa"                             
## [1342] "nairobi"                                 
## [1343] "chennai"                                 
## [1344] "netherlands"                             
## [1345] "sf bay area"                             
## [1346] "pleasanton, ca"                          
## [1347] "geneva, switzerland"                     
## [1348] "berlin, germany"                         
## [1349] "san francisco"                           
## [1350] "san francisco, ca"                       
## [1351] "glasgow"                                 
## [1352] "dhaka, bangladesh"                       
## [1353] "spain"                                   
## [1354] "berlin"                                  
## [1355] "london"                                  
## [1356] "london"                                  
## [1357] "melbourne, australia"                    
## [1358] "san francisco"                           
## [1359] "singapore"                               
## [1360] "chisinau, moldova"                       
## [1361] "bangalore"                               
## [1362] "hong kong"                               
## [1363] "bonn"                                    
## [1364] "remote"                                  
## [1365] "stockholm, sweden"                       
## [1366] "vietnam"                                 
## [1367] "mountain view, ca"                       
## [1368] "berlin"                                  
## [1369] "michigan"                                
## [1370] "ssf"                                     
## [1371] "germany, brunswick"                      
## [1372] "portland, or"                            
## [1373] "chicago, il"                             
## [1374] "sydney"                                  
## [1375] "morocco"                                 
## [1376] "constanta, romania"                      
## [1377] "the netherlands"                         
## [1378] "amsterdam"                               
## [1379] "sydney"                                  
## [1380] "italy"                                   
## [1381] "turin, italy"                            
## [1382] "menlo park, ca"                          
## [1383] "london"                                  
## [1384] "manchester, uk"                          
## [1385] "israel"                                  
## [1386] "casablanca, morocco"                     
## [1387] "london, uk"                              
## [1388] "barcelona, spain"                        
## [1389] "berlin"                                  
## [1390] "copenhagen, denmark"                     
## [1391] "remote"                                  
## [1392] "moscow, russia"                          
## [1393] "pa"                                      
## [1394] "singapore"                               
## [1395] "kiev"                                    
## [1396] "germany"                                 
## [1397] "clujnapoca, romania"                     
## [1398] "munich, germany"                         
## [1399] "dalas"                                   
## [1400] "athens, greece remote"                   
## [1401] "greenville, sc"                          
## [1402] "london"                                  
## [1403] "ireland"                                 
## [1404] "south africa"                            
## [1405] "switzerland"                             
## [1406] "london"                                  
## [1407] "seattle, wa"                             
## [1408] "istanbul"                                
## [1409] "dc"                                      
## [1410] "melbourne"                               
## [1411] "arbedo"                                  
## [1412] "brasil"                                  
## [1413] "milan, italy"                            
## [1414] "cramlington, uk"                         
## [1415] "denmark"                                 
## [1416] "wroclaw"                                 
## [1417] "uk"                                      
## [1418] "amsterdam"                               
## [1419] "paris"                                   
## [1420] "edinburgh"                               
## [1421] "strasbourg"                              
## [1422] "clujnapoca, romania"                     
## [1423] "ghgh"                                    
## [1424] "asdf"                                    
## [1425] "bali"                                    
## [1426] "prague"                                  
## [1427] "didu"                                    
## [1428] "stockholm, sweden"                       
## [1429] "brussels"                                
## [1430] "london"                                  
## [1431] "reading, uk"                             
## [1432] "lund, sweden"                            
## [1433] "cape town, south africa"                 
## [1434] "west yorkshire, uk"                      
## [1435] "paris"                                   
## [1436] "new haven, ct, usa"                      
## [1437] "san francisco"                           
## [1438] "dublin"                                  
## [1439] "frankfurt"                               
## [1440] "toronto"                                 
## [1441] "noida"                                   
## [1442] "milwaukee, wi"                           
## [1443] "london"                                  
## [1444] "atlanta"                                 
## [1445] "france"                                  
## [1446] "manchester, uk"                          
## [1447] "bermuda"                                 
## [1448] "chattanooga, tn"                         
## [1449] "bonn"                                    
## [1450] "nw"                                      
## [1451] "italy"                                   
## [1452] "switzerland zurich"                      
## [1453] "france"                                  
## [1454] "boca raton, fl"                          
## [1455] "hatfield, pa"                            
## [1456] "richfield mn"                            
## [1457] "india"                                   
## [1458] "mumbai"                                  
## [1459] "san francisco"                           
## [1460] "boston"                                  
## [1461] "paris, france"                           
## [1462] "toronto"                                 
## [1463] "montvale nj"                             
## [1464] "indianapolis in"                         
## [1465] "chicago"                                 
## [1466] "madrid, spain"                           
## [1467] "germany"                                 
## [1468] "milan"                                   
## [1469] "munich, germany"                         
## [1470] "moscow, russia"                          
## [1471] "grand rapids, mi"                        
## [1472] "krakow"                                  
## [1473] "hamburg"                                 
## [1474] "toronto"                                 
## [1475] "new york"                                
## [1476] "kentucky"                                
## [1477] "grenoble, france"                        
## [1478] "slovenia"                                
## [1479] "london, uk"                              
## [1480] "montreal"                                
## [1481] "poughkeepsie"                            
## [1482] "pittsburgh, pa"                          
## [1483] "jblm, wa"                                
## [1484] "pune"                                    
## [1485] "singapore"                               
## [1486] "milan"                                   
## [1487] "mountain view"                           
## [1488] "boston, ma"                              
## [1489] "london"                                  
## [1490] "austin, tx"                              
## [1491] "cambridge, ma"                           
## [1492] "bristol, uk"                             
## [1493] "sweden"                                  
## [1494] "argentina"                               
## [1495] "nyc"                                     
## [1496] "boston"                                  
## [1497] "atlanta"                                 
## [1498] "berlin"                                  
## [1499] "clujnapoca, romania"                     
## [1500] "sao paulo"                               
## [1501] "austin, tx"                              
## [1502] "winnipeg, mb canada"                     
## [1503] "nyc"                                     
## [1504] "durham, nc"                              
## [1505] "strasbourg"                              
## [1506] "amsterdam"                               
## [1507] "new york city"                           
## [1508] "paris"                                   
## [1509] "dc metro area"                           
## [1510] "dallas, tx"                              
## [1511] "remote"                                  
## [1512] "melbourne"                               
## [1513] "phoenix, az"                             
## [1514] "new york city"                           
## [1515] "jì¦nkì¦ping, sweden"                     
## [1516] "munich"                                  
## [1517] "durham, nc"                              
## [1518] "leicester"                               
## [1519] "vermont"                                 
## [1520] "exeter, uk"                              
## [1521] "columbus, oh"                            
## [1522] "oklahoma city"                           
## [1523] "menlo park"                              
## [1524] "uk"                                      
## [1525] "kansas city, mo"                         
## [1526] "remote usa"                              
## [1527] "orlando"                                 
## [1528] "irving, tx"                              
## [1529] "minneapolis"                             
## [1530] "london"                                  
## [1531] "st. louis"                               
## [1532] "moscow"                                  
## [1533] "st petersburg, fl"                       
## [1534] "minneapolis"                             
## [1535] "nj, usa"                                 
## [1536] "minneapolis, mn"                         
## [1537] "wallops island, virginia"                
## [1538] "atlanta"                                 
## [1539] "minneapolis, mn"                         
## [1540] "buenos aires"                            
## [1541] "london"                                  
## [1542] "geneva"                                  
## [1543] "seattle, wa"                             
## [1544] "montreal, quebec, canada"                
## [1545] "sunnyvale"                               
## [1546] "santa clara, cuba"                       
## [1547] "philadelphia, pa"                        
## [1548] "ann arbor, michigan"                     
## [1549] "aveiro, portugal"                        
## [1550] "london"                                  
## [1551] "sunnyvale"                               
## [1552] "serbia"                                  
## [1553] "new york, ny"                            
## [1554] "glasgow, uk"                             
## [1555] "netherlands"                             
## [1556] "asdf"                                    
## [1557] "san francisco"                           
## [1558] "portland, or"                            
## [1559] "cambridge, ma"                           
## [1560] "rolla, mo"                               
## [1561] "shillington pa"                          
## [1562] "british columbia"                        
## [1563] "menlo park"                              
## [1564] "madison, wi"                             
## [1565] "taiwan"                                  
## [1566] "augusta, me"                             
## [1567] "san francisco, ca"                       
## [1568] "baltimore"                               
## [1569] "radford, va"                             
## [1570] "redmond"                                 
## [1571] "san francisco"                           
## [1572] "mountain view"                           
## [1573] "san francisco"                           
## [1574] "seattle, wa"                             
## [1575] "aveiro, portugal"                        
## [1576] "palo alto, ca"                           
## [1577] "culver city, ca"                         
## [1578] "mountain view"                           
## [1579] "usa"                                     
## [1580] "menlo park, ca"                          
## [1581] "sunnyvale"                               
## [1582] "philadelphia, pa"                        
## [1583] "san jose"                                
## [1584] "san francisco"                           
## [1585] "greer"                                   
## [1586] "greer"                                   
## [1587] "new york"                                
## [1588] "cupertino"                               
## [1589] "chicago, il"                             
## [1590] "sewttle"                                 
## [1591] "pisa, italy"                             
## [1592] "nj,  minutes from nyc"                   
## [1593] "baltimore, md"                           
## [1594] "bangalore"                               
## [1595] "fort lauderdale"                         
## [1596] "\u008eç÷\u008eýã"                        
## [1597] "eire"                                    
## [1598] "berlin, germany"                         
## [1599] "frankfurt, germany"                      
## [1600] "austin, tx"                              
## [1601] "netherlands"                             
## [1602] "bellevue, wa, usa"                       
## [1603] "ireland"                                 
## [1604] "ireland"                                 
## [1605] "provo"                                   
## [1606] "south san francisco, ca"                 
## [1607] "ireland"                                 
## [1608] "raleigh, nc"                             
## [1609] "boston, ma"                              
## [1610] "uk"                                      
## [1611] "boston, ma"                              
## [1612] "ottawa, canada"                          
## [1613] "madrid es"                               
## [1614] "product engineer"                        
## [1615] "sf"                                      
## [1616] "san jose"                                
## [1617] "san jose"                                
## [1618] "seattle"                                 
## [1619] "sydney, australia"                       
## [1620] "san francisco, ca"                       
## [1621] "bangalore,india"                         
## [1622] "seattle, wa"                             
## [1623] "redmond, wa"                             
## [1624] "san francisco"                           
## [1625] "beijing"                                 
## [1626] "toronto"                                 
## [1627] "san francisco"                           
## [1628] "buffalo"                                 
## [1629] "san francisco"                           
## [1630] "sf"                                      
## [1631] "san jose"                                
## [1632] "kitchener, on"                           
## [1633] "boulder, co"                             
## [1634] "los angeles ca"                          
## [1635] "menlo park"                              
## [1636] "zurich"                                  
## [1637] "poland, warsaw"                          
## [1638] "vienna"                                  
## [1639] "pune, india"                             
## [1640] "\u0093ãï\u0093ü"                         
## [1641] "\u0093\u009dã\u0093¤û\u0091áï"           
## [1642] "atlanta, ga"                             
## [1643] "\u0092î\u0090\u0090µ\u0090"              
## [1644] "khi"                                     
## [1645] "cambridge, ma"                           
## [1646] "\u0091ã´\u0093ã"                         
## [1647] "exton, pa"                               
## [1648] "philadelphia, pa"                        
## [1649] "menlo park"                              
## [1650] "beaverton, or"                           
## [1651] "boulder, co"                             
## [1652] "europe"                                  
## [1653] "nyc"                                     
## [1654] "college park, md"                        
## [1655] "sunnyvale"

Using longitude and latitude to explore how salaries vary by location

# Keep only the coordinates and the base pay, and drop every row in which
# any of these three values is missing.
df_lat_long_sal <- df %>%
  select(location_latitude, location_longitude, annual_base_pay) %>%
  na.omit()
df_lat_long_sal
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
# Get the map tile for the area of interest.
# get_stamenmap() has no `location` argument: it was silently ignored, the
# function's default bounding box was downloaded instead, and most points
# fell outside the map. The area is therefore described with `bbox`, built
# from the range of the coordinates so that every point is covered.
lon_range <- range(df_lat_long_sal$location_longitude)
lat_range <- range(df_lat_long_sal$location_latitude)

# Small margin so that points on the edge are not cropped; clamped to the
# limits a web-mercator tile map can display.
lon_pad <- 0.02 * diff(lon_range)
lat_pad <- 0.02 * diff(lat_range)
map_bbox <- c(
  left = max(lon_range[1] - lon_pad, -180),
  bottom = max(lat_range[1] - lat_pad, -85),
  right = min(lon_range[2] + lon_pad, 180),
  top = min(lat_range[2] + lat_pad, 85)
)

# The reported locations span several continents, so a low (world-level)
# zoom is used; a city-level zoom such as 10 would need far too many tiles
# for a bounding box this wide. Raise it if the data covers a smaller area.
map_tile <- get_stamenmap(bbox = map_bbox, zoom = 3)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# Plot the map tile using ggmap, colouring each location by its base pay
ggmap(map_tile) +
  geom_point(
    aes(
      x = location_longitude,
      y = location_latitude,
      color = annual_base_pay
    ),
    data = df_lat_long_sal
  )

Checking the distribution of annual base pay in the cleaned dataset

# Keep only the rows whose base pay lies strictly inside the Tukey fences
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR), i.e. remove the boxplot outliers.
x <- df_lat_long_sal %>%
  filter(
    annual_base_pay >
      quantile(annual_base_pay, 0.25) - 1.5 * IQR(annual_base_pay),
    annual_base_pay <
      quantile(annual_base_pay, 0.75) + 1.5 * IQR(annual_base_pay)
  )

# Boxplot of the remaining salaries
ggplot(data = x, mapping = aes(annual_base_pay)) +
  geom_boxplot()

# Histogram of annual base pay, restricted to values below the 95th percentile
df_lat_long_sal %>%
  filter(annual_base_pay < quantile(annual_base_pay, 0.95)) %>%
  ggplot(aes(x = annual_base_pay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Number of observations left after outlier removal.
# length() on a data frame counts its columns (it printed 3 here), so nrow()
# is used to get the row count; re-run to record the value.
nrow(x)
# Histogram of the outlier-trimmed base pay, with the mean drawn as a dashed line
ggplot(x, aes(x = annual_base_pay)) +
  geom_histogram(binwidth = 5000, fill = "grey", color = "black") +
  # `linewidth` replaces `size` for lines (`size` is deprecated since ggplot2 3.4.0)
  geom_vline(
    aes(xintercept = mean(annual_base_pay), color = "mean"),
    linetype = "dashed",
    linewidth = 1
  ) +
  ggtitle("Annual base pay distribution after removing outliers") +
  xlab("Annual Base Pay (in USD)") +
  ylab("Frequency") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(name = "", values = c(mean = "blue"), labels = c("Mean"))

# Get the map tile for the area of interest.
# get_stamenmap() has no `location` argument: it was ignored with a warning, the
# default bounding box was fetched, and most points fell outside the plotted map.
# The area is therefore given as a bounding box around the observed coordinates.
# A low zoom is used because the box is as large as the spread of the data.
map_tile <- get_stamenmap(
  bbox = make_bbox(location_longitude, location_latitude, data = x),
  zoom = 3
)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# Plot the map tile using ggmap, sizing each point by base pay
ggmap(map_tile) +
  geom_point(
    aes(x = location_longitude, y = location_latitude, size = annual_base_pay),
    data = x
  )

Creating an interactive map

# load leaflet package
library(leaflet)

# create the map
# "YlOrRd" is a ColorBrewer palette name, not a colour value, so it cannot be
# passed to `color` directly. It is turned into a palette function and applied
# to annual_base_pay. Quartile bins are used because pay is heavily right-skewed.
pay_pal <- colorQuantile("YlOrRd", domain = df_lat_long_sal$annual_base_pay, n = 4)
leaflet(data = df_lat_long_sal) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~location_longitude,
    lat = ~location_latitude,
    color = ~pay_pal(annual_base_pay),
    radius = 4
  ) %>%
  setView(
    lng = mean(df_lat_long_sal$location_longitude),
    lat = mean(df_lat_long_sal$location_latitude),
    zoom = 10
  )
# load leaflet package
library(leaflet)

# Bucket annual base pay into four ordered salary bands
df_lat_long_sal[["salary_group"]] <- cut(
  df_lat_long_sal[["annual_base_pay"]],
  breaks = c(0, 50000, 75000, 100000, Inf),
  labels = c("low: <50k", "medium: <75k", "high: <100k", "very high: >100k"),
  right = TRUE,
  include.lowest = TRUE
)

# Colour palette for the salary bands
pal <- colorFactor(
  palette = c("low" = "green", "medium" = "yellow", "high" = "orange", "very high" = "red"),
  domain = df_lat_long_sal[["salary_group"]]
)

# Wide-view map: one marker per observation, coloured by salary band
leaflet(data = df_lat_long_sal) %>%
  addTiles() %>%
  addCircleMarkers(
    data = df_lat_long_sal,
    lng = ~location_longitude,
    lat = ~location_latitude,
    radius = 4,
    color = pal(df_lat_long_sal[["salary_group"]])
  ) %>%
  setView(
    lng = mean(df_lat_long_sal[["location_longitude"]]) + 40,
    lat = mean(df_lat_long_sal[["location_latitude"]]),
    zoom = 1.5
  ) %>%
  addLegend(
    position = "bottomright",
    title = "Salary Group",
    pal = pal,
    values = df_lat_long_sal[["salary_group"]]
  )

Are experienced people getting the max salary ? ( Salary influenced by experience)

# Pay, both experience measures and employer; incomplete rows are dropped
df_sal_exp <- df %>%
  select(
    annual_base_pay,
    total_experience_years,
    employer_experience_years,
    employer_name
  ) %>%
  na.omit()
df_sal_exp
# Create a violin plot of salary by experience level
library(ggplot2)
ggplot(df_sal_exp, aes(total_experience_years, annual_base_pay)) +
  geom_violin(fill = "blue") +
  geom_jitter(color = "black", width = 0.1) +
  labs(
    title = "Salary Distribution by Experience Years",
    x = "Experience Years",
    y = "Annual Base Pay"
  )

library(dplyr)

# remove outliers: within each experience value keep pay below that group's
# 95th percentile. The strict `<` always drops the largest pay of a group, so
# a group with a single row is removed entirely.
df_sal_exp_no_outliers <- df_sal_exp %>%
  group_by(total_experience_years) %>%
  filter(annual_base_pay < quantile(annual_base_pay, 0.95)) %>%
  # drop the grouping so later verbs on this data are not silently per-group
  ungroup()

# plot violin plot
ggplot(data = df_sal_exp_no_outliers, aes(x = total_experience_years, y = annual_base_pay)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75))

# Overwrite df_sal_exp with the rows below the overall 95th percentile of pay
df_sal_exp <- filter(
  df_sal_exp,
  annual_base_pay < quantile(annual_base_pay, 0.95)
)

# Bin total experience into five groups (right-closed, lowest value included)
df_sal_exp[["experience_group"]] <- cut(
  df_sal_exp[["total_experience_years"]],
  breaks = c(0, 5, 10, 15, 20, Inf),
  labels = c("0-5", "6-10", "11-15", "16-20", ">20"),
  right = TRUE,
  include.lowest = TRUE
)
  

# One violin per experience group, with quartile lines drawn inside each
ggplot(df_sal_exp, aes(experience_group, annual_base_pay)) +
  geom_violin(
    aes(fill = experience_group),
    draw_quantiles = c(0.25, 0.5, 0.75),
    trim = TRUE,
    color = "black"
  ) +
  scale_fill_manual(values = c("#0000FF", "#00FF00", "#FFFF00", "#FFA500", "#FF0000")) +
  labs(
    title = "Salary distribution by experience group",
    x = "Experience Group",
    y = "Annual Base Pay"
  )

# Spread of total experience
ggplot(df_sal_exp, aes(x = total_experience_years)) +
  geom_boxplot()

# Pay against experience with a fitted straight line
ggplot(df_sal_exp, aes(total_experience_years, annual_base_pay)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

If the line is nearly horizontal, it means that there is little or no correlation between the two variables.

#Wilcoxon rank-sum test
# Two-level split: more than 10 years of total experience versus 10 or fewer
experience_group <- ifelse(
  df_sal_exp$total_experience_years > 10,
  "greater than 10 years",
  "less than or equal to 10 years"
)
wilcox.test(df_sal_exp$annual_base_pay ~ experience_group)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  df_sal_exp$annual_base_pay by experience_group
## W = 209702, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
#Kruskal-Wallis test
# Same question across the five-level experience_group column
kruskal.test(df_sal_exp$annual_base_pay ~ df_sal_exp$experience_group)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  df_sal_exp$annual_base_pay by df_sal_exp$experience_group
## Kruskal-Wallis chi-squared = 117.57, df = 4, p-value < 2.2e-16

The Kruskal-Wallis rank sum test is a non-parametric method for comparing the central tendency of two or more groups. It is used when the data is not normally distributed and the groups being compared have different sample sizes.

In this case, the test is being used to compare the salary distributions of different experience groups (df_sal_exp$annual_base_pay by df_sal_exp$experience_group). The test statistic, Kruskal-Wallis chi-squared, is calculated as 117.57, with 4 degrees of freedom. The p-value is less than 2.2e-16, which is far below 0.05. This suggests that there is a statistically significant difference in the salary distributions of the different experience groups, and that experience level does influence salary.

—– XXX —- The Wilcoxon rank sum test with continuity correction is a non-parametric test that is used to determine whether there are significant differences in the median of two groups. The test statistic is W and the p-value represents the probability of observing a W statistic as extreme or more extreme than the one observed, assuming that the null hypothesis (i.e., no significant difference in median) is true. A small p-value (typically less than 0.05) suggests that there is evidence to reject the null hypothesis and conclude that there is a significant difference in median between the two groups.

————-xxxx——— # Which kind of job gets paid well ?

# Pay together with employer and job category
df_sal_type <- df %>%
  select(annual_base_pay, employer_name, job_title_category)
df_sal_type
unique(df_sal_type[["job_title_category"]])
## [1] "Engineering"     "Software"        "Other"           "Web"            
## [5] "Data"            "Management"      "Operations"      "Applied Science"
# Complete rows below the 95th percentile of pay, one boxplot per category
x <- df_sal_type %>%
  na.omit() %>%
  filter(annual_base_pay < quantile(annual_base_pay, 0.95))
ggplot(x, aes(x = annual_base_pay, y = job_title_category)) +
  geom_boxplot()

——— xxxxx —————— # Trying to use 1.5*IQR for Experience years

# remove outliers: within each experience value keep pay strictly inside the
# Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR). A group with zero IQR (e.g. a single
# row) has equal fences, so the strict comparisons remove all of its rows.
df_sal_exp_no_outliers_n <- df_sal_exp %>%
  group_by(total_experience_years) %>%
  filter(
    annual_base_pay > quantile(annual_base_pay, 0.25) - 1.5 * IQR(annual_base_pay) &
      annual_base_pay < quantile(annual_base_pay, 0.75) + 1.5 * IQR(annual_base_pay)
  ) %>%
  # drop the grouping so later verbs on this data are not silently per-group
  ungroup()

# plot violin plot
ggplot(data = df_sal_exp_no_outliers_n, aes(x = total_experience_years, y = annual_base_pay)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75))

# Ungrouped version: keep pay strictly inside the overall Tukey fences
df_sal_exp_n <- df_sal_exp %>%
  filter(
    annual_base_pay > quantile(annual_base_pay, 0.25) - 1.5 * IQR(annual_base_pay),
    annual_base_pay < quantile(annual_base_pay, 0.75) + 1.5 * IQR(annual_base_pay)
  )

# Bin total experience into five groups (right-closed, lowest value included)
df_sal_exp_n[["experience_group"]] <- cut(
  df_sal_exp_n[["total_experience_years"]],
  breaks = c(0, 5, 10, 15, 20, Inf),
  labels = c("0-5", "6-10", "11-15", "16-20", ">20"),
  right = TRUE,
  include.lowest = TRUE
)

# One violin per experience group, with quartile lines drawn inside each
ggplot(df_sal_exp_n, aes(experience_group, annual_base_pay)) +
  geom_violin(
    aes(fill = experience_group),
    draw_quantiles = c(0.25, 0.5, 0.75),
    trim = TRUE,
    color = "black"
  ) +
  scale_fill_manual(
    name = "Experience (years)",
    values = c("#0000FF", "#00FF00", "#FFFF00", "#FFA500", "#FF0000")
  ) +
  labs(
    title = "Annual Base Pay distribution by experience group",
    x = "Experience Group",
    y = "Annual Base Pay"
  )

#ggplot(df_sal_exp_n,aes(x=total_experience_years,y=annual_base_pay)) + geom_point() + geom_smooth(method = 'lm') 
# Scatter of pay against total experience with a fitted straight line.
# The colours are mapped inside aes() to the legend labels; with constant colours
# and a scale whose names and values were swapped, no legend was produced.
ggplot(df_sal_exp_n, aes(x = total_experience_years, y = annual_base_pay)) +
  geom_point(aes(color = "Data Points")) +
  # `linewidth` replaces `size` for lines (`size` is deprecated since ggplot2 3.4.0)
  geom_smooth(aes(color = "Linear Regression Line"), method = "lm", linewidth = 1) +
  ggtitle("Annual Base Pay vs Total Years of Experience") +
  xlab("Total Years of Experience") + ylab("Annual Base Pay (in USD)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(
    name = "",
    values = c("Data Points" = "blue", "Linear Regression Line" = "red")
  )
## `geom_smooth()` using formula = 'y ~ x'

#Wilcoxon rank-sum test
# Two-level split: more than 10 years of total experience versus 10 or fewer
experience_group <- ifelse(
  df_sal_exp_n$total_experience_years > 10,
  "greater than 10 years",
  "less than or equal to 10 years"
)
wilcox.test(df_sal_exp_n$annual_base_pay ~ experience_group)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  df_sal_exp_n$annual_base_pay by experience_group
## W = 209702, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
#Kruskal-Wallis test
# Same question across the five-level experience_group column
kruskal.test(df_sal_exp_n$annual_base_pay ~ df_sal_exp_n$experience_group)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  df_sal_exp_n$annual_base_pay by df_sal_exp_n$experience_group
## Kruskal-Wallis chi-squared = 117.57, df = 4, p-value < 2.2e-16

The Kruskal-Wallis rank sum test is a non-parametric statistical test that is used to determine whether there are significant differences in the distribution of a continuous variable among different groups.

In this case, the test is being applied to the annual_base_pay variable, which is a continuous variable, and is being grouped by experience_group. The test calculates a chi-squared statistic, which is used to determine whether there is a significant difference in the annual base pay among the different experience groups.

The df (degrees of freedom) is 4, which is the number of groups being compared (5) minus one. The p-value is less than 2.2e-16, which means that, if annual base pay had the same distribution in every experience group, differences at least as large as those observed would occur with a probability below 2.2e-16. Therefore, we can reject the null hypothesis that the annual base pay is the same across all experience groups and conclude that there is a significant difference in annual base pay among the different experience groups.

Summary statistics of variables for the report

# Histogram of total experience, with the mean drawn as a dashed line
ggplot(df_sal_exp_n, aes(x = total_experience_years)) +
  geom_histogram(fill = "orange", color = "black") +
  # `linewidth` replaces `size` for lines (`size` is deprecated since ggplot2 3.4.0)
  geom_vline(
    aes(xintercept = mean(total_experience_years), color = "mean"),
    linetype = "dashed",
    linewidth = 1
  ) +
  ggtitle("Distribution of experience (years) of individuals after removing outliers") +
  xlab("Total Experience (in years)") +
  ylab("Frequency") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(name = "", values = c(mean = "blue"), labels = c("Mean"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Testing normality (using Q-Q plots and Shapiro-Wilk tests)

# Load libraries
library(ggplot2)

# Q-Q plots against the normal distribution, one panel per experience group
df_sal_exp_n %>%
  ggplot(aes(sample = total_experience_years)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(vars(experience_group), ncol = 2)

df_sal_exp_n %>%
  ggplot(aes(sample = annual_base_pay)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(vars(experience_group), ncol = 2)

# Run Shapiro-Wilk test for normality
# Each call tests one column of df_sal_exp_n; the null hypothesis is that the
# column is normally distributed, so a small p-value is evidence against it.
# Total years of experience
shapiro.test(df_sal_exp_n$total_experience_years)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_sal_exp_n$total_experience_years
## W = 0.8465, p-value < 2.2e-16
# Years of experience at the current employer
shapiro.test(df_sal_exp_n$employer_experience_years)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_sal_exp_n$employer_experience_years
## W = 0.63553, p-value < 2.2e-16
# Annual base pay
shapiro.test(df_sal_exp_n$annual_base_pay)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_sal_exp_n$annual_base_pay
## W = 0.99037, p-value = 2.126e-08
# Scatter of pay against experience at the current employer with a fitted line.
# The colours are mapped inside aes() to the legend labels; with constant colours
# and a scale whose names and values were swapped, no legend was produced.
ggplot(df_sal_exp_n, aes(x = employer_experience_years, y = annual_base_pay)) +
  geom_point(aes(color = "Data Points")) +
  # `linewidth` replaces `size` for lines (`size` is deprecated since ggplot2 3.4.0)
  geom_smooth(aes(color = "Linear Regression Line"), method = "lm", linewidth = 1) +
  ggtitle("Annual Base Pay vs Experience at Current Employer ") +
  xlab(" Years of Experience at Current Employer") + ylab("Annual Base Pay (in USD)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(
    name = "",
    values = c("Data Points" = "black", "Linear Regression Line" = "red")
  )
## `geom_smooth()` using formula = 'y ~ x'

# Rank correlation tests (Spearman's rho and Kendall's tau, not Pearson)
# Each call correlates one experience measure with annual base pay.

# Spearman: total experience vs pay
cor.test(df_sal_exp_n$total_experience_years, df_sal_exp_n$annual_base_pay,method = "spearman")
## Warning in cor.test.default(df_sal_exp_n$total_experience_years,
## df_sal_exp_n$annual_base_pay, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_sal_exp_n$total_experience_years and df_sal_exp_n$annual_base_pay
## S = 402076136, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2951122
# Spearman: experience at current employer vs pay
cor.test(df_sal_exp_n$employer_experience_years, df_sal_exp_n$annual_base_pay,method = "spearman")
## Warning in cor.test.default(df_sal_exp_n$employer_experience_years,
## df_sal_exp_n$annual_base_pay, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_sal_exp_n$employer_experience_years and df_sal_exp_n$annual_base_pay
## S = 512833984, p-value = 8.662e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1009404
# Kendall: total experience vs pay
cor.test(df_sal_exp_n$total_experience_years, df_sal_exp_n$annual_base_pay,method = "kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  df_sal_exp_n$total_experience_years and df_sal_exp_n$annual_base_pay
## z = 11.495, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##     tau 
## 0.20528
# Kendall: experience at current employer vs pay
cor.test(df_sal_exp_n$employer_experience_years, df_sal_exp_n$annual_base_pay,method = "kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  df_sal_exp_n$employer_experience_years and df_sal_exp_n$annual_base_pay
## z = 3.9715, p-value = 7.142e-05
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## 0.07337516
# Q-Q plots against the normal distribution for the three report variables.
# The x-axis label is spelled "Theoretical" (it previously read "Theoritical").

# Annual base pay
ggplot(df_sal_exp_n, aes(sample = annual_base_pay)) +
  geom_qq() +
  geom_qq_line() +
  xlab("Theoretical") +
  ylab("Sample") +
  ggtitle("Q-Q plot for Annual Base Pay") +
  theme(plot.title = element_text(hjust = 0.5))

# Total years of experience
ggplot(df_sal_exp_n, aes(sample = total_experience_years)) +
  geom_qq() +
  geom_qq_line() +
  xlab("Theoretical") +
  ylab("Sample") +
  ggtitle("Q-Q plot for Total Experience (in years)") +
  theme(plot.title = element_text(hjust = 0.5))

# Years of experience at the current employer
ggplot(df_sal_exp_n, aes(sample = employer_experience_years)) +
  geom_qq() +
  geom_qq_line() +
  xlab("Theoretical") +
  ylab("Sample") +
  ggtitle("Q-Q plot for Experience at Current Employer (in years)") +
  theme(plot.title = element_text(hjust = 0.5))

# Show the rows that report an annual base pay of exactly zero
df_sal_exp_n %>%
  filter(annual_base_pay == 0)

In-depth analysis of the geographical location

#df_lat_long_sal$region <- ifelse(location_longitude<-100,"West",ifelse(location_longitude<-80,"Central",ifelse(location_longitude>`-70,"East","Not in USA")) )

# Label each point: inside the box lat (24, 50) x lon (-125, -65) it is split by
# longitude into West (< -110), East (> -90) or Central; everything else is
# "Outside USA".
df_lat_long_sal$region <- with(
  df_lat_long_sal,
  ifelse(
    location_latitude > 24 & location_latitude < 50 &
      location_longitude > -125 & location_longitude < -65,
    ifelse(
      location_longitude < -110,
      "West USA",
      ifelse(location_longitude > -90, "East USA", "Central USA")
    ),
    "Outside USA"
  )
)

df_lat_long_sal
library(leaflet)
# define color palette for regions
# colorFactor() ignores the names of the palette vector; for a character domain
# it assigns the colours to the sorted unique values, which paired the regions
# with the wrong colours. The pairing is fixed explicitly through `levels`.
region_colours <- c(
  "West USA" = "green",
  "Central USA" = "yellow",
  "Outside USA" = "grey",
  "East USA" = "orange"
)
pal <- colorFactor(
  palette = unname(region_colours),
  domain = NULL,
  levels = names(region_colours)
)

#create the map
leaflet(data = df_lat_long_sal) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~location_longitude,
    lat = ~location_latitude,
    color = pal(df_lat_long_sal$region),
    radius = 4,
    data = df_lat_long_sal
  ) %>%
  setView(
    lng = mean(df_lat_long_sal$location_longitude) + 40,
    lat = mean(df_lat_long_sal$location_latitude),
    zoom = 1.5
  ) %>%
  addLegend(pal = pal, values = df_lat_long_sal$region, title = "Region", position = "bottomright")
# Check if each of them are normally distributed 

library(ggplot2)

# Q-Q plot of base pay for each US region, stacked in a single column
df_lat_long_sal %>%
  filter(region != "Outside USA") %>%
  ggplot(aes(sample = annual_base_pay)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~region, ncol = 1) +
  labs(
    title = "Q-Q Plot of Annual Base Pay by Region",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Normality Tests

# Base pay of each US region as its own one-column data frame
df_west_usa <- df_lat_long_sal %>%
  filter(region == "West USA") %>%
  select(annual_base_pay)

df_east_usa <- df_lat_long_sal %>%
  filter(region == "East USA") %>%
  select(annual_base_pay)

df_central_usa <- df_lat_long_sal %>%
  filter(region == "Central USA") %>%
  select(annual_base_pay)

# Shapiro-Wilk normality test per region
shapiro.test(df_west_usa$annual_base_pay)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_west_usa$annual_base_pay
## W = 0.077868, p-value < 2.2e-16
#length(df_west_usa$annual_base_pay)

# East USA. This call previously re-tested df_west_usa (copy-paste slip), so the
# output recorded here duplicated the West USA result; re-run to obtain it.
shapiro.test(df_east_usa$annual_base_pay)
#length(df_east_usa$annual_base_pay)

# Central USA. Same correction as above; re-run to obtain the output.
shapiro.test(df_central_usa$annual_base_pay)
#length(df_central_usa$annual_base_pay)
library(dplyr)
library(ggplot2)

# Mean base pay and its standard error for each US region
df_lat_long_sal_sum <- df_lat_long_sal %>%
  filter(region != "Outside USA") %>%
  group_by(region) %>%
  summarise(
    mean = mean(annual_base_pay),
    se = sd(annual_base_pay) / sqrt(n())
  )
#df_lat_long_sal_sum
# Point estimate with a normal-approximation 95% interval (mean +/- 1.96 * se)
ggplot(df_lat_long_sal_sum, aes(x = mean, y = region)) +
  geom_errorbarh(aes(xmax = mean + 1.96 * se, xmin = mean - se * 1.96)) +
  geom_point() +
  # nudge_y is a fixed offset, not an aesthetic: inside aes() it was ignored
  # with a warning, so it is passed to geom_label() directly
  geom_label(aes(label = round(mean, 3)), nudge_y = 0.1) +
  ggtitle("Mean Annual Base Pay (Confidence Interval)") +
  xlab("Mean Annual Base Pay ") +
  ylab("Region") +
  theme(plot.title = element_text(hjust = 0.5))

# Number of observations per (region, salary band), US regions only
count_df <- df_lat_long_sal %>%
  count(region, salary_group, sort = FALSE) %>%
  filter(region != "Outside USA")

# Heat map of those counts, with the count printed in each tile
ggplot(count_df, aes(salary_group, region, fill = n)) +
  geom_tile(colour = "white") +
  scale_fill_gradient(name = "No of people", low = "white", high = "steelblue") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_text(aes(label = n), color = "black", size = 5)

# Rows located west of longitude -100
df_lat_long_sal %>%
  filter(location_longitude < -100)
# Five-number summary and mean of base pay
summary(df_lat_long_sal[["annual_base_pay"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0    55000    86400   131774   120000 10280000