Tin Le Mini lesson 10 Lollipop chart on paid wage of software engineers
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(readxl)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library()
sd <- read_excel("E:/Download/csc463/sd.xlsx", col_types = c("text", "text",
"text", "text", "text", "text", "text",
"text", "text", "text", "text", "text",
"text", "text", "text", "text", "text",
"text", "text", "text", "text", "text",
"text", "numeric", "text", "numeric"))
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Coercing text to numeric in X146963 / R146963C24: '45870'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Coercing text to numeric in X164631 / R164631C24: '76700'
#Work did in class
glimpse(sd)
## Observations: 167,278
## Variables: 26
## $ CASE_NUMBER <chr> "I-200-14073-248840", "A-15061-...
## $ CASE_STATUS <chr> "denied", "denied", "denied", "...
## $ CASE_RECEIVED_DATE <chr> "3/14/2014", "3/19/2015", "9/13...
## $ DECISION_DATE <chr> "3/21/2014", "3/19/2015", "9/23...
## $ EMPLOYER_NAME <chr> "ADVANCED TECHNOLOGY GROUP USA,...
## $ PREVAILING_WAGE_SUBMITTED <chr> "6217100", "5067600", "4947000"...
## $ PREVAILING_WAGE_SUBMITTED_UNIT <chr> "year", "year", "year", "month"...
## $ PAID_WAGE_SUBMITTED <chr> "62171", "91440", "49470", "438...
## $ PAID_WAGE_SUBMITTED_UNIT <chr> "year", "year", "year", "year",...
## $ JOB_TITLE <chr> "SOFTWARE ENGINEER", "Assistant...
## $ WORK_CITY <chr> "BLOOMINGTON", "SAN FRANCISCO",...
## $ EDUCATION_LEVEL_REQUIRED <chr> "NA", "Doctorate", "NA", "NA", ...
## $ COLLEGE_MAJOR_REQUIRED <chr> "NA", "marketing", "NA", "NA", ...
## $ EXPERIENCE_REQUIRED_Y_N <chr> "NA", "n", "NA", "NA", "NA", "N...
## $ EXPERIENCE_REQUIRED_NUM_MONTHS <chr> "NA", NA, "NA", "NA", "NA", "NA...
## $ COUNTRY_OF_CITIZENSHIP <chr> "NA", "IRAN", "NA", "NA", "NA",...
## $ PREVAILING_WAGE_SOC_CODE <chr> "15-1132", "25-1011", "25-2052"...
## $ PREVAILING_WAGE_SOC_TITLE <chr> "Software Developers, Applicati...
## $ WORK_STATE <chr> "IL", "CALIFORNIA", "CA", "TX",...
## $ WORK_POSTAL_CODE <chr> "NA", "94132.0", "NA", "NA", "N...
## $ FULL_TIME_POSITION_Y_N <chr> "y", "NA", "y", "y", "y", "y", ...
## $ VISA_CLASS <chr> "H-1B", "greencard", "H-1B", "H...
## $ PREVAILING_WAGE_PER_YEAR <chr> "NA", "NA", "NA", "NA", "NA", "...
## $ PAID_WAGE_PER_YEAR <dbl> 62171, 91440, 49470, 43800, 170...
## $ JOB_TITLE_SUBGROUP <chr> "software engineer", "assistant...
## $ order <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ...
sd$dec_dt = mdy(sd$DECISION_DATE)
sd$dec_yr = year(sd$dec_dt)
sd$full = (sd$FULL_TIME_POSITION_Y_N)
sd$full[is.na(sd$full)] = "Miss"
sum(is.na(sd$full))
## [1] 0
sd$full[sd$full=="NA"] = "Miss"
sd$full = factor(sd$full)
sd$jts = factor(sd$JOB_TITLE_SUBGROUP)
sd$pw = sd$PAID_WAGE_PER_YEAR
sd$WORK_STATE[nchar(sd$WORK_STATE) > 2 ] = "Miss"
softsd <- sd %>%
select(JOB_TITLE_SUBGROUP, WORK_CITY, PAID_WAGE_PER_YEAR) %>%
filter(JOB_TITLE_SUBGROUP == "software engineer") %>%
mutate(WORK_CITY = factor(WORK_CITY),
JOB_TITLE_SUBGROUP = factor(JOB_TITLE_SUBGROUP)) %>%
group_by(WORK_CITY) %>%
summarise(sumPAID_WAGE = mean(PAID_WAGE_PER_YEAR))
softsd <- softsd[order(-softsd$sumPAID_WAGE) , ]
softsd
## # A tibble: 2,723 x 2
## WORK_CITY sumPAID_WAGE
## <fctr> <dbl>
## 1 Los Gatos 210337.5
## 2 LOS GATOS 182190.1
## 3 MOUNTIAN VIEW 170000.0
## 4 LYNBROOK 166400.0
## 5 MANHATTAN 159689.0
## 6 MILLSTONE TOWNSHIP 157321.0
## 7 HAYMARKET 157260.0
## 8 SARATOGA 155516.9
## 9 Lewisville 153170.9
## 10 Los Altos 151833.3
## # ... with 2,713 more rows
top20SoftCity <- head(softsd, n = 20)
ggplot(top20SoftCity, aes(x = reorder(WORK_CITY, sumPAID_WAGE), y = sumPAID_WAGE)) +
geom_point(stat = "identity", fill = "black", size = 6) +
geom_segment(aes(y = 125000,
x = WORK_CITY,
yend = sumPAID_WAGE,
xend = WORK_CITY),
color = "black") + #allows lines leading to point
labs(title = "Lollipop Chart",
subtitle = "TOP 20 cities highest average annual paid wage") +
coord_flip()