#The soft skills we will look will be stored as: 'keywords'
keywords<-unname(unlist(read.csv("https://raw.githubusercontent.com/davidblumenstiel/DATA-607-Project-3/master/softskills.csv", stringsAsFactors = FALSE)))
#Iniates new columns for each keyword; also makes list index list
df[keywords] <- NA
keywordColIndex <- seq(length(df)-length(keywords)+1,length(df),1)
#Will determine if contains keywords; occurs (1), or doesnt occur(0)
#Coerces multiple occurances to 1
i = 0
for (desc in df$Description) {
i = i + 1
df[i,keywordColIndex] <-(as.integer(str_detect(desc,fixed(keywords, ignore_case = TRUE))))
}
#Gets rid of the descriptions to save space
df$Description <- NULL
#Gathers soft skills into a tidy dataset: df2
df2 <- df[,c(1, 44:ncol(df))] %>%
gather(skill, flag, -X) %>%
filter(flag == 1) %>%
select(id = X, skill) %>%
mutate(skill_type = 'soft')
soft_sum_3cols <- soft_summary %>% select(Job_Type,Queried_Salary,sum)
avg_ss_by_jobtype$Job_Type <- factor(avg_ss_by_jobtype$Job_Type,levels = c("data_analyst", "data_scientist", "data_engineer"))
#plot showing number soft skills on average per job type
ggplot(avg_ss_by_jobtype) +
aes(x = Job_Type, weight = x) +
geom_bar(fill = "#2171b5") +
labs(x = "Job Type", y = "avg # of soft skills in job description", title = "Soft Skills Mentioned per Job Type") +
theme_minimal()
#we agregate the dataframe into averages for every combination of salary and job type
skillby_type_sal <-aggregate(x=soft_sum_3cols$sum, by = soft_sum_3cols[c("Job_Type", "Queried_Salary")],
FUN=mean, na.rm=TRUE)
# make variable factors and order them
skillby_type_sal$Queried_Salary <- factor(skillby_type_sal$Queried_Salary,levels = c("<80000", "80000-99999", "100000-119999", "120000-139999", "140000-159999", ">160000"))
skillby_type_sal$Job_Type <- factor(skillby_type_sal$Job_Type,levels = c("data_analyst", "data_scientist", "data_engineer"))
# heatmap of average skills mentioned by job type and salary
ggplot(skillby_type_sal) +
aes(x = Queried_Salary, y = Job_Type, fill = x) +
geom_tile(size = 1L) +
scale_fill_distiller(palette = "PuBu") +
labs(x = "Salary Band", y = "Job Type", title = "Soft Skills Mentioned across Salary and Job Type") +
theme_minimal()
#not used for now bar chart for average # soft skills mentioned by salary
# avg_ss_by_salary$Queried_Salary <- factor(avg_ss_by_salary$Queried_Salary,levels = c("<80000", "80000-99999", "100000-119999", "120000-139999", "140000-159999", ">160000"))
# ggplot(avg_ss_by_salary) +
# aes(x = Queried_Salary, weight = x) +
# geom_bar(fill = "#1f9e89") +
# labs(x = "Salary Band", y = "avg # of soft skills in job description", title = "Soft Skill Mentions per Salary Band") +
# theme_minimal()
eng_highs <- filter(df, Job_Type == "data_engineer" & Queried_Salary == ">160000")
number_eng_highs <- nrow(eng_highs)
eng_highs_long <- eng_highs[,c(44:ncol(eng_highs))] %>%
gather(skill, flag) %>%
filter(flag == 1)
eng_highs_count <-aggregate(x=eng_highs_long$flag, by = eng_highs_long[c("skill")],
FUN=sum, na.rm=TRUE)
eng_highs_count$x <- round(eng_highs_count$x/number_eng_highs,4)
eng_highs_count <- eng_highs_count[order(-eng_highs_count$x),]
rownames(eng_highs_count) <- NULL
anl_lows <- filter(df, Job_Type == "data_analyst" & Queried_Salary == "<80000")
number_anl_lows <- nrow(anl_lows)
anl_lows_long <- anl_lows[,c(44:ncol(eng_highs))] %>%
gather(skill, flag) %>%
filter(flag == 1)
anl_lows_count <-aggregate(x=anl_lows_long$flag, by = anl_lows_long[c("skill")],
FUN=sum, na.rm=TRUE)
anl_lows_count$x <- round(anl_lows_count$x/number_anl_lows,4)
anl_lows_count <- anl_lows_count[order(-anl_lows_count$x),]
rownames(anl_lows_count) <- NULL
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
table_eng_highs <- eng_highs_count[1:22,]
table_anl_lows <- anl_lows_count[1:22,]
names(table_eng_highs)[2] <- "occurance"
names(table_anl_lows)[2] <- "occurance"
table_eng_highs %>%
kable("html",col.names = NA, align = 'clc', caption = 'Top Soft skills (engineers high salary)') %>%
kable_styling(full_width = F, position = "float_left")
skill | occurance |
---|---|
Team | 0.7885 |
Management | 0.3846 |
Problem | 0.3654 |
communication | 0.3077 |
Result | 0.2596 |
Hands-on | 0.2404 |
Insight | 0.2212 |
detail | 0.2019 |
Organization | 0.1923 |
Collaborate | 0.1827 |
Competitive | 0.1731 |
Driven | 0.1731 |
Responsible | 0.1635 |
Collaboration | 0.1442 |
Passionate | 0.1346 |
Problem solving | 0.1250 |
Innovation | 0.1154 |
Influence | 0.1058 |
Leadership | 0.1058 |
Managing | 0.0962 |
Team player | 0.0962 |
Analyzing | 0.0865 |
table_anl_lows %>%
kable("html", align = 'clc', caption = 'Top Soft skills (analyst low salary)') %>%
kable_styling(full_width = F, position = "right")
skill | occurance |
---|---|
Team | 0.6227 |
Management | 0.5386 |
communication | 0.4656 |
Problem | 0.4109 |
Organization | 0.3829 |
detail | 0.3240 |
Result | 0.3128 |
Research | 0.2917 |
Train | 0.2693 |
Presentation | 0.2567 |
Training | 0.2216 |
Responsible | 0.2188 |
Independent | 0.2146 |
Insight | 0.2020 |
Analyzing | 0.1893 |
Attention to detail | 0.1795 |
Collaborate | 0.1697 |
Writing | 0.1697 |
Leadership | 0.1683 |
Interpersonal | 0.1529 |
Strategic | 0.1515 |
Integrity | 0.1487 |
## skill occurance
## 1 Team 0.7885
## 2 Management 0.3846
## 3 Problem 0.3654
## 4 communication 0.3077
## 5 Result 0.2596
## 6 Hands-on 0.2404
## 7 Insight 0.2212
## 8 detail 0.2019
## 9 Organization 0.1923
## 10 Collaborate 0.1827
## 11 Competitive 0.1731
## 12 Driven 0.1731
## 13 Responsible 0.1635
## 14 Collaboration 0.1442
## 15 Passionate 0.1346
## 16 Problem solving 0.1250
## 17 Innovation 0.1154
## 18 Influence 0.1058
## 19 Leadership 0.1058
## 20 Managing 0.0962
## 21 Team player 0.0962
## 22 Analyzing 0.0865
# List all hard skills and how often they were listed in a separate dataframe
for (i in 1:nrow(df_raw)) {
slist <- str_extract_all(df_raw$Skill[i], "'.+?'")
temp_df <- data.frame(skill = slist[[1]])
if (nrow(temp_df) > 0) {
temp_df$id <- i - 1
temp_df$skill <- as.character(temp_df$skill)
temp_df <- select(temp_df, id, skill)
if (i == 1) {
sdf <- temp_df
} else {
sdf <- bind_rows(sdf, temp_df)
}
}
}
# Remove quotes and trim whitespace
sdf$skill <- str_replace_all(sdf$skill, "'", "")
sdf$skill <- str_replace_all(sdf$skill, '"', "")
sdf$skill <- str_trim(sdf$skill)
sdf$skill_type <- 'hard'
# Combine skills
skills <- bind_rows(df2, sdf)
df <- df_raw
job.indeed.df <- df %>%
select(c(Id=X, (Job_Title)))
# Pattern Building:
pattern.analyst <- c('analyst','statistician','analysis','analytics')
pattern.engineer <- c('engineer', 'engg', 'technician','technologist','designer','architect')
pattern.scientist <- c('scientist','doctor','dr.')
pattern.junior <- c('junior','jr', 'entry','internship','jr.')
pattern.senior <- c('senior', 'sr','experienced','sr.')
# Intermedaite Data Frame for Titlles:
final.data.df <- data.frame(Id=integer(nrow(job.indeed.df)), Job_Title=character(nrow(job.indeed.df))
, analyst=integer(nrow(job.indeed.df)) ,engineer=integer(nrow(job.indeed.df)),scientist=integer(nrow(job.indeed.df))
, junior=integer(nrow(job.indeed.df)),senior=integer(nrow(job.indeed.df)))
final.data.df$Id <- job.indeed.df$Id
final.data.df$Job_Title <- as.character( as.character( job.indeed.df$Job_Title) )
# Working on the counts:
for (i in 1: nrow(job.indeed.df)) {
final.data.df$analyst[i] <- if(grepl(paste(pattern.analyst,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
final.data.df$engineer[i] <- if(grepl(paste(pattern.engineer,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
final.data.df$scientist[i] <- if(grepl(paste(pattern.scientist,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
final.data.df$junior[i] <- if(grepl(paste(pattern.junior,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
final.data.df$senior[i] <- if(grepl(paste(pattern.senior,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
}
# Job Role Distribution by Position
post_count <- nrow(df_raw)
dplyr::summarize(final.data.df,
Analyst = sum(analyst) / n(),
Engineer = sum(engineer) / n(),
Scientist = sum(scientist) / n()) %>%
gather(position, pct_of_posts) %>%
ggplot(aes(x = position, y = pct_of_posts)) +
geom_col(fill = 'grey80') +
geom_hline(yintercept = .333, linetype = 'dotted') +
scale_y_continuous(label = percent_format(accuracy = 1)) +
labs(x = element_blank(),
y = element_blank(),
title = 'Job Title Distribution',
subtitle = str_c('Data Roles from ', comma(post_count), ' Posts')) +
theme(legend.position = 'none') +
theme_bw()
# hard skills dataframe for visualizations
hs <- filter(skills, skill_type == 'hard') %>%
left_join(final.data.df, by = c('id' = 'Id')) %>%
mutate(analyst = ifelse(analyst == 1, id, NA),
engineer = ifelse(engineer == 1, id, NA),
scientist = ifelse(scientist == 1, id, NA),
junior = ifelse(junior == 1, id, NA),
senior = ifelse(senior == 1, id, NA)) %>%
dplyr::group_by(skill) %>%
dplyr::summarize(total_count = n(),
ana_ids = n_distinct(analyst),
ana_pct = ana_ids / sum(df_all$analyst),
eng_ids = n_distinct(engineer),
eng_pct = eng_ids / sum(df_all$engineer),
sci_ids = n_distinct(scientist),
sci_pct = sci_ids / sum(df_all$scientist),
jr_ids = n_distinct(junior),
jr_pct = jr_ids / sum(df_all$junior),
sr_ids = n_distinct(senior),
sr_pct = sr_ids / sum(df_all$senior))
# specify top number of skills
top_x <- 10
# Top Hard Skills for All Data Postings
ggplot( arrange(hs, desc(total_count))[1:top_x,], aes(x = reorder(skill, total_count), y = total_count/nrow(df_all))) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Data Analysts/Engineers/Scientists'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))
# Top Hard Skills for Data Scientists
ggplot( arrange(hs, desc(sci_ids))[1:top_x,], aes(x = reorder(skill, sci_pct), y = sci_pct)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Data Scientists'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))
# Top Hard Skills for Data Analysts
ggplot( arrange(hs, desc(ana_ids))[1:top_x,], aes(x = reorder(skill, ana_pct), y = ana_pct)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Data Analysts'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))
# Top Hard Skills for Data Engineers
ggplot( arrange(hs, desc(eng_ids))[1:top_x,], aes(x = reorder(skill, eng_pct), y = eng_pct)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Data Engineers'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))
# Distribution of Junior and Senior Data Scientist Roles
jrs <- filter(final.data.df, junior == 1) %>%
select(Id, Analyst = analyst, Engineer = engineer, Scientist = scientist) %>%
gather(position, count, -Id)
jrs$level <- 'Junior'
srs <- filter(final.data.df, senior == 1) %>%
select(Id, Analyst = analyst, Engineer = engineer, Scientist = scientist) %>%
gather(position, count, -Id)
srs$level <- 'Senior'
pos_and_role <- bind_rows(jrs, srs) %>%
group_by(level, position) %>%
dplyr::summarize(count = sum(count))
ggplot(pos_and_role, aes(x = position, y = count, fill = level)) +
geom_col(position = 'fill') +
theme_bw() +
scale_y_continuous(label = percent_format(accuracy = 1)) +
scale_fill_manual(values = c('lightskyblue3', 'royalblue4')) +
labs(x = element_blank(),
y = element_blank(),
title = 'Distribution of Specified Levels')
# Top Hard Skills for Junior Data Scientists
hs_jr_base <- filter(skills, skill_type == 'hard') %>%
inner_join(filter(final.data.df, junior == 1 & scientist == 1), by = c('id' = 'Id'))
hs_jr <- group_by(hs_jr_base, skill) %>%
dplyr::summarize(total_count = n())
hs_jr_denom <- n_distinct(hs_jr_base$id)
ggplot( arrange(hs_jr, desc(total_count))[1:top_x,], aes(x = reorder(skill, total_count), y = total_count/hs_jr_denom)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Junior Data Scientists'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))
hs_sr_base <- filter(skills, skill_type == 'hard') %>%
inner_join(filter(final.data.df, senior == 1 & scientist == 1), by = c('id' = 'Id'))
hs_sr <- group_by(hs_sr_base, skill) %>%
dplyr::summarize(total_count = n())
hs_sr_denom <- n_distinct(hs_sr_base$id)
# Top Hard Skills for Senior Data Scientists
ggplot( arrange(hs_sr, desc(total_count))[1:top_x,], aes(x = reorder(skill, total_count), y = total_count/hs_sr_denom)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Senior Data Scientists'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))
sci_industry <- filter(df_all, scientist == 1 & Company_Industry != '') %>%
dplyr::group_by(Company_Industry) %>%
dplyr::summarize(postings = n()) %>%
arrange(desc(postings))
# Top Company Industries Seeking Data Scientists
ggplot(sci_industry[1:10,], aes(x = reorder(Company_Industry, postings), y = postings / sum(sci_industry$postings))) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = 'Top 10 Company Industries',
subtitle = str_c('Data Scientist Posts'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))
# WordCLoud map for the soft and hard skillset that high rating/star companies are
# looking for in engineer vs analyst/scientist Position:
df.stars.5 <- df_all %>%
filter(No_of_Stars ==5 ) %>%
select(job_id, Queried_Salary, Company_Industry, analyst, engineer, scientist , junior, senior ) %>%
inner_join(skills, by = c( 'job_id' = 'id'))
df.stars.5.engg <- df.stars.5 %>%
filter ( engineer ==1) %>%
group_by(skill) %>%
dplyr::summarize(total_count = n())
df.stars.5.ana.sci <- df.stars.5 %>%
filter ( analyst + scientist > 0 ) %>%
group_by(skill) %>%
dplyr::summarize(total_count = n())
set.seed(1234)
wordcloud(words = df.stars.5.engg$skill, freq = df.stars.5.engg$total_count, min.freq = 1,
max.words=100, random.order=FALSE, rot.per=0.10,
colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Shell Scripting could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Business Intelligence could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Google Cloud Platform could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Natural Language Processing could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Oral communication skills could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Responsibility could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Work-Life Balance could not be fit on page. It will not be
## plotted.
wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.5.ana.sci$total_count, min.freq = 1,
max.words=100, random.order=FALSE, rot.per=0.10,
colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Data-driven decision-making could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Survey Design could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Written communication skills could not be fit on page.
## It will not be plotted.
# Salary Range by titles given by high rating/star companies:
df.stars.5.Salary <- df.stars.5 %>%
dplyr::group_by(Queried_Salary) %>%
dplyr::summarize(total_count = n(),
ana_total = sum(analyst),
eng_totals = sum(engineer),
sci_total = sum(scientist),
jr_total = sum(junior),
sr_total = sum(senior))
kable(df.stars.5.Salary)
Queried_Salary | total_count | ana_total | eng_totals | sci_total | jr_total | sr_total |
---|---|---|---|---|---|---|
<80000 | 37 | 37 | 0 | 0 | 0 | 0 |
>160000 | 47 | 17 | 0 | 30 | 0 | 0 |
100000-119999 | 111 | 33 | 54 | 24 | 0 | 0 |
120000-139999 | 136 | 55 | 34 | 77 | 0 | 9 |
140000-159999 | 59 | 17 | 28 | 14 | 0 | 8 |
80000-99999 | 61 | 25 | 36 | 0 | 0 | 13 |
# ==> We can see from above no junior level roles; No Engineer roles with greater then
# 160,000 USD
#Statewise Job posting for Data Science Jobs:
df.states <- df_all %>%
select(job_id, Location ) %>%
group_by(Location) %>%
dplyr::summarize(total_count = n())
all_states <- map_data("state")
df.states$region <- stateFromLower(df.states$Location)
Total <- merge(all_states, df.states, by="region")
Total <- Total[Total$region!="REMOTE",]
p <- ggplot()
p <- p + geom_polygon(data=Total, aes(x=long, y=lat, group = group, fill=Total$total_count),colour="white"
) + scale_fill_continuous(low = "#56B4E9", high = "#0072B2", guide="colorbar")
p
#Creates a hard-skill and salary, and job type dataset
Skill_Pay <- sdf[1:2]
Skill_Pay[c("Salary","Job_Type")] <- NA
#Populates the salary and job type columns from the raw dataset
for (i in Skill_Pay$id) {
Skill_Pay$Salary[Skill_Pay$id == i] <- df_raw$Queried_Salary[df_raw$X == i]
Skill_Pay$Job_Type[Skill_Pay$id == i] <- df_raw$Job_Type[df_raw$X == i]
}
#Going to focus on only data_science-type jobs
Skill_PayDS <- filter(Skill_Pay,Job_Type == 'data_scientist')
#Changes the 'skill' column to a factor type
Skill_PayDS$skill <- as.factor(Skill_PayDS$skill)
#Filters it to skills mentioned over 200 times (to keep sample size high for analysis)
mentions <- 200
stab = table(Skill_PayDS$skill)
Skill_PayDS <- Skill_PayDS[Skill_PayDS$skill %in% names(stab)[stab >= mentions],]
#Drops the now un-used factors from skills
Skill_PayDS <- droplevels(Skill_PayDS)
#Factors and re-orders Salary
Pay_Grades <- c("<80000", "80000-99999", "100000-119999", "120000-139999", "140000-159999",">160000")
Skill_PayDS$Salary <- factor(Skill_PayDS$Salary, levels = Pay_Grades)
#Makes a table of Skills vs Salary
PayTable<- table(Skill_PayDS$skill ,Skill_PayDS$Salary)
PayTable
##
## <80000 80000-99999 100000-119999 120000-139999
## AI 5 50 105 131
## AWS 1 37 80 96
## Big Data 2 17 97 161
## C/C++ 6 36 98 141
## Data Mining 13 55 238 225
## Data Science 6 17 65 75
## Hadoop 3 27 163 297
## Hive 1 11 91 158
## Java 2 40 149 224
## Linux 2 44 55 66
## Machine Learning 30 145 467 577
## MATLAB 3 30 68 131
## Natural Language Processing 5 44 114 153
## Python 23 157 506 598
## R 19 127 460 502
## SAS 7 59 192 162
## Scala 1 7 67 119
## Spark 3 20 168 269
## SQL 13 134 354 371
## Tableau 5 38 157 164
## TensorFlow 6 46 86 128
##
## 140000-159999 >160000
## AI 114 46
## AWS 65 37
## Big Data 111 29
## C/C++ 99 51
## Data Mining 162 40
## Data Science 41 17
## Hadoop 244 93
## Hive 132 49
## Java 190 70
## Linux 48 10
## Machine Learning 440 183
## MATLAB 98 34
## Natural Language Processing 157 55
## Python 456 172
## R 328 114
## SAS 113 31
## Scala 100 44
## Spark 201 84
## SQL 232 89
## Tableau 94 26
## TensorFlow 100 54
#Specify the significance level
sig <- 0.05
#Loops through each skill
j = 0
k = 0
plist <- NA
SigSkills <- NA
while (j < nrow(PayTable)) {
j = j + 1
i = 0
unmentioned <- NA
#Loops through the elemenents of each skill, and builds a vector of the number of jobs for which a skill-salary combo wasn't mentioned: unmentioned
while (i < ncol(PayTable)) {
i = i + 1
unmentioned[i] <- Job_Totals[i] - PayTable[j,i]
}
#Performed a chi-squared tests for by each skill by paygrade to determine skills mentioned with significant relationship
#Saves the skills with significant relationships to a list of names and their p-values to a seperate list
if (chisq.test(cbind(unmentioned,PayTable[j,]))$p.value < sig) {
k = k + 1
plist[k] <- chisq.test(cbind(unmentioned,PayTable[j,]))$p.value
SigSkills[k] <- names(PayTable[,1])[j]
}
}
## Warning in chisq.test(cbind(unmentioned, PayTable[j, ])): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(cbind(unmentioned, PayTable[j, ])): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(cbind(unmentioned, PayTable[j, ])): Chi-squared
## approximation may be incorrect
i = 0
skillplots <- list()
while (i < length(SigSkills)) {
i = i + 1
plottemp <- as.data.frame(PayTable[SigSkills[i],]/Job_Totals)
print (ggplot(plottemp, aes(x = plottemp[,1], y = rownames(plottemp), fill = rownames(plottemp))) +
geom_bar(stat = "identity", width=0.9) +
scale_y_discrete(limits=rownames(plottemp)) +
geom_vline(xintercept = sum(PayTable[SigSkills[i],])/sum(Job_Totals)) +
scale_fill_brewer(palette=1, type = "qual") +
xlab("Proportion") +
ylab("Salary Range") +
ggtitle(paste("Proportion of Job Postings that Mention",SigSkills[i],"; p = ",round(plist[i], digits = 6))) +
labs(fill = "Salary Range")+
theme(axis.text.x = element_text(angle = 15)) +
coord_flip())
}
#install.packages("RMySQL")
#library("RMySQL")
## Connection String to connect to the local host database:
#mysqlconnection = dbConnect(MySQL(), user = 'indeed', password = 'Indeed@123', dbname = 'indeed',
# host = 'localhost')
## Lists the tables currently in the database:
#dbListTables(mysqlconnection)
## Creating table and inserting rows from the mentioned dataframe; if table exists then it will overwrite:
#dbWriteTable(mysqlconnection, "skills", skills[, ], overwrite = TRUE, row.names = FALSE)
#dbWriteTable(mysqlconnection, "df_raw", df_raw[, ], overwrite = TRUE, row.names = FALSE)
## Lists the tables existing and above two newly created ones:
#dbListTables(mysqlconnection)