# Packages needed:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(dplyr)
library(ggplot2)
library(knitr)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
skills <- read.csv("https://raw.githubusercontent.com/jonburns2454/Project-3-DATA607/main/ds_general_skills_revised.csv")
software <- read.csv("https://raw.githubusercontent.com/jonburns2454/Project-3-DATA607/main/Data%20Science%20Career%20Terms%20-%20ds%20software.csv")
glimpse(skills)
## Rows: 30
## Columns: 5
## $ Keyword <chr> "machine learning", "analysis", "statistics", "computer sc…
## $ LinkedIn <chr> "5,701", "5,168", "4,893", "4,517", "3,404", "2,605", "1,8…
## $ Indeed <chr> "3,439", "3,500", "2,992", "2,739", "2,344", "1,961", "1,4…
## $ SimplyHired <chr> "2,561", "2,668", "2,308", "2,093", "1,791", "1,497", "1,1…
## $ Monster <chr> "2,340", "3,306", "2,399", "1,900", "2,053", "1,815", "1,2…
glimpse(software)
## Rows: 42
## Columns: 12
## $ Keyword <chr> "Python", "R", "SQL", "Spark", "Hadoop"…
## $ LinkedIn <chr> "6,347", "4,553", "3,879", "2,169", "2,…
## $ Indeed <chr> "3,818", "3,106", "2,628", "1,551", "1,…
## $ SimplyHired <chr> "2,888", "2,393", "2,056", "1,167", "1,…
## $ Monster <chr> "2,544", "2,365", "1,841", "1,062", "1,…
## $ LinkedIn.. <chr> "74%", "53%", "45%", "25%", "25%", "23%…
## $ Indeed.. <chr> "74%", "60%", "51%", "30%", "31%", "27%…
## $ SimplyHired.. <chr> "75%", "62%", "54%", "30%", "30%", "28%…
## $ Monster.. <chr> "68%", "63%", "49%", "28%", "32%", "27%…
## $ Avg.. <chr> "73%", "60%", "50%", "29%", "30%", "26%…
## $ GlassDoor.Self.Reported...2017 <chr> "72%", "64%", "51%", "27%", "39%", "33%…
## $ Difference <chr> "1%", "-4%", "-1%", "2%", "-9%", "-7%",…
# Preliminary Work:
# - Checking for missing data
# - Cleaning
sum(is.na(skills)) ## 0
## [1] 0
sum(is.na(software)) ## 0
## [1] 0
software.cleaned <- software[, -c(6:12)] # Drop the derived percentage columns; keep only the raw counts
software.cleaned <- software.cleaned[-c(38:42), ]
skills.cleaned <- skills[-c(16:30), ] # Removing rows that interfere with the keyword/count format
# Use regex to strip commas (and any percent signs) from the character columns, convert them to numeric, and collect the results in a new data frame
LinkedIn <- as.numeric(gsub("[%\\,]", "", skills.cleaned$LinkedIn))
Indeed <- as.numeric(gsub("[%\\,]", "", skills.cleaned$Indeed))
SimplyHired <- as.numeric(gsub("[%\\,]", "", skills.cleaned$SimplyHired))
Monster <- as.numeric(gsub("[%\\,]", "", skills.cleaned$Monster))
Keyword <- (gsub("[%\\,]", "", skills.cleaned$Keyword))
skillsDF <- data.frame(Keyword, LinkedIn, Indeed, SimplyHired, Monster)
# Doing the same for the software data (.S denotes the software columns):
LinkedIn.S <- as.numeric(gsub("[%\\,]", "", software.cleaned$LinkedIn))
Indeed.S <- as.numeric(gsub("[%\\,]", "", software.cleaned$Indeed))
SimplyHired.S <- as.numeric(gsub("[%\\,]", "", software.cleaned$SimplyHired))
Monster.S <- as.numeric(gsub("[%\\,]", "", software.cleaned$Monster))
Keyword.S <- (gsub("[%\\,]", "", software.cleaned$Keyword))
softwareDF <- data.frame(Keyword.S, LinkedIn.S, Indeed.S, SimplyHired.S, Monster.S)
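# A more compact alternative for the same cleaning step (a sketch, not the method used above):
# readr::parse_number(), loaded with the tidyverse, strips commas and percent signs while
# coercing to numeric, so the gsub()/as.numeric() pairs could be collapsed into one across()
# call. skillsDF_alt is just an illustrative name.
skillsDF_alt <- skills.cleaned %>%
    mutate(across(c(LinkedIn, Indeed, SimplyHired, Monster), readr::parse_number))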
# Adding percentage (share-of-total) variables to better show the data breakdown.
# Software:
softwareDF <- softwareDF %>%
    mutate(LinkedInFreq = (LinkedIn.S / sum(LinkedIn.S)) * 100) %>%
    mutate(IndeedFreq = (Indeed.S / sum(Indeed.S)) * 100) %>%
    mutate(SimplyHiredFreq = (SimplyHired.S / sum(SimplyHired.S)) * 100) %>%
    mutate(MonsterFreq = (Monster.S / sum(Monster.S)) * 100)
# Skills:
skillsDF <- skillsDF %>%
    mutate(LinkedInFreq = (LinkedIn / sum(LinkedIn)) * 100) %>%
    mutate(IndeedFreq = (Indeed / sum(Indeed)) * 100) %>%
    mutate(SimplyHiredFreq = (SimplyHired / sum(SimplyHired)) * 100) %>%
    mutate(MonsterFreq = (Monster / sum(Monster)) * 100)
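# The four mutate() calls above could also be collapsed into a single across() call
# (a sketch with an illustrative object name; it produces the same Freq columns):
skillsDF_check <- skillsDF %>%
    mutate(across(LinkedIn:Monster, ~ .x / sum(.x) * 100, .names = "{.col}Freq"))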
# **Transforming the data further for visualization**
meltedSkills <- melt(skillsDF, id.vars = "Keyword")
meltedSoftware <- melt(softwareDF, id.vars = "Keyword.S")
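# tidyr's pivot_longer() (also loaded with the tidyverse) is the current replacement for
# reshape2::melt() and yields the same long format, with `variable` stored as character
# rather than factor; shown here only as a sketch:
meltedSkills_alt <- skillsDF %>%
    pivot_longer(-Keyword, names_to = "variable", values_to = "value")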
# **Bar Chart : Count**
meltedSkills %>%
    filter(variable == "LinkedIn" | variable == "Indeed" | variable == "SimplyHired" | variable == "Monster") %>%
    ggplot(aes(x = Keyword, y = value, fill = variable)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(x = "Keywords", y = "Count") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_fill_manual(values = c("LinkedIn" = "coral1", "Indeed" = "firebrick4", "SimplyHired" = "aquamarine4", "Monster" = "chartreuse4")) +
    ggtitle("Keyword Count on Major Job Search Platforms (Skills)")

# **Bar Chart : Percent**
# *Note: Same idea as above, but filtering for the frequency (percentage) columns instead of the raw counts.*
meltedSkills %>%
    filter(variable == "LinkedInFreq" | variable == "IndeedFreq" | variable == "SimplyHiredFreq" | variable == "MonsterFreq") %>%
    ggplot(aes(x = Keyword, y = value, fill = variable)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(x = "Keywords", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_fill_manual(values = c("LinkedInFreq" = "coral1", "IndeedFreq" = "firebrick4", "SimplyHiredFreq" = "aquamarine4", "MonsterFreq" = "chartreuse4")) +
    ggtitle("Keyword Frequency on Major Job Search Platforms (Skills)")

# Software
# **Bar Chart - Software**
meltedSoftware %>%
    filter(variable == "LinkedIn.S" | variable == "Indeed.S" | variable == "SimplyHired.S" | variable == "Monster.S") %>%
    ggplot(aes(x = Keyword.S, y = value, fill = variable)) +
    geom_bar(stat = "identity", position = "dodge", width = 0.7) +
    labs(x = "Keywords", y = "Count") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_fill_manual(values = c("LinkedIn.S" = "coral1", "Indeed.S" = "firebrick4", "SimplyHired.S" = "aquamarine4", "Monster.S" = "chartreuse4")) +
    ggtitle("Keyword Count on Major Job Search Platforms (Software)")

# Interactive version of the software frequency chart
plot1 <- meltedSoftware %>%
    filter(variable == "LinkedInFreq" | variable == "IndeedFreq" | variable == "SimplyHiredFreq" | variable == "MonsterFreq") %>%
    ggplot(aes(x = Keyword.S, y = value, fill = variable)) +
    geom_bar(stat = "identity", position = "dodge", width = 0.7) +
    labs(x = "Keywords", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_fill_manual(values = c("LinkedInFreq" = "coral1", "IndeedFreq" = "firebrick4", "SimplyHiredFreq" = "aquamarine4", "MonsterFreq" = "chartreuse4")) +
    ggtitle("Keyword Frequency on Major Job Search Platforms (Software)")
# Convert the ggplot object to a plotly object
interactive_plot1 <- ggplotly(plot1)
interactive_plot1_zoom <- interactive_plot1 %>%
    layout(xaxis = list(title = "Keywords"), yaxis = list(title = "Frequency"))
# Enable scroll-wheel zooming on the plot
interactive_plot1_zoom <- interactive_plot1_zoom %>%
    config(scrollZoom = TRUE)
interactive_plot1_zoom
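# Optional: the interactive widget could be saved as a standalone HTML file for sharing
# outside of the knitted report (a sketch; the file name below is only an example):
# htmlwidgets::saveWidget(interactive_plot1_zoom, "software_frequency_interactive.html")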
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Average of the four platform frequency shares for each keyword (stored as Total)
skillsDF <- skillsDF %>%
    mutate(Total = (LinkedInFreq + IndeedFreq + SimplyHiredFreq + MonsterFreq) / 4)
# Check for normality in the distribution using the Shapiro-Wilk test
shapiro_test <- sapply(skillsDF[, 2:5], function(x) shapiro.test(x)$p.value)
# New data frame with the normality results
normality_table <- data.frame(
    Platform = c("LinkedIn", "Indeed", "SimplyHired", "Monster"),
    p_value = shapiro_test
)
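# Quick check of which platforms would be flagged as non-normal at the conventional
# alpha = 0.05 (a sketch; the cutoff is a standard convention, not taken from the data):
normality_flagged <- normality_table %>%
    mutate(reject_normality = p_value < 0.05)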
plot0 <- ggplot(skillsDF, aes(Keyword, Total)) +
    geom_bar(stat = "identity", fill = "blue") +
    labs(title = "Job Postings by Keyword, 2018",
         x = "Keyword",
         y = "Average Share of Postings (%)") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot the normality test results
ggplot(normality_table, aes(Platform, p_value)) +
    geom_bar(stat = "identity", fill = "green") +
    labs(title = "Normality Test Results",
         x = "Platform",
         y = "p-value")

data <- read.csv("~/Data_607/Valued_skill_data.csv")
# Convert the percentage strings to numeric
data <- data %>% mutate(Share = as.numeric(sub("%", "", Share)))
data <- data %>% mutate(Share.1 = as.numeric(sub("%", "", Share.1)))
plot1 <- ggplot(data, aes(x = reorder(Skills.Required.by.Employers, Share), y = Share)) +
    geom_bar(stat = "identity", fill = "skyblue") +
    labs(title = "Latest Skills Required by Employers", x = "Skill", y = "Percentage") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous(labels = scales::percent_format(scale = 1)) # Apply the percentage format
plot2 <- ggplot(data, aes(x = reorder(Skills.Listed.by.Employees, Share.1), y = Share.1)) +
    geom_bar(stat = "identity", fill = "green") +
    labs(title = "Skills Listed by Employees", x = "Skill", y = "Percentage") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous(labels = scales::percent_format(scale = 1)) # Apply the percentage format
grid.arrange(plot1, plot2, ncol = 2)

grid.arrange(plot1, plot0, ncol = 2)

# Summarize the normality test results
summary(normality_table)
##    Platform            p_value       
##  Length:4           Min.   :0.02200  
##  Class :character   1st Qu.:0.03666  
##  Mode  :character   Median :0.04688  
##                     Mean   :0.04622  
##                     3rd Qu.:0.05644  
##                     Max.   :0.06911