library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the Dice job postings data set from the course repository
# (requires network access).
dice_job <- read.csv("https://raw.githubusercontent.com/DATA607/Project3/master/dice_updated.csv")
# head(dice_job)
nrow(dice_job)
## [1] 1100
# The data set has 1100 rows.
# NOTE(review): the head() output further down shows the same posting
# repeated once per skill, so these look like (job, skill) rows rather than
# 1100 distinct jobs -- confirm.
# View() opens a GUI data viewer, so only call it in an interactive session;
# this keeps the script runnable in batch mode.
if (interactive()) {
  View(dice_job)
}
# Cleanup data: list the columns, then discard the ones named in `drops`.
names(dice_job)
## [1] "X" "detailUrl" "jobTitle" "company" "location" "date"
## [7] "Id" "Skill" "Count"
# NOTE(review): "counter" is not among the column names printed above, so
# this removal appears to leave the data frame unchanged -- confirm intent.
drops <- c("counter")
# Keep every column whose name has no match in `drops`.
dice_job <- dice_job[, is.na(match(names(dice_job), drops))]
head(dice_job)
## X detailUrl
## 1 1 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 2 2 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 3 3 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 4 4 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 5 5 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 6 6 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## jobTitle
## 1 Senior Data Engineer *** 200K or contract to hire ***
## 2 Senior Data Engineer *** 200K or contract to hire ***
## 3 Senior Data Engineer *** 200K or contract to hire ***
## 4 Senior Data Engineer *** 200K or contract to hire ***
## 5 Senior Data Engineer *** 200K or contract to hire ***
## 6 Senior Data Engineer *** 200K or contract to hire ***
## company location date Id
## 1 Projas Technologies, LLC San Francisco, CA 2017-03-19 1
## 2 Projas Technologies, LLC San Francisco, CA 2017-03-19 2
## 3 Projas Technologies, LLC San Francisco, CA 2017-03-19 3
## 4 Projas Technologies, LLC San Francisco, CA 2017-03-19 4
## 5 Projas Technologies, LLC San Francisco, CA 2017-03-19 5
## 6 Projas Technologies, LLC San Francisco, CA 2017-03-19 6
## Skill Count
## 1 Python 9
## 2 programming 1
## 3 analytics 0
## 4 mathematics 0
## 5 machine learning 0
## 6 team 0
# Research Question 1. Which skills are most in demand in data science jobs?
# Keep only the skill name and its per-row count.
df_1 <- select(dice_job, Skill, Count)
head(df_1)
## Skill Count
## 1 Python 9
## 2 programming 1
## 3 analytics 0
## 4 mathematics 0
## 5 machine learning 0
## 6 team 0
# Group by skill and sum up the counts; aggregate() names the summed
# column "x", which is renamed to "Frequency" below.
df_1 <- aggregate(df_1$Count, by = list(Skills = df_1$Skill), FUN = sum)
head(df_1)
## Skills x
## 1 R 6
## 2 analytics 83
## 3 big data 28
## 4 communication 23
## 5 Cooperation 0
## 6 creative 6
colnames(df_1)[2] <- "Frequency"
# Rank the skills from most to least frequent.
df_top <- arrange(df_1, desc(Frequency))
# head() returns at most 8 rows, whereas df_top[1:8, ] would pad the result
# with all-NA rows if fewer than 8 skills were present.
df_top8 <- head(df_top, 8)
df_top8
## Skills Frequency
## 1 Python 170
## 2 team 142
## 3 SQL 127
## 4 SAS 85
## 5 analytics 83
## 6 machine learning 75
## 7 Java 56
## 8 Hadoop 43
# Plot the summed frequency of each of the top 8 skills.
# Columns are referenced by bare name inside aes() (not df_top8$...), so the
# mapping is evaluated against the `data` argument. reorder() arranges the
# x axis by descending frequency instead of alphabetically, so the
# connecting line follows the ranking.
ggplot(
  df_top8,
  aes(x = reorder(Skills, -Frequency), y = Frequency, group = 1)
) +
  geom_point() +
  geom_line() +
  ggtitle("Skills most needed") +
  labs(x = "Skills", y = "Frequency")

# Research Question 2. Which locations have the most data science jobs?
# Count the distinct postings per location. Summing the row index `X` per
# location does not measure the number of jobs: it grows with the position
# of the rows in the file, not with how many postings a location has.
# Assumes `detailUrl` uniquely identifies a posting (the data has one row
# per posting and skill) -- TODO confirm.
df_2 <- aggregate(
  dice_job$detailUrl,
  by = list(Location = dice_job$location),
  FUN = function(url) length(unique(url))
)
colnames(df_2)[2] <- "Total"
head(df_2)
# (Recorded output omitted: re-run the script to see the per-location
# job counts.)
# Keep the locations with at least the average number of jobs, largest first.
df_2 <- filter(df_2, Total >= mean(Total))
df_2 <- arrange(df_2, desc(Total))
# head() returns at most 3 rows and never pads with NA rows.
df2_top3 <- head(df_2, 3)
# Plot the number of jobs for the top locations.
barplot(df2_top3$Total,
  main = "Data Science Jobs vs. Location",
  names.arg = df2_top3$Location,
  ylab = "Total number of Jobs"
)
