library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Load the Dice job-postings data set from the project repository.
dice_job <- read.csv("https://raw.githubusercontent.com/DATA607/Project3/master/dice_updated.csv")

# head(dice_job)
# View(dice_job)
nrow(dice_job)
## [1] 1100
# The data set has 1100 rows. In the preview printed further down, the first
# rows share one detailUrl and differ only in Skill, so this looks like one
# row per posting/skill pair rather than 1100 distinct jobs.
# View() opens a GUI data viewer, so it is only called in an interactive
# session; non-interactive runs (Rscript, knitr) skip it.
if (interactive()) {
  View(dice_job)
}

# Cleanup data: list the columns, then discard the ones named in `drops`.
colnames(dice_job)
## [1] "X"         "detailUrl" "jobTitle"  "company"   "location"  "date"     
## [7] "Id"        "Skill"     "Count"
# NOTE(review): "counter" is not among the names printed above, so with the
# current file this step leaves the data frame unchanged -- confirm which
# column was meant.
drops <- c("counter")
keep_cols <- base::setdiff(names(dice_job), drops)
dice_job <- dice_job[, keep_cols]
head(dice_job)
##   X                                                    detailUrl
## 1 1 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 2 2 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 3 3 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 4 4 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 5 5 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
## 6 6 http://www.dice.com/job/result/10126850/DATA-ENGINEER?src=19
##                                                jobTitle
## 1 Senior Data Engineer *** 200K or contract to hire ***
## 2 Senior Data Engineer *** 200K or contract to hire ***
## 3 Senior Data Engineer *** 200K or contract to hire ***
## 4 Senior Data Engineer *** 200K or contract to hire ***
## 5 Senior Data Engineer *** 200K or contract to hire ***
## 6 Senior Data Engineer *** 200K or contract to hire ***
##                    company          location       date Id
## 1 Projas Technologies, LLC San Francisco, CA 2017-03-19  1
## 2 Projas Technologies, LLC San Francisco, CA 2017-03-19  2
## 3 Projas Technologies, LLC San Francisco, CA 2017-03-19  3
## 4 Projas Technologies, LLC San Francisco, CA 2017-03-19  4
## 5 Projas Technologies, LLC San Francisco, CA 2017-03-19  5
## 6 Projas Technologies, LLC San Francisco, CA 2017-03-19  6
##              Skill Count
## 1           Python     9
## 2      programming     1
## 3        analytics     0
## 4      mathematics     0
## 5 machine learning     0
## 6             team     0
# Research Question 1. What are the most needed skills in data science jobs?
# Only the Skill and Count columns are needed for this question.
df_1 <- select(dice_job, Skill, Count)
head(df_1)
##              Skill Count
## 1           Python     9
## 2      programming     1
## 3        analytics     0
## 4      mathematics     0
## 5 machine learning     0
## 6             team     0
# Group by skills and sum up the counts
df_1 <- aggregate(df_1$Count, by = list(Skills = df_1$Skill), FUN = sum)
head(df_1)
##          Skills  x
## 1            R   6
## 2     analytics 83
## 3      big data 28
## 4 communication 23
## 5   Cooperation  0
## 6      creative  6
# aggregate() names the summed column "x"; give it a meaningful name.
colnames(df_1)[2] <- "Frequency"
df_top <- arrange(df_1, desc(Frequency))
# head() returns at most 8 rows, whereas df_top[1:8, ] would pad the result
# with NA rows if there were fewer than 8 skills.
df_top8 <- head(df_top, 8)
df_top8
##             Skills Frequency
## 1           Python       170
## 2             team       142
## 3              SQL       127
## 4              SAS        85
## 5        analytics        83
## 6 machine learning        75
## 7             Java        56
## 8           Hadoop        43
# plot the counts vs. skills
# Columns are referenced by bare name inside aes() so they are looked up in
# `df_top8` (ggplot2 discourages `df$col` in aes()). reorder() puts the
# skills on the x axis in descending Frequency instead of alphabetically, so
# the connecting line follows the ranking.
ggplot(
  df_top8,
  aes(x = reorder(Skills, -Frequency), y = Frequency, group = 1)
) +
  geom_point() +
  geom_line() +
  ggtitle("Skills most needed") +
  labs(x = "Skills", y = "Frequency")

# Research Question 2. Which areas have the most data science jobs?
# Count the distinct postings per location. The earlier version summed the
# row-index column X per location, which yields a sum of row numbers, not a
# number of jobs.
# NOTE(review): assumes detailUrl uniquely identifies a posting (the data
# preview shows one URL repeated across several Skill rows) -- confirm.
df_2 <- aggregate(
  dice_job$detailUrl,
  by = list(Location = dice_job$location),
  FUN = function(urls) length(unique(urls))
)
# aggregate() names the computed column "x"; rename it.
colnames(df_2)[2] <- "Total"
head(df_2)
# NOTE: the console output previously pasted here showed the row-index sums
# and no longer applies; re-run the script to regenerate it.
# Keep only locations with at least the average number of postings, then
# rank them from most to fewest.
df_2 <- filter(df_2, Total >= mean(Total))
df_2 <- arrange(df_2, desc(Total))
# head() avoids the NA rows that df_2[1:3, ] produces when fewer than three
# locations pass the filter.
df2_top3 <- head(df_2, 3)
# plot the total of jobs vs. location
barplot(df2_top3$Total,
        main = "Data Science Jobs vs. Location",
        names.arg = as.character(df2_top3$Location),
        ylab = "Total number of Jobs")