library(tidyverse)
library(tm)
library(wordcloud)
library(knitr)
library(kableExtra)
library(reshape2)
library(RSQLite)con <- dbConnect(RSQLite::SQLite(), "ML_Survey.sqlite" ,overwrite =TRUE )
dbListTables(con)## [1] "MCR_Tb"
df <- read.csv(file = "https://raw.githubusercontent.com/mandiemannz/Data-607--Fall-18/master/multipleChoiceResponses.csv", header= TRUE)# add sequence to the dataframe
df<- tibble::rowid_to_column(df, "ID")col_index <- c(1:15, 38:47,70:82,134,168:172,208:211)
df2 <- df[,col_index]dbWriteTable(con, "MCR_Tb", df2, overwrite= T)
dbListTables(con)## [1] "MCR_Tb"
dbListFields(con, "MCR_Tb")## [1] "ID"
## [2] "GenderSelect"
## [3] "Country"
## [4] "Age"
## [5] "EmploymentStatus"
## [6] "StudentStatus"
## [7] "LearningDataScience"
## [8] "CodeWriter"
## [9] "CareerSwitcher"
## [10] "CurrentJobTitleSelect"
## [11] "TitleFit"
## [12] "CurrentEmployerType"
## [13] "MLToolNextYearSelect"
## [14] "MLMethodNextYearSelect"
## [15] "LanguageRecommendationSelect"
## [16] "JobSkillImportanceBigData"
## [17] "JobSkillImportanceDegree"
## [18] "JobSkillImportanceStats"
## [19] "JobSkillImportanceEnterpriseTools"
## [20] "JobSkillImportancePython"
## [21] "JobSkillImportanceR"
## [22] "JobSkillImportanceSQL"
## [23] "JobSkillImportanceKaggleRanking"
## [24] "JobSkillImportanceMOOC"
## [25] "JobSkillImportanceVisualizations"
## [26] "EmployerIndustry"
## [27] "EmployerSize"
## [28] "EmployerSizeChange"
## [29] "EmployerMLTime"
## [30] "EmployerSearchMethod"
## [31] "UniversityImportance"
## [32] "JobFunctionSelect"
## [33] "WorkHardwareSelect"
## [34] "WorkDataTypeSelect"
## [35] "WorkProductionFrequency"
## [36] "WorkDatasetSize"
## [37] "WorkAlgorithmsSelect"
## [38] "WorkToolsSelect"
## [39] "WorkMethodsSelect"
## [40] "TimeGatheringData"
## [41] "TimeModelBuilding"
## [42] "TimeProduction"
## [43] "TimeVisualizing"
## [44] "TimeFindingInsights"
## [45] "CompensationAmount"
## [46] "CompensationCurrency"
## [47] "SalaryChange"
## [48] "JobSatisfaction"
display_cols <- dbReadTable(con, "MCR_Tb")
head(display_cols, 3)## ID GenderSelect Country Age
## 1 1 Non-binary, genderqueer, or gender non-conforming NA
## 2 2 Female United States 30
## 3 3 Male Canada 28
## EmploymentStatus StudentStatus LearningDataScience
## 1 Employed full-time
## 2 Not employed, but looking for work
## 3 Not employed, but looking for work
## CodeWriter CareerSwitcher CurrentJobTitleSelect TitleFit
## 1 Yes DBA/Database Engineer Fine
## 2
## 3
## CurrentEmployerType
## 1 Employed by a company that doesn't perform advanced analytics,Employed by non-profit or NGO
## 2
## 3
## MLToolNextYearSelect MLMethodNextYearSelect LanguageRecommendationSelect
## 1 SAS Base Random Forests F#
## 2 Python Random Forests Python
## 3 Amazon Web services Deep learning R
## JobSkillImportanceBigData JobSkillImportanceDegree
## 1
## 2 Nice to have
## 3 Necessary
## JobSkillImportanceStats JobSkillImportanceEnterpriseTools
## 1
## 2 Unnecessary
## 3
## JobSkillImportancePython JobSkillImportanceR JobSkillImportanceSQL
## 1
## 2 Unnecessary Necessary
## 3 Necessary
## JobSkillImportanceKaggleRanking JobSkillImportanceMOOC
## 1
## 2
## 3
## JobSkillImportanceVisualizations EmployerIndustry EmployerSize
## 1 Internet-based 100 to 499 employees
## 2
## 3
## EmployerSizeChange EmployerMLTime
## 1 Increased slightly 3-5 years
## 2
## 3
## EmployerSearchMethod
## 1 I visited the company's Web site and found a job listing there
## 2
## 3
## UniversityImportance
## 1 Not very important
## 2
## 3
## JobFunctionSelect
## 1 Build prototypes to explore applying machine learning to new areas
## 2
## 3
## WorkHardwareSelect
## 1 Gaming Laptop (Laptop + CUDA capable GPU),Workstation + Cloud service
## 2
## 3
## WorkDataTypeSelect WorkProductionFrequency WorkDatasetSize
## 1 Text data,Relational data Rarely 10GB
## 2
## 3
## WorkAlgorithmsSelect
## 1 Neural Networks,Random Forests,RNNs
## 2
## 3
## WorkToolsSelect
## 1 Amazon Web services,Oracle Data Mining/ Oracle R Enterprise,Perl
## 2
## 3
## WorkMethodsSelect
## 1 Association Rules,Collaborative Filtering,Neural Networks,PCA and Dimensionality Reduction,Random Forests
## 2
## 3
## TimeGatheringData TimeModelBuilding TimeProduction TimeVisualizing
## 1 0 100 0 0
## 2 NA NA NA NA
## 3 NA NA NA NA
## TimeFindingInsights CompensationAmount CompensationCurrency
## 1 0
## 2 NA
## 3 NA
## SalaryChange JobSatisfaction
## 1 I am not currently employed 5
## 2
## 3
dbDisconnect(con)dataskills <- display_colsreview_text <- paste(dataskills, collapse=" ")
review_source <- VectorSource(review_text)
corpus <- Corpus(review_source)Use text mining (TM) to extract count of words using a corpus. Text Mining package also filters out “stop words” - words that don’t have value (this, is, and), numbers, and other unnecessary words that don’t add value (as defined by us).
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeWords, c("important", "kaggle", "somewhat", "useful", "yes", "etc", "often", "enough", "courses", "non", "nice", "laptop", "coursera", "year", "udemy", "run", "youtube", "socrata", "workstation", "online", "edx", "sometimes", "employed", "logistic", "male", "necessary", "company", "increased"))
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing=T)
table <- head(frequency, 20)Wordclouds give a quick and easy display of our top words. This allows us to quickly see which words are among the top for data science skills.
words <- names(frequency)
wordcloud(words[1:100], frequency[1:100],
colors=brewer.pal(8, "Dark2"))Looking at the most frequent words, it seems that most data science skills from the kaggle survey relate to the following words:
wf <- data.frame(word=names(frequency), frequency=frequency)
ggplot(subset(wf, frequency>5000), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", aes(fill= reorder(word, -frequency))) +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
theme(legend.position="none") +
ylab("Frequency") +
xlab("Words") +
ggtitle("Most frequent words by count")ggplot(subset(wf, frequency>5000 & frequency < 20000), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", aes(fill= reorder(word, -frequency))) +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
theme(legend.position="none") +
ylab("Frequency") +
xlab("Words") +
ggtitle("Most frequent words by count")The data shows that potential employers would value some of the following skills: time (assuming time series), python, regression, machine learning, SQL.
table <- head(wf, 20)
kable(table, "html", escape = F) %>%
kable_styling("striped", full_width = T) %>%
column_spec(1, bold = T)| word | frequency | |
|---|---|---|
| data | data | 28166 |
| time | time | 17558 |
| python | python | 14727 |
| regression | regression | 13783 |
| learning | learning | 11802 |
| full | full | 10897 |
| analytics | analytics | 8456 |
| fine | fine | 7501 |
| decision | decision | 7491 |
| trees | trees | 7491 |
| employees | employees | 7449 |
| text | text | 7153 |
| forests | forests | 6986 |
| random | random | 6986 |
| neural | neural | 6940 |
| advanced | advanced | 6309 |
| years | years | 6251 |
| networks | networks | 6164 |
| machine | machine | 5620 |
| analysis | analysis | 5298 |