Project 3

Libraries

library(tidyverse)
library(tm)
library(wordcloud)
library(knitr)
library(kableExtra)
library(reshape2)
library(RSQLite)

SQL Database

Create ephemeral in-memory RSQLite Database

con <- dbConnect(RSQLite::SQLite(), "ML_Survey.sqlite" ,overwrite =TRUE )
dbListTables(con)
## [1] "MCR_Tb"

Load data from github into R

df <-  read.csv(file = "https://raw.githubusercontent.com/mandiemannz/Data-607--Fall-18/master/multipleChoiceResponses.csv", header= TRUE)

Sequence added to dataframe

# add sequence to the dataframe 
df<- tibble::rowid_to_column(df, "ID")
col_index <- c(1:15, 38:47,70:82,134,168:172,208:211)
df2 <- df[,col_index]
dbWriteTable(con,  "MCR_Tb", df2, overwrite= T)
dbListTables(con)
## [1] "MCR_Tb"

Display Database tables

dbListFields(con, "MCR_Tb")
##  [1] "ID"                               
##  [2] "GenderSelect"                     
##  [3] "Country"                          
##  [4] "Age"                              
##  [5] "EmploymentStatus"                 
##  [6] "StudentStatus"                    
##  [7] "LearningDataScience"              
##  [8] "CodeWriter"                       
##  [9] "CareerSwitcher"                   
## [10] "CurrentJobTitleSelect"            
## [11] "TitleFit"                         
## [12] "CurrentEmployerType"              
## [13] "MLToolNextYearSelect"             
## [14] "MLMethodNextYearSelect"           
## [15] "LanguageRecommendationSelect"     
## [16] "JobSkillImportanceBigData"        
## [17] "JobSkillImportanceDegree"         
## [18] "JobSkillImportanceStats"          
## [19] "JobSkillImportanceEnterpriseTools"
## [20] "JobSkillImportancePython"         
## [21] "JobSkillImportanceR"              
## [22] "JobSkillImportanceSQL"            
## [23] "JobSkillImportanceKaggleRanking"  
## [24] "JobSkillImportanceMOOC"           
## [25] "JobSkillImportanceVisualizations" 
## [26] "EmployerIndustry"                 
## [27] "EmployerSize"                     
## [28] "EmployerSizeChange"               
## [29] "EmployerMLTime"                   
## [30] "EmployerSearchMethod"             
## [31] "UniversityImportance"             
## [32] "JobFunctionSelect"                
## [33] "WorkHardwareSelect"               
## [34] "WorkDataTypeSelect"               
## [35] "WorkProductionFrequency"          
## [36] "WorkDatasetSize"                  
## [37] "WorkAlgorithmsSelect"             
## [38] "WorkToolsSelect"                  
## [39] "WorkMethodsSelect"                
## [40] "TimeGatheringData"                
## [41] "TimeModelBuilding"                
## [42] "TimeProduction"                   
## [43] "TimeVisualizing"                  
## [44] "TimeFindingInsights"              
## [45] "CompensationAmount"               
## [46] "CompensationCurrency"             
## [47] "SalaryChange"                     
## [48] "JobSatisfaction"

Display Database data

display_cols <- dbReadTable(con, "MCR_Tb")

head(display_cols, 3)
##   ID                                      GenderSelect       Country Age
## 1  1 Non-binary, genderqueer, or gender non-conforming                NA
## 2  2                                            Female United States  30
## 3  3                                              Male        Canada  28
##                     EmploymentStatus StudentStatus LearningDataScience
## 1                 Employed full-time                                  
## 2 Not employed, but looking for work                                  
## 3 Not employed, but looking for work                                  
##   CodeWriter CareerSwitcher CurrentJobTitleSelect TitleFit
## 1        Yes                DBA/Database Engineer     Fine
## 2                                                         
## 3                                                         
##                                                                           CurrentEmployerType
## 1 Employed by a company that doesn't perform advanced analytics,Employed by non-profit or NGO
## 2                                                                                            
## 3                                                                                            
##   MLToolNextYearSelect MLMethodNextYearSelect LanguageRecommendationSelect
## 1             SAS Base         Random Forests                           F#
## 2               Python         Random Forests                       Python
## 3  Amazon Web services          Deep learning                            R
##   JobSkillImportanceBigData JobSkillImportanceDegree
## 1                                                   
## 2                                       Nice to have
## 3                 Necessary                         
##   JobSkillImportanceStats JobSkillImportanceEnterpriseTools
## 1                                                          
## 2             Unnecessary                                  
## 3                                                          
##   JobSkillImportancePython JobSkillImportanceR JobSkillImportanceSQL
## 1                                                                   
## 2              Unnecessary                                 Necessary
## 3                                    Necessary                      
##   JobSkillImportanceKaggleRanking JobSkillImportanceMOOC
## 1                                                       
## 2                                                       
## 3                                                       
##   JobSkillImportanceVisualizations EmployerIndustry         EmployerSize
## 1                                    Internet-based 100 to 499 employees
## 2                                                                       
## 3                                                                       
##   EmployerSizeChange EmployerMLTime
## 1 Increased slightly      3-5 years
## 2                                  
## 3                                  
##                                             EmployerSearchMethod
## 1 I visited the company's Web site and found a job listing there
## 2                                                               
## 3                                                               
##   UniversityImportance
## 1   Not very important
## 2                     
## 3                     
##                                                    JobFunctionSelect
## 1 Build prototypes to explore applying machine learning to new areas
## 2                                                                   
## 3                                                                   
##                                                      WorkHardwareSelect
## 1 Gaming Laptop (Laptop + CUDA capable GPU),Workstation + Cloud service
## 2                                                                      
## 3                                                                      
##          WorkDataTypeSelect WorkProductionFrequency WorkDatasetSize
## 1 Text data,Relational data                  Rarely            10GB
## 2                                                                  
## 3                                                                  
##                  WorkAlgorithmsSelect
## 1 Neural Networks,Random Forests,RNNs
## 2                                    
## 3                                    
##                                                    WorkToolsSelect
## 1 Amazon Web services,Oracle Data Mining/ Oracle R Enterprise,Perl
## 2                                                                 
## 3                                                                 
##                                                                                           WorkMethodsSelect
## 1 Association Rules,Collaborative Filtering,Neural Networks,PCA and Dimensionality Reduction,Random Forests
## 2                                                                                                          
## 3                                                                                                          
##   TimeGatheringData TimeModelBuilding TimeProduction TimeVisualizing
## 1                 0               100              0               0
## 2                NA                NA             NA              NA
## 3                NA                NA             NA              NA
##   TimeFindingInsights CompensationAmount CompensationCurrency
## 1                   0                                        
## 2                  NA                                        
## 3                  NA                                        
##                  SalaryChange JobSatisfaction
## 1 I am not currently employed               5
## 2                                            
## 3
dbDisconnect(con)

Tidy/Clean Data

dataskills <- display_cols

Create corpus from data

review_text <- paste(dataskills, collapse=" ")
review_source <- VectorSource(review_text)
corpus <- Corpus(review_source)

Text Mining:

Use text mining (TM) to extract count of words using a corpus. Text Mining package also filters out “stop words” - words that don’t have value (this, is, and), numbers, and other unnecessary words that don’t add value (as defined by us).

corpus <- tm_map(corpus, content_transformer(tolower))

corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeWords, c("important", "kaggle", "somewhat", "useful", "yes", "etc", "often", "enough", "courses", "non", "nice", "laptop", "coursera", "year", "udemy", "run", "youtube", "socrata", "workstation", "online", "edx", "sometimes", "employed", "logistic", "male", "necessary", "company", "increased"))

dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)


frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing=T)

table <- head(frequency, 20)

Wordcloud of top words from within our dataset.

Wordclouds give a quick and easy display of our top words. This allows us to quickly see which words are among the top for data science skills.

words <- names(frequency)
wordcloud(words[1:100], frequency[1:100], 
          colors=brewer.pal(8, "Dark2"))

Analysis

Histogram of Frequent words

Looking at the most frequent words, it seems that most data science skills from the kaggle survey relate to the following words:

wf <- data.frame(word=names(frequency), frequency=frequency)

ggplot(subset(wf, frequency>5000), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", aes(fill= reorder(word, -frequency))) +
  theme(axis.text.x=element_text(angle=45, hjust=1)) + 
  theme(legend.position="none") +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Most frequent words by count")

Top 2 words (data and time) removed from dataset. This allows us to subset the data and see more easily the variation between the top variables.

ggplot(subset(wf, frequency>5000 & frequency < 20000), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", aes(fill= reorder(word, -frequency))) +
  theme(axis.text.x=element_text(angle=45, hjust=1)) + 
  theme(legend.position="none") +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Most frequent words by count")

The data shows that potential employers would value some of the following skills: time (assuming time series), python, regression, machine learning, SQL.

table <- head(wf, 20)
kable(table, "html", escape = F) %>%
  kable_styling("striped", full_width = T) %>%
  column_spec(1, bold = T)
word frequency
data data 28166
time time 17558
python python 14727
regression regression 13783
learning learning 11802
full full 10897
analytics analytics 8456
fine fine 7501
decision decision 7491
trees trees 7491
employees employees 7449
text text 7153
forests forests 6986
random random 6986
neural neural 6940
advanced advanced 6309
years years 6251
networks networks 6164
machine machine 5620
analysis analysis 5298