Project 3

Libraries

library(tidyverse)
library(tm)
library(wordcloud)
library(knitr)
library(kableExtra)
library(reshape2)
library(RSQLite)

SQL Database

Create ephemeral in-memory RSQLite Database

con <- dbConnect(RSQLite::SQLite(), "ML_Survey.sqlite" ,overwrite =TRUE )
dbListTables(con)

## [1] "MCR_Tb"

Load data from github into R

df <-  read.csv(file = "https://raw.githubusercontent.com/mandiemannz/Data-607--Fall-18/master/multipleChoiceResponses.csv", header= TRUE)

Sequence added to dataframe

# add sequence to the dataframe 
df<- tibble::rowid_to_column(df, "ID")

col_index <- c(1:15, 38:47,70:82,134,168:172,208:211)
df2 <- df[,col_index]

dbWriteTable(con,  "MCR_Tb", df2, overwrite= T)
dbListTables(con)

## [1] "MCR_Tb"

Display Database tables

dbListFields(con, "MCR_Tb")

##  [1] "ID"                               
##  [2] "GenderSelect"                     
##  [3] "Country"                          
##  [4] "Age"                              
##  [5] "EmploymentStatus"                 
##  [6] "StudentStatus"                    
##  [7] "LearningDataScience"              
##  [8] "CodeWriter"                       
##  [9] "CareerSwitcher"                   
## [10] "CurrentJobTitleSelect"            
## [11] "TitleFit"                         
## [12] "CurrentEmployerType"              
## [13] "MLToolNextYearSelect"             
## [14] "MLMethodNextYearSelect"           
## [15] "LanguageRecommendationSelect"     
## [16] "JobSkillImportanceBigData"        
## [17] "JobSkillImportanceDegree"         
## [18] "JobSkillImportanceStats"          
## [19] "JobSkillImportanceEnterpriseTools"
## [20] "JobSkillImportancePython"         
## [21] "JobSkillImportanceR"              
## [22] "JobSkillImportanceSQL"            
## [23] "JobSkillImportanceKaggleRanking"  
## [24] "JobSkillImportanceMOOC"           
## [25] "JobSkillImportanceVisualizations" 
## [26] "EmployerIndustry"                 
## [27] "EmployerSize"                     
## [28] "EmployerSizeChange"               
## [29] "EmployerMLTime"                   
## [30] "EmployerSearchMethod"             
## [31] "UniversityImportance"             
## [32] "JobFunctionSelect"                
## [33] "WorkHardwareSelect"               
## [34] "WorkDataTypeSelect"               
## [35] "WorkProductionFrequency"          
## [36] "WorkDatasetSize"                  
## [37] "WorkAlgorithmsSelect"             
## [38] "WorkToolsSelect"                  
## [39] "WorkMethodsSelect"                
## [40] "TimeGatheringData"                
## [41] "TimeModelBuilding"                
## [42] "TimeProduction"                   
## [43] "TimeVisualizing"                  
## [44] "TimeFindingInsights"              
## [45] "CompensationAmount"               
## [46] "CompensationCurrency"             
## [47] "SalaryChange"                     
## [48] "JobSatisfaction"

Display Database data

display_cols <- dbReadTable(con, "MCR_Tb")

head(display_cols, 3)

##   ID                                      GenderSelect       Country Age
## 1  1 Non-binary, genderqueer, or gender non-conforming                NA
## 2  2                                            Female United States  30
## 3  3                                              Male        Canada  28
##                     EmploymentStatus StudentStatus LearningDataScience
## 1                 Employed full-time                                  
## 2 Not employed, but looking for work                                  
## 3 Not employed, but looking for work                                  
##   CodeWriter CareerSwitcher CurrentJobTitleSelect TitleFit
## 1        Yes                DBA/Database Engineer     Fine
## 2                                                         
## 3                                                         
##                                                                           CurrentEmployerType
## 1 Employed by a company that doesn't perform advanced analytics,Employed by non-profit or NGO
## 2                                                                                            
## 3                                                                                            
##   MLToolNextYearSelect MLMethodNextYearSelect LanguageRecommendationSelect
## 1             SAS Base         Random Forests                           F#
## 2               Python         Random Forests                       Python
## 3  Amazon Web services          Deep learning                            R
##   JobSkillImportanceBigData JobSkillImportanceDegree
## 1                                                   
## 2                                       Nice to have
## 3                 Necessary                         
##   JobSkillImportanceStats JobSkillImportanceEnterpriseTools
## 1                                                          
## 2             Unnecessary                                  
## 3                                                          
##   JobSkillImportancePython JobSkillImportanceR JobSkillImportanceSQL
## 1                                                                   
## 2              Unnecessary                                 Necessary
## 3                                    Necessary                      
##   JobSkillImportanceKaggleRanking JobSkillImportanceMOOC
## 1                                                       
## 2                                                       
## 3                                                       
##   JobSkillImportanceVisualizations EmployerIndustry         EmployerSize
## 1                                    Internet-based 100 to 499 employees
## 2                                                                       
## 3                                                                       
##   EmployerSizeChange EmployerMLTime
## 1 Increased slightly      3-5 years
## 2                                  
## 3                                  
##                                             EmployerSearchMethod
## 1 I visited the company's Web site and found a job listing there
## 2                                                               
## 3                                                               
##   UniversityImportance
## 1   Not very important
## 2                     
## 3                     
##                                                    JobFunctionSelect
## 1 Build prototypes to explore applying machine learning to new areas
## 2                                                                   
## 3                                                                   
##                                                      WorkHardwareSelect
## 1 Gaming Laptop (Laptop + CUDA capable GPU),Workstation + Cloud service
## 2                                                                      
## 3                                                                      
##          WorkDataTypeSelect WorkProductionFrequency WorkDatasetSize
## 1 Text data,Relational data                  Rarely            10GB
## 2                                                                  
## 3                                                                  
##                  WorkAlgorithmsSelect
## 1 Neural Networks,Random Forests,RNNs
## 2                                    
## 3                                    
##                                                    WorkToolsSelect
## 1 Amazon Web services,Oracle Data Mining/ Oracle R Enterprise,Perl
## 2                                                                 
## 3                                                                 
##                                                                                           WorkMethodsSelect
## 1 Association Rules,Collaborative Filtering,Neural Networks,PCA and Dimensionality Reduction,Random Forests
## 2                                                                                                          
## 3                                                                                                          
##   TimeGatheringData TimeModelBuilding TimeProduction TimeVisualizing
## 1                 0               100              0               0
## 2                NA                NA             NA              NA
## 3                NA                NA             NA              NA
##   TimeFindingInsights CompensationAmount CompensationCurrency
## 1                   0                                        
## 2                  NA                                        
## 3                  NA                                        
##                  SalaryChange JobSatisfaction
## 1 I am not currently employed               5
## 2                                            
## 3

dbDisconnect(con)

Tidy/Clean Data

dataskills <- display_cols

Create corpus from data

review_text <- paste(dataskills, collapse=" ")
review_source <- VectorSource(review_text)
corpus <- Corpus(review_source)

Text Mining:

Use text mining (TM) to extract count of words using a corpus. Text Mining package also filters out “stop words” - words that don’t have value (this, is, and), numbers, and other unnecessary words that don’t add value (as defined by us).

corpus <- tm_map(corpus, content_transformer(tolower))

corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeWords, c("important", "kaggle", "somewhat", "useful", "yes", "etc", "often", "enough", "courses", "non", "nice", "laptop", "coursera", "year", "udemy", "run", "youtube", "socrata", "workstation", "online", "edx", "sometimes", "employed", "logistic", "male", "necessary", "company", "increased"))

dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)


frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing=T)

table <- head(frequency, 20)

Wordcloud of top words from within our dataset.

Wordclouds give a quick and easy display of our top words. This allows us to quickly see which words are among the top for data science skills.

words <- names(frequency)
wordcloud(words[1:100], frequency[1:100], 
          colors=brewer.pal(8, "Dark2"))

Analysis

Histogram of Frequent words

Looking at the most frequent words, it seems that most data science skills from the kaggle survey relate to the following words:

wf <- data.frame(word=names(frequency), frequency=frequency)

ggplot(subset(wf, frequency>5000), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", aes(fill= reorder(word, -frequency))) +
  theme(axis.text.x=element_text(angle=45, hjust=1)) + 
  theme(legend.position="none") +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Most frequent words by count")

Top 2 words (data and time) removed from dataset. This allows us to subset the data and see more easily the variation between the top variables.

ggplot(subset(wf, frequency>5000 & frequency < 20000), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", aes(fill= reorder(word, -frequency))) +
  theme(axis.text.x=element_text(angle=45, hjust=1)) + 
  theme(legend.position="none") +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Most frequent words by count")

The data shows that potential employers would value some of the following skills: time (assuming time series), python, regression, machine learning, SQL.

table <- head(wf, 20)
kable(table, "html", escape = F) %>%
  kable_styling("striped", full_width = T) %>%
  column_spec(1, bold = T)

	word	frequency
data	data	28166
time	time	17558
python	python	14727
regression	regression	13783
learning	learning	11802
full	full	10897
analytics	analytics	8456
fine	fine	7501
decision	decision	7491
trees	trees	7491
employees	employees	7449
text	text	7153
forests	forests	6986
random	random	6986
neural	neural	6940
advanced	advanced	6309
years	years	6251
networks	networks	6164
machine	machine	5620
analysis	analysis	5298

CUNY MSDS DATA 607 Project 3

Amanda Arce

October 1, 2018