The original data was pulled from Indeed, https://www.indeed.com/.
The program written in Python created csv file Skill.csv https://github.com/mgroysman/Data-607-Week-12-Assignment-
The data was loaded in MySQL normalized and outputed into csv file SkillsDataforR.csv (SQL query “SQL Load Skills V3.sql”)
Connecting to MySQL database using RMySQL package.
Moving MySQL table Jobs.output_for_r into R dataframe
#install.packages("RMySQL")
library(RMySQL)
## Loading required package: DBI
mydb = dbConnect(MySQL(), user='root', password='South@30', dbname='Jobs', host='localhost')
dbListTables(mydb)
## [1] "companies" "jobsxwalk" "output_for_r" "skills"
## [5] "tblcompanies" "tbljobs" "tbljobsxwalk" "tbllocations"
## [9] "tblskills"
MySQL <- dbSendQuery(mydb, "SELECT * FROM Jobs.output_for_r")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 3 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 5 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 7 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 8 imported
## as numeric
Rdata = fetch(MySQL, n=-1)
#MySQL<-read.csv(file='C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/SkillsDataforR.csv',header=FALSE)
#head(MySQL)
#colnames(MySQL)<-c("Skills","SkillsKey","Title","TitleKey","Company","CompanyKey","Location","LocationKey","PKey")
#head(MySQL)
#summary(MySQL)
summary(Rdata)
## SKILL SKILLKEY JOBTITLE JOBID
## Length:30410 Min. : 1.0 Length:30410 Min. : 1
## Class :character 1st Qu.:12.0 Class :character 1st Qu.: 685
## Mode :character Median :24.0 Mode :character Median :1302
## Mean :26.1 Mean :1417
## 3rd Qu.:37.0 3rd Qu.:2099
## Max. :56.0 Max. :3451
## COMPANY COMPID LOCATION LOCID
## Length:30410 Min. : 1.0 Length:30410 Min. : 1.0
## Class :character 1st Qu.: 292.0 Class :character 1st Qu.: 39.0
## Mode :character Median : 729.0 Mode :character Median :137.0
## Mean : 750.6 Mean :236.2
## 3rd Qu.:1137.0 3rd Qu.:387.0
## Max. :1890.0 Max. :934.0
## ID
## Min. : 1
## 1st Qu.: 7603
## Median :15206
## Mean :15206
## 3rd Qu.:22808
## Max. :30410
head(Rdata)
## SKILL SKILLKEY JOBTITLE JOBID
## 1 all 1 Sr. Data Scientist 1
## 2 python 2 Sr. Data Scientist 1
## 3 r 3 Sr. Data Scientist 1
## 4 modeling 10 Sr. Data Scientist 1
## 5 Machine learning 11 Sr. Data Scientist 1
## 6 SQL 12 Sr. Data Scientist 1
## COMPANY COMPID LOCATION LOCID ID
## 1 Confidential, Outsourcing 1 NA 1 1
## 2 Confidential, Outsourcing 1 NA 1 2
## 3 Confidential, Outsourcing 1 NA 1 3
## 4 Confidential, Outsourcing 1 NA 1 4
## 5 Confidential, Outsourcing 1 NA 1 5
## 6 Confidential, Outsourcing 1 NA 1 6
#install.packages("mongolite")
library(mongolite)
my_collection = mongo(collection = "DataScience_Skills_MongoDB", db = "DataScience") # create connection, database and collection
my_collection$insert(Rdata)
## List of 5
## $ nInserted : num 30410
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
my_collection$count()
## [1] 30410