library(RSQLite)
## Warning: package 'RSQLite' was built under R version 3.3.3
# need to add this library first
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.3.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.3.3
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library (tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v readr 1.1.1
## v tibble 1.4.2 v purrr 0.2.4
## v tidyr 0.8.0 v stringr 1.3.1
## v ggplot2 3.0.0 v forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.3.3
## Warning: package 'tidyr' was built under R version 3.3.3
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'purrr' was built under R version 3.3.3
## Warning: package 'forcats' was built under R version 3.3.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
setwd ('E:/Proj3 Nov3 Charlie FolderGithub/csvs')
# setwd ('C:/Users/User/Desktop/To Email DeskTop')
#E:\Proj3 Nov3 Charlie FolderGithub\csvs
# Physical library on Gracie's PC
getwd()
## [1] "E:/Proj3 Nov3 Charlie FolderGithub/csvs"
#try(setwd("csvs"))
# Relative path in Github
Database1: Start with Indeed Data Scraping Data
# Load csv file containing foreign key into a dataframe
Keyword_Search_csv_df <- read.csv("Indeed_Keyword_Search_data.csv")
# Load csv file containing primary key into a dataframe
keywords_csv_df <- read.csv("Indeed_Keyword.csv")
# Select relevent columns for database table insert
Keyword_Search_csv_df <- Keyword_Search_csv_df %>%
select(Data_Scientist_Skills_ID, Company, Web_scrape_date, state, job_post_link)
head(Keyword_Search_csv_df)
## Data_Scientist_Skills_ID Company Web_scrape_date state
## 1 12 Quest Groups LLC 2018-10-30 NY
## 2 2 Deloitte 2018-10-30 NY
## 3 12 Rockstar New York 2018-10-30 NY
## 4 7 Rockstar New York 2018-10-30 NY
## 5 4 Rockstar New York 2018-10-30 NY
## 6 2 Rockstar New York 2018-10-30 NY
## job_post_link
## 1 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0CINkGedeUSmfoCD6vzIo7nqDgc1O7Ij50xtHbnUDtgdnRvMdLQBRXYG5gtOfJ3Va02of8kg1VfdMuKdphvIpMPmeynQADeiZ11NfG0wpQx4hOTpk9c12TK_az10UfrGEqD0nbAMNx96WnM_k8jqI_n0sQlAZHQ8BfkLnQpT9TgyDk8H3QkGGKpCrOpbc7arEtMGYxoS5p7P2IuWPGWHpBXSUr68QKx72H0olEpBb3lnb-Z4uQ8oOW8-hasNnXMwU5XOodw1n1R5WL9hVSwxddtyDTFRVWuaMe1achnuPrZA_BEgjoyYetQzBzO2rwdpyhcMaj5Ckwl1GoP8E7rXpD8okonBbIeMSiudkjOXNok_N4A2TNQen0JkUtdKru9M1RlSOE9itsM06RqB0JZ1FzpvZPh5jaBKNDWdM29mxaZIfSFd-LbjGgJ&vjs=3&p=5&sk=&fvj=1
## 2 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0DIXU_djF9v0NrX_xbLRwj6RWFeuMEgTY6VvwKgvleOVN2aNd4LWgpetAhtYQED-V7CCI0ZA8vfYA-_lRlKhPw6BT146FI1D-089TTA6b7rR35mjFQXU6K3FvPbBs-WDB1n8Jq9KtnGVq2d4ceA9he2FkVHOYHDywnPVLBEbuTEQ50cHrKbMBifPwsXXEhoUchseWpQKX1aypOmLHi_Wql8QswTaCTEb_iACGNxQbB_3F1orLgfYAOPcYnWDIOEAPbhf_jZ5RvhEWWGkbvlEFxQd8xoB2UNnpJvM1uFl1uT6aP_YrhGI8906Amc5P9f07OEAsT7XTjmek5wRV_cZN7Vb3MgYrT0aHQptZDGwxaK8mRuIOALQf0dPya7QFqYd9xneTfWFmF17BiVOLC8FwK_sv_WDMSoC9vO9XTsCANPIIG67rtZ8mjIZTMlHVn3zRIhgCVHYtzth_5z8T0eHBNYKzkfl51IB_IKcEiM2r0nsA==&vjs=3&p=2&sk=&fvj=0
## 3 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A91Vduf5Jnd4MOZNnJpSBueM4-3evPFU-42Fh3zggbfpnitKphXPj6xer_fRzI4FNiNTTeaNvFkFefAYIJLVr4nqlfQ1oV2g9DpmgK3r_gFtfmcPSVMlggvr8BL-Gx5xBkeRM8EZNbgbkPNVGACnGO34yPTKyXZEWFJMNvVb_vE-f5oJD9EXTipNT_kchuvevCCOursfVz_z3Gk-A_X-fM1it3awVX4I-BNdcq8llR6_7zzM6ASV0_AQr4LDjGE1eWG2DZpQ8_ycmWUA99xM_V-chMfzqq65EmemBmWE2P8comPd_ZiHJ3GYbrRADvKNnE63yhBBbG2DJDVHd_YFFC0v8uOqzpdP1Jumbvk-IOTLtb2clCpRKzyN1lA8yoQPMcaN8Nm7YOan1ClCkOqIDPikBaVzJD5c2BpmJpdKcGAvj1Pjm3AQZB6dbLKx7ORZYHPBxZiKy1Tibz378DXadG53nmbKqZHo0FEYDQWKHYt6agj5ChMRxy-z246Q4660C3VE9GVpeteOeJ-KMaQ0sRBzPTdS8CCPq-fsnOHBETOg==&vjs=3&p=1&sk=&fvj=0
## 4 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A91Vduf5Jnd4MOZNnJpSBueM4-3evPFU-42Fh3zggbfpnitKphXPj6xer_fRzI4FNiNTTeaNvFkFefAYIJLVr4nqlfQ1oV2g9DpmgK3r_gFtfmcPSVMlggvr8BL-Gx5xBkeRM8EZNbgbkPNVGACnGO34yPTKyXZEWFJMNvVb_vE-f5oJD9EXTipNT_kchuvevCCOursfVz_z3Gk-A_X-fM1it3awVX4I-BNdcq8llR6_7zzM6ASV0_AQr4LDjGE1eWG2DZpQ8_ycmWUA99xM_V-chMfzqq65EmemBmWE2P8comPd_ZiHJ3GYbrRADvKNnE63yhBBbG2DJDVHd_YFFC0v8uOqzpdP1Jumbvk-IOTLtb2clCpRKzyN1lA8yoQPMcaN8Nm7YOan1ClCkOqIDPikBaVzJD5c2BpmJpdKcGAvj1Pjm3AQZB6dbLKx7ORZYHPBxZiKy1Tibz378DXadG53nmbKqZHo0FEYDQWKHYt6agj5ChMRxy-z246Q4660C3VE9GVpeteOeJ-KMaQ0sRBzPTdS8CCPq-fsnOHBETOg==&vjs=3&p=1&sk=&fvj=0
## 5 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A91Vduf5Jnd4MOZNnJpSBueM4-3evPFU-42Fh3zggbfpnitKphXPj6xer_fRzI4FNiNTTeaNvFkFefAYIJLVr4nqlfQ1oV2g9DpmgK3r_gFtfmcPSVMlggvr8BL-Gx5xBkeRM8EZNbgbkPNVGACnGO34yPTKyXZEWFJMNvVb_vE-f5oJD9EXTipNT_kchuvevCCOursfVz_z3Gk-A_X-fM1it3awVX4I-BNdcq8llR6_7zzM6ASV0_AQr4LDjGE1eWG2DZpQ8_ycmWUA99xM_V-chMfzqq65EmemBmWE2P8comPd_ZiHJ3GYbrRADvKNnE63yhBBbG2DJDVHd_YFFC0v8uOqzpdP1Jumbvk-IOTLtb2clCpRKzyN1lA8yoQPMcaN8Nm7YOan1ClCkOqIDPikBaVzJD5c2BpmJpdKcGAvj1Pjm3AQZB6dbLKx7ORZYHPBxZiKy1Tibz378DXadG53nmbKqZHo0FEYDQWKHYt6agj5ChMRxy-z246Q4660C3VE9GVpeteOeJ-KMaQ0sRBzPTdS8CCPq-fsnOHBETOg==&vjs=3&p=1&sk=&fvj=0
## 6 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A91Vduf5Jnd4MOZNnJpSBueM4-3evPFU-42Fh3zggbfpnitKphXPj6xer_fRzI4FNiNTTeaNvFkFefAYIJLVr4nqlfQ1oV2g9DpmgK3r_gFtfmcPSVMlggvr8BL-Gx5xBkeRM8EZNbgbkPNVGACnGO34yPTKyXZEWFJMNvVb_vE-f5oJD9EXTipNT_kchuvevCCOursfVz_z3Gk-A_X-fM1it3awVX4I-BNdcq8llR6_7zzM6ASV0_AQr4LDjGE1eWG2DZpQ8_ycmWUA99xM_V-chMfzqq65EmemBmWE2P8comPd_ZiHJ3GYbrRADvKNnE63yhBBbG2DJDVHd_YFFC0v8uOqzpdP1Jumbvk-IOTLtb2clCpRKzyN1lA8yoQPMcaN8Nm7YOan1ClCkOqIDPikBaVzJD5c2BpmJpdKcGAvj1Pjm3AQZB6dbLKx7ORZYHPBxZiKy1Tibz378DXadG53nmbKqZHo0FEYDQWKHYt6agj5ChMRxy-z246Q4660C3VE9GVpeteOeJ-KMaQ0sRBzPTdS8CCPq-fsnOHBETOg==&vjs=3&p=1&sk=&fvj=0
# Select relevent columns for database table insert
keywords_csv_df <- keywords_csv_df %>%
select(ID, Keyword)
head(keywords_csv_df)
## ID Keyword
## 1 18 PHP
## 2 17 JS
## 3 16 Ruby
## 4 15 Statistics
## 5 14 Probability
## 6 13 Machine Learning
Database2: continue with the ONET (Labor Dept Data) ONET Characteristics table is the main table from ONET database, then Salary table, then Skills (Soft) table
characteristics <- read.csv ("characteristics.csv")
characteristics <- characteristics %>%
select (Title,SOC , OccupationType, TypicalEntryLvlEduc,
PreEmplExperience, PostEmplTraining)
head(characteristics)
## Title SOC OccupationType
## 1 Computer and information research scientists 1111 Line item
## 2 Computer systems analysts 1121 Line item
## 3 Information security analysts 1122 Line item
## 4 Computer programmers 1131 Line item
## 5 Software developers, applications 1132 Line item
## 6 Software developers, systems software 1133 Line item
## TypicalEntryLvlEduc PreEmplExperience PostEmplTraining
## 1 Master's degree None None
## 2 Bachelor's degree None None
## 3 Bachelor's degree Less than 5 years None
## 4 Bachelor's degree None None
## 5 Bachelor's degree None None
## 6 Bachelor's degree None None
salary <- read.csv ("salary.csv")
salary <- salary %>%
select ( SOC, No.Employees, RSE, Mean.Hourly.Wage, Mean.Annual.Wage, Wage.RSE)
head(salary)
## SOC No.Employees RSE Mean.Hourly.Wage Mean.Annual.Wage Wage.RSE
## 1 1133 394,590 1.3 % $53.74 111780 0.4 %
## 2 1134 125,890 1.3 % $35.63 74110 0.6 %
## 3 1141 113,690 1.1 % $42.81 89050 0.3 %
## 4 1142 375,040 0.8 % $41.51 86340 0.3 %
## 5 1143 157,830 1.6 % $51.86 107870 0.5 %
## 6 1151 613,780 0.7 % $26.03 54150 0.4 %
The linkage bw ONET and INDEED data is the technical skills ## Tech Skill format 1: wide format, each skill is spreaded
skills_sum <- read.csv ('skills_sum.csv')
skills_sum %>%
select (jobNoNA, jobcodeNoNA, C, Django, JavaScript,
MongoDB, MySQL, NoSQL, PostgreSQL,
R, Ruby ,SAS, STATISTICA, SuperANOVA, Tableau )
## jobNoNA jobcodeNoNA C
## 1 Biostatisticians 15-2041.01 0
## 2 Acturies 15-2011.00 0
## 3 Computer and Information Research Scientists 15-1111.00 1
## 4 Data Administrator 15-1141.00 1
## 5 Clinical Data Manager 15-2041.02 0
## 6 Computer User Support Specialists 15-1151.00 1
## 7 Compensation and Benefits managers 11-3111.00 0
## 8 Market Research Analysts and Marketing Specialists 13-1161.00 0
## 9 Auditors 13-2011.02 0
## 10 Financial Analysts 13-2051.00 0
## 11 Risk Management Specialists 13-2099.02 0
## 12 Mathematicians 15-2021.00 1
## 13 Operations Research Analysts 15-2031.00 1
## 14 Statistician 15-2041.00 0
## 15 Economists 19-3011.00 0
## 16 Survery Rsearchers 19-3022.00 0
## 17 Regulatory Affairs Speacialists 13-1041.07 0
## 18 Compensation, Benefits and Job Analysis Specialists 13-1141.00 0
## 19 Credit Analysts 13-2041.00 0
## 20 Insurance Underwriters 13-2053.00 0
## 21 Social Science Research Assistant 13-2053.00 0
## 22 Computer System Analysts 15-1121.00 0
## 23 Software Developers 15-1133.00 0
## 24 Computer Systems Engineers Archtects 15-1199.02 1
## 25 Remote Sensing Scientists and Technologists 19-2099.01 1
## 26 Computer Science Teachers, Postsecondary 25-1021.00 1
## 27 Audio_Visual and Multimedia Collections Specialists 25-9011.00 0
## 28 Adudio and Video Equipment Technicians 27-4011.00 0
## 29 Broadcast Technicians 27-4012.00 0
## 30 Computer Operators 43-9011.00 0
## 31 Computer, Automated Teller 49-2011.00 0
## 32 Computer Programmer 15-1131.00 1
## 33 Web Developers 15-1134.00 1
## Django JavaScript MongoDB MySQL NoSQL PostgreSQL R Ruby SAS STATISTICA
## 1 0 0 0 1 0 0 1 0 1 1
## 2 0 0 0 0 0 0 1 0 1 0
## 3 1 1 1 1 1 1 1 1 1 0
## 4 1 1 1 1 1 1 1 1 1 0
## 5 0 0 0 0 0 0 0 0 1 0
## 6 0 1 0 1 0 1 0 1 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 0 1 0 0 0 0 1 0 0 0
## 9 0 0 0 0 0 0 1 0 1 0
## 10 0 0 0 0 0 0 1 0 1 0
## 11 0 0 0 0 0 0 1 1 1 0
## 12 0 1 0 1 0 0 1 0 1 0
## 13 0 0 0 1 0 0 1 0 1 0
## 14 0 0 0 0 0 0 1 0 1 1
## 15 0 0 0 1 0 0 0 0 1 0
## 16 0 1 0 0 0 0 0 0 1 0
## 17 0 0 0 0 0 0 0 0 0 0
## 18 0 0 0 0 0 0 0 0 1 0
## 19 0 0 0 0 0 0 0 0 1 0
## 20 0 0 0 0 0 0 0 0 0 0
## 21 0 0 0 0 0 0 0 0 1 0
## 22 0 0 1 1 0 1 0 0 1 0
## 23 0 1 1 0 1 1 0 0 1 0
## 24 0 0 1 0 0 1 0 0 1 0
## 25 0 0 0 0 0 0 0 0 0 0
## 26 0 0 0 0 0 0 0 0 0 0
## 27 0 0 0 0 0 0 0 0 0 0
## 28 0 0 0 0 0 0 0 0 0 0
## 29 0 0 0 0 0 0 0 0 0 0
## 30 0 0 0 0 0 0 0 0 0 0
## 31 0 0 0 0 0 0 0 0 0 0
## 32 0 0 1 0 0 1 0 0 1 0
## 33 0 1 1 0 1 1 0 0 0 0
## SuperANOVA Tableau
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 0
## 6 0 1
## 7 0 0
## 8 0 0
## 9 0 1
## 10 0 1
## 11 0 1
## 12 0 0
## 13 0 1
## 14 1 1
## 15 0 0
## 16 0 1
## 17 0 0
## 18 0 0
## 19 0 0
## 20 0 0
## 21 0 0
## 22 0 0
## 23 0 0
## 24 0 1
## 25 0 0
## 26 0 0
## 27 0 0
## 28 0 0
## 29 0 0
## 30 0 0
## 31 0 0
## 32 0 1
## 33 0 1
head(skills_sum)
## X jobNoNA jobcodeNoNA
## 1 1 Biostatisticians 15-2041.01
## 2 2 Acturies 15-2011.00
## 3 3 Computer and Information Research Scientists 15-1111.00
## 4 4 Data Administrator 15-1141.00
## 5 5 Clinical Data Manager 15-2041.02
## 6 6 Computer User Support Specialists 15-1151.00
## Apache.Cassandra Apache.Hadoop Apache.Hive Apache.Pig C C.. C...
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 1 1 1 1 1 0 0
## 4 1 1 1 1 1 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 1 0 0
## Computational.statistics.software Django Hypertext.markup.language.HTML
## 1 0 0 0
## 2 0 0 0
## 3 1 1 0
## 4 0 1 1
## 5 0 0 0
## 6 0 0 1
## IBM.SPSS.Statistics JavaScript JavaScript.Object.Notation.JSON
## 1 1 0 0
## 2 1 0 0
## 3 1 1 0
## 4 1 1 1
## 5 1 0 0
## 6 0 1 0
## Micosoft.SQL.Server.Analysis.Services.SSAS Microsoft.SQL.Server
## 1 0 1
## 2 0 0
## 3 0 1
## 4 1 1
## 5 0 1
## 6 0 1
## Microsoft.SQL.Server.Integration.Services.SSIS MongoDB MySQL
## 1 0 0 1
## 2 0 0 0
## 3 0 1 1
## 4 1 1 1
## 5 0 0 0
## 6 0 0 1
## NCR.Teradata.Warehouse.Miner NeuroSolutions.for.MatLab NoSQL Oracle.Java
## 1 0 0 0 1
## 2 0 0 0 0
## 3 0 0 1 1
## 4 0 0 1 1
## 5 0 0 0 1
## 6 0 0 0 1
## Oracle.JavaServer.Pages.JSP Oracle.PL.SQL Oracle.SQL.Loader
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 1 1 1
## 5 0 0 1
## 6 1 1 0
## Oracle.SQL.Plus PostgreSQL
## 1 0 0
## 2 0 0
## 3 0 1
## 4 1 1
## 5 0 0
## 6 0 1
## Python............................................................... R
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 0
## 6 0 0
## Redgate.SQL.Server Ruby Ruby.on.Rails SAS SAS.Enterprise.Miner SAS.JMP
## 1 0 0 0 1 0 0
## 2 0 0 0 1 0 0
## 3 0 1 0 1 0 0
## 4 1 1 1 1 0 0
## 5 0 0 0 1 0 1
## 6 0 1 0 0 0 0
## SAS.CONNECT STATISTICA Statistical.software Statistical.Solutions.BMDP
## 1 0 1 0 0
## 2 0 0 1 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## Structured.query.language.SQL
## 1 1
## 2 1
## 3 1
## 4 1
## 5 0
## 6 1
## Sun.Microsystems.Java.2.Platform.Enterprise.Edition.J2EE SuperANOVA
## 1 0 0
## 2 0 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## Tableau Teradata.Enterprise.Data.Warehouse The.MathWorks.MATLAB
## 1 0 0 1
## 2 0 0 0
## 3 1 0 1
## 4 1 0 1
## 5 0 0 0
## 6 1 0 0
## UNISTAT.Statistical.Package
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
#str(skills_sum)
Connect to the database ## we create a temporary SQLite database called Proj3_67 (changed name to mydb)
mydb <- dbConnect(RSQLite::SQLite(), "Proj3_607.db")
# List out how many tables are in this object
# dbListTables(mydb)
# TEST, to get query from this database (ONET Characteristic table only for now)
dbGetQuery (mydb,"Select* FROM C")
## Title SOC OccupationType
## 1 Computer and information research scientists 1111 Line item
## 2 Computer systems analysts 1121 Line item
## 3 Information security analysts 1122 Line item
## 4 Computer programmers 1131 Line item
## 5 Software developers, applications 1132 Line item
## 6 Software developers, systems software 1133 Line item
## 7 Web developers 1134 Line item
## 8 Database administrators 1141 Line item
## 9 Network and computer systems administrators 1142 Line item
## 10 Computer network architects 1143 Line item
## 11 Computer user support specialists 1151 Line item
## 12 Computer network support specialists 1152 Line item
## 13 Computer occupations, all other 1199 Line item
## 14 Actuaries 2011 Line item
## 15 Mathematicians 2021 Line item
## 16 Operations research analysts 2031 Line item
## 17 Statisticians 2041 Line item
## 18 Mathematical technicians 2091 Line item
## 19 Mathematical science occupations, all other 2099 Line item
## TypicalEntryLvlEduc PreEmplExperience PostEmplTraining
## 1 Master's degree None None
## 2 Bachelor's degree None None
## 3 Bachelor's degree Less than 5 years None
## 4 Bachelor's degree None None
## 5 Bachelor's degree None None
## 6 Bachelor's degree None None
## 7 Associate's degree None None
## 8 Bachelor's degree None None
## 9 Bachelor's degree None None
## 10 Bachelor's degree 5 years or more None
## 11 Some college, no degree None None
## 12 Associate's degree None None
## 13 Bachelor's degree None None
## 14 Bachelor's degree None Long-term on-the-job training
## 15 Master's degree None None
## 16 Bachelor's degree None None
## 17 Master's degree None None
## 18 Bachelor's degree None None
## 19 Bachelor's degree None None
Relational Database 1st set # Two relational Tables within Indeed Web Scraping, linked by Primary Key and Foreigh Key
# Create/Populate tables with data from the dataframes
dbWriteTable(mydb, "webscrapes", Keyword_Search_csv_df, overwrite = TRUE)
dbWriteTable(mydb, "keywords", keywords_csv_df, overwrite = TRUE)
# Join the two tables using the primary and foreing key(relational database)
join_tb_indeed <- "select keywords.Keyword, webscrapes.Company, webscrapes.Web_scrape_date,webscrapes.state
from webscrapes
inner join keywords
on keywords.ID = webscrapes.Data_Scientist_Skills_ID"
head(dbGetQuery(mydb, join_tb_indeed))
## Keyword Company Web_scrape_date state
## 1 Hadoop Quest Groups LLC 2018-10-30 NY
## 2 Python Deloitte 2018-10-30 NY
## 3 Hadoop Rockstar New York 2018-10-30 NY
## 4 Tableau Rockstar New York 2018-10-30 NY
## 5 SQL Rockstar New York 2018-10-30 NY
## 6 Python Rockstar New York 2018-10-30 NY
tail(dbGetQuery(mydb, join_tb_indeed))
## Keyword Company Web_scrape_date state
## 1225 SQL Stairs Alliance 2018-10-30 CA
## 1226 Machine Learning Amazon.com 2018-10-30 CA
## 1227 SparkML Amazon.com 2018-10-30 CA
## 1228 AI Amazon.com 2018-10-30 CA
## 1229 Python Amazon.com 2018-10-30 CA
## 1230 AWS Amazon.com 2018-10-30 CA
Relational Database 2nd set:
#Two relational Tables within ONET (Labor dept) Web Scraping, linked by Primary Key and Foreigh Key
# ONET Characteristics table is the main table from ONET database,
dbWriteTable(mydb,"C", characteristics, overwrite = TRUE)
dbWriteTable(mydb,"salary", salary, overwrite = TRUE)
join_tb_onetsalary <- "select C.title, C.SOC, C.OccupationType, C.TypicalEntryLvlEduc,
C.PreEmplExperience, C.PostEmplTraining,
salary.SOC, RSE
from C
inner join salary
on C.SOC = salary.SOC"
head (dbGetQuery(mydb,join_tb_onetsalary))
## Title SOC OccupationType
## 1 Software developers, systems software 1133 Line item
## 2 Web developers 1134 Line item
## 3 Database administrators 1141 Line item
## 4 Network and computer systems administrators 1142 Line item
## 5 Computer network architects 1143 Line item
## 6 Computer user support specialists 1151 Line item
## TypicalEntryLvlEduc PreEmplExperience PostEmplTraining SOC RSE
## 1 Bachelor's degree None None 1133 1.3 %
## 2 Associate's degree None None 1134 1.3 %
## 3 Bachelor's degree None None 1141 1.1 %
## 4 Bachelor's degree None None 1142 0.8 %
## 5 Bachelor's degree 5 years or more None 1143 1.6 %
## 6 Some college, no degree None None 1151 0.7 %
Relational Database 3rd set: Link INDEED web scraping and Labor Dept ONET webscraping
# Linkage between those two sets: using the Technical Skills
# Tech Skills linked with ONET Database
join_tb_ONETTechSkl <-
"select C.title, C.SOC, C.OccupationType, C.TypicalEntryLvlEduc,
C.PreEmplExperience, C.PostEmplTraining,
TechSkl.job, TechSkl.jobcode,TechSkl.Example
from C
inner join TechSkl
on C.Title = TechSkl.job"
head (dbGetQuery(mydb,join_tb_ONETTechSkl))
## [1] Title SOC OccupationType
## [4] TypicalEntryLvlEduc PreEmplExperience PostEmplTraining
## [7] job jobcode Example
## <0 rows> (or 0-length row.names)
Tech Skills linked with Indeed Database
join_tb_IndeedTechSkl <- "select keywords.Keyword,
TechSkl.job, TechSkl.jobcode,TechSkl.Example
from TechSkl
inner join keywords
on keywords.Keyword = TechSkl.Example"
head (dbGetQuery(mydb,join_tb_IndeedTechSkl))
## [1] Keyword job jobcode Example
## <0 rows> (or 0-length row.names)
List all the relational tables within the SQlite database
dbListTables(mydb)
## [1] "C" "TechSkl" "keywords" "salary" "webscrapes"
# all tables are displayed here, 6 tables (all relational in total)
List the column fields within each individual tables within the SQLite Database
dbListFields(mydb,"C")
## [1] "Title" "SOC" "OccupationType"
## [4] "TypicalEntryLvlEduc" "PreEmplExperience" "PostEmplTraining"
# dbListFields(mydb,"SftSkl")
# dbListFields(mydb,"TechSkillswOccupation")
dbListFields(mydb,"salary")
## [1] "SOC" "No.Employees" "RSE"
## [4] "Mean.Hourly.Wage" "Mean.Annual.Wage" "Wage.RSE"
dbListFields(mydb,"TechSkl")
## [1] "X" "Section" "Category" "Example" "job" "jobcode"
dbListFields(mydb,"keywords")
## [1] "ID" "Keyword"
dbListFields(mydb,"webscrapes")
## [1] "Data_Scientist_Skills_ID" "Company"
## [3] "Web_scrape_date" "state"
## [5] "job_post_link"
setwd('..')