library(RSQLite)
## Warning: package 'RSQLite' was built under R version 3.3.3
  # need to add this library first
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.3.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.3.3
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library (tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v readr   1.1.1
## v tibble  1.4.2     v purrr   0.2.4
## v tidyr   0.8.0     v stringr 1.3.1
## v ggplot2 3.0.0     v forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.3.3
## Warning: package 'tidyr' was built under R version 3.3.3
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'purrr' was built under R version 3.3.3
## Warning: package 'forcats' was built under R version 3.3.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
setwd ('E:/Proj3 Nov3 Charlie FolderGithub/csvs')
# setwd ('C:/Users/User/Desktop/To Email DeskTop')
#E:\Proj3 Nov3 Charlie FolderGithub\csvs
# Physical library on Gracie's PC
getwd()
## [1] "E:/Proj3 Nov3 Charlie FolderGithub/csvs"
#try(setwd("csvs"))
# Relative path in Github

Database1: Start with Indeed Data Scraping Data

# Load csv file containing foreign key into a dataframe
Keyword_Search_csv_df <- read.csv("Indeed_Keyword_Search_data.csv")

# Load csv file containing primary key into a dataframe
keywords_csv_df <- read.csv("Indeed_Keyword.csv")

# Select relevent columns for database table insert
Keyword_Search_csv_df <- Keyword_Search_csv_df %>% 
  select(Data_Scientist_Skills_ID, Company, Web_scrape_date, state, job_post_link)
head(Keyword_Search_csv_df)
##   Data_Scientist_Skills_ID           Company Web_scrape_date state
## 1                       12  Quest Groups LLC      2018-10-30    NY
## 2                        2          Deloitte      2018-10-30    NY
## 3                       12 Rockstar New York      2018-10-30    NY
## 4                        7 Rockstar New York      2018-10-30    NY
## 5                        4 Rockstar New York      2018-10-30    NY
## 6                        2 Rockstar New York      2018-10-30    NY
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            job_post_link
## 1                                                                                                                         https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0CINkGedeUSmfoCD6vzIo7nqDgc1O7Ij50xtHbnUDtgdnRvMdLQBRXYG5gtOfJ3Va02of8kg1VfdMuKdphvIpMPmeynQADeiZ11NfG0wpQx4hOTpk9c12TK_az10UfrGEqD0nbAMNx96WnM_k8jqI_n0sQlAZHQ8BfkLnQpT9TgyDk8H3QkGGKpCrOpbc7arEtMGYxoS5p7P2IuWPGWHpBXSUr68QKx72H0olEpBb3lnb-Z4uQ8oOW8-hasNnXMwU5XOodw1n1R5WL9hVSwxddtyDTFRVWuaMe1achnuPrZA_BEgjoyYetQzBzO2rwdpyhcMaj5Ckwl1GoP8E7rXpD8okonBbIeMSiudkjOXNok_N4A2TNQen0JkUtdKru9M1RlSOE9itsM06RqB0JZ1FzpvZPh5jaBKNDWdM29mxaZIfSFd-LbjGgJ&vjs=3&p=5&sk=&fvj=1
## 2                                                                 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0DIXU_djF9v0NrX_xbLRwj6RWFeuMEgTY6VvwKgvleOVN2aNd4LWgpetAhtYQED-V7CCI0ZA8vfYA-_lRlKhPw6BT146FI1D-089TTA6b7rR35mjFQXU6K3FvPbBs-WDB1n8Jq9KtnGVq2d4ceA9he2FkVHOYHDywnPVLBEbuTEQ50cHrKbMBifPwsXXEhoUchseWpQKX1aypOmLHi_Wql8QswTaCTEb_iACGNxQbB_3F1orLgfYAOPcYnWDIOEAPbhf_jZ5RvhEWWGkbvlEFxQd8xoB2UNnpJvM1uFl1uT6aP_YrhGI8906Amc5P9f07OEAsT7XTjmek5wRV_cZN7Vb3MgYrT0aHQptZDGwxaK8mRuIOALQf0dPya7QFqYd9xneTfWFmF17BiVOLC8FwK_sv_WDMSoC9vO9XTsCANPIIG67rtZ8mjIZTMlHVn3zRIhgCVHYtzth_5z8T0eHBNYKzkfl51IB_IKcEiM2r0nsA==&vjs=3&p=2&sk=&fvj=0
## 3 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A91Vduf5Jnd4MOZNnJpSBueM4-3evPFU-42Fh3zggbfpnitKphXPj6xer_fRzI4FNiNTTeaNvFkFefAYIJLVr4nqlfQ1oV2g9DpmgK3r_gFtfmcPSVMlggvr8BL-Gx5xBkeRM8EZNbgbkPNVGACnGO34yPTKyXZEWFJMNvVb_vE-f5oJD9EXTipNT_kchuvevCCOursfVz_z3Gk-A_X-fM1it3awVX4I-BNdcq8llR6_7zzM6ASV0_AQr4LDjGE1eWG2DZpQ8_ycmWUA99xM_V-chMfzqq65EmemBmWE2P8comPd_ZiHJ3GYbrRADvKNnE63yhBBbG2DJDVHd_YFFC0v8uOqzpdP1Jumbvk-IOTLtb2clCpRKzyN1lA8yoQPMcaN8Nm7YOan1ClCkOqIDPikBaVzJD5c2BpmJpdKcGAvj1Pjm3AQZB6dbLKx7ORZYHPBxZiKy1Tibz378DXadG53nmbKqZHo0FEYDQWKHYt6agj5ChMRxy-z246Q4660C3VE9GVpeteOeJ-KMaQ0sRBzPTdS8CCPq-fsnOHBETOg==&vjs=3&p=1&sk=&fvj=0
## 4 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A91Vduf5Jnd4MOZNnJpSBueM4-3evPFU-42Fh3zggbfpnitKphXPj6xer_fRzI4FNiNTTeaNvFkFefAYIJLVr4nqlfQ1oV2g9DpmgK3r_gFtfmcPSVMlggvr8BL-Gx5xBkeRM8EZNbgbkPNVGACnGO34yPTKyXZEWFJMNvVb_vE-f5oJD9EXTipNT_kchuvevCCOursfVz_z3Gk-A_X-fM1it3awVX4I-BNdcq8llR6_7zzM6ASV0_AQr4LDjGE1eWG2DZpQ8_ycmWUA99xM_V-chMfzqq65EmemBmWE2P8comPd_ZiHJ3GYbrRADvKNnE63yhBBbG2DJDVHd_YFFC0v8uOqzpdP1Jumbvk-IOTLtb2clCpRKzyN1lA8yoQPMcaN8Nm7YOan1ClCkOqIDPikBaVzJD5c2BpmJpdKcGAvj1Pjm3AQZB6dbLKx7ORZYHPBxZiKy1Tibz378DXadG53nmbKqZHo0FEYDQWKHYt6agj5ChMRxy-z246Q4660C3VE9GVpeteOeJ-KMaQ0sRBzPTdS8CCPq-fsnOHBETOg==&vjs=3&p=1&sk=&fvj=0
## 5 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A91Vduf5Jnd4MOZNnJpSBueM4-3evPFU-42Fh3zggbfpnitKphXPj6xer_fRzI4FNiNTTeaNvFkFefAYIJLVr4nqlfQ1oV2g9DpmgK3r_gFtfmcPSVMlggvr8BL-Gx5xBkeRM8EZNbgbkPNVGACnGO34yPTKyXZEWFJMNvVb_vE-f5oJD9EXTipNT_kchuvevCCOursfVz_z3Gk-A_X-fM1it3awVX4I-BNdcq8llR6_7zzM6ASV0_AQr4LDjGE1eWG2DZpQ8_ycmWUA99xM_V-chMfzqq65EmemBmWE2P8comPd_ZiHJ3GYbrRADvKNnE63yhBBbG2DJDVHd_YFFC0v8uOqzpdP1Jumbvk-IOTLtb2clCpRKzyN1lA8yoQPMcaN8Nm7YOan1ClCkOqIDPikBaVzJD5c2BpmJpdKcGAvj1Pjm3AQZB6dbLKx7ORZYHPBxZiKy1Tibz378DXadG53nmbKqZHo0FEYDQWKHYt6agj5ChMRxy-z246Q4660C3VE9GVpeteOeJ-KMaQ0sRBzPTdS8CCPq-fsnOHBETOg==&vjs=3&p=1&sk=&fvj=0
## 6 https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A91Vduf5Jnd4MOZNnJpSBueM4-3evPFU-42Fh3zggbfpnitKphXPj6xer_fRzI4FNiNTTeaNvFkFefAYIJLVr4nqlfQ1oV2g9DpmgK3r_gFtfmcPSVMlggvr8BL-Gx5xBkeRM8EZNbgbkPNVGACnGO34yPTKyXZEWFJMNvVb_vE-f5oJD9EXTipNT_kchuvevCCOursfVz_z3Gk-A_X-fM1it3awVX4I-BNdcq8llR6_7zzM6ASV0_AQr4LDjGE1eWG2DZpQ8_ycmWUA99xM_V-chMfzqq65EmemBmWE2P8comPd_ZiHJ3GYbrRADvKNnE63yhBBbG2DJDVHd_YFFC0v8uOqzpdP1Jumbvk-IOTLtb2clCpRKzyN1lA8yoQPMcaN8Nm7YOan1ClCkOqIDPikBaVzJD5c2BpmJpdKcGAvj1Pjm3AQZB6dbLKx7ORZYHPBxZiKy1Tibz378DXadG53nmbKqZHo0FEYDQWKHYt6agj5ChMRxy-z246Q4660C3VE9GVpeteOeJ-KMaQ0sRBzPTdS8CCPq-fsnOHBETOg==&vjs=3&p=1&sk=&fvj=0
# Select relevent columns for database table insert
keywords_csv_df <- keywords_csv_df %>% 
  select(ID, Keyword)
head(keywords_csv_df)
##   ID            Keyword
## 1 18               PHP 
## 2 17                JS 
## 3 16              Ruby 
## 4 15        Statistics 
## 5 14       Probability 
## 6 13  Machine Learning

Database2: continue with the ONET (Labor Dept Data) ONET Characteristics table is the main table from ONET database, then Salary table, then Skills (Soft) table

characteristics <- read.csv ("characteristics.csv")

characteristics <- characteristics %>% 
  select (Title,SOC ,   OccupationType,     TypicalEntryLvlEduc,    
           PreEmplExperience,   PostEmplTraining)
head(characteristics)
##                                          Title  SOC OccupationType
## 1 Computer and information research scientists 1111      Line item
## 2                    Computer systems analysts 1121      Line item
## 3                Information security analysts 1122      Line item
## 4                         Computer programmers 1131      Line item
## 5            Software developers, applications 1132      Line item
## 6        Software developers, systems software 1133      Line item
##   TypicalEntryLvlEduc PreEmplExperience PostEmplTraining
## 1     Master's degree              None             None
## 2   Bachelor's degree              None             None
## 3   Bachelor's degree Less than 5 years             None
## 4   Bachelor's degree              None             None
## 5   Bachelor's degree              None             None
## 6   Bachelor's degree              None             None
salary <- read.csv ("salary.csv")
salary <- salary %>% 
  select (  SOC,    No.Employees,   RSE,    Mean.Hourly.Wage,   Mean.Annual.Wage,   Wage.RSE)
head(salary)
##    SOC No.Employees   RSE Mean.Hourly.Wage Mean.Annual.Wage Wage.RSE
## 1 1133      394,590 1.3 %           $53.74           111780    0.4 %
## 2 1134      125,890 1.3 %           $35.63            74110    0.6 %
## 3 1141      113,690 1.1 %           $42.81            89050    0.3 %
## 4 1142      375,040 0.8 %           $41.51            86340    0.3 %
## 5 1143      157,830 1.6 %           $51.86           107870    0.5 %
## 6 1151      613,780 0.7 %           $26.03            54150    0.4 %

The linkage bw ONET and INDEED data is the technical skills ## Tech Skill format 1: wide format, each skill is spreaded

skills_sum <- read.csv ('skills_sum.csv')
skills_sum %>% 
  select (jobNoNA,  jobcodeNoNA,        C,      Django,             JavaScript,
          MongoDB,  MySQL,      NoSQL,          PostgreSQL, 
          R,    Ruby          ,SAS,             STATISTICA,           SuperANOVA,   Tableau )
##                                                jobNoNA jobcodeNoNA C
## 1                                     Biostatisticians  15-2041.01 0
## 2                                             Acturies  15-2011.00 0
## 3         Computer and Information Research Scientists  15-1111.00 1
## 4                                   Data Administrator  15-1141.00 1
## 5                                Clinical Data Manager  15-2041.02 0
## 6                    Computer User Support Specialists  15-1151.00 1
## 7                   Compensation and Benefits managers  11-3111.00 0
## 8   Market Research Analysts and Marketing Specialists  13-1161.00 0
## 9                                             Auditors  13-2011.02 0
## 10                                  Financial Analysts  13-2051.00 0
## 11                         Risk Management Specialists  13-2099.02 0
## 12                                      Mathematicians  15-2021.00 1
## 13                        Operations Research Analysts  15-2031.00 1
## 14                                        Statistician  15-2041.00 0
## 15                                          Economists  19-3011.00 0
## 16                                  Survery Rsearchers  19-3022.00 0
## 17                     Regulatory Affairs Speacialists  13-1041.07 0
## 18 Compensation, Benefits and Job Analysis Specialists  13-1141.00 0
## 19                                     Credit Analysts  13-2041.00 0
## 20                              Insurance Underwriters  13-2053.00 0
## 21                   Social Science Research Assistant  13-2053.00 0
## 22                            Computer System Analysts  15-1121.00 0
## 23                                 Software Developers  15-1133.00 0
## 24                Computer Systems Engineers Archtects  15-1199.02 1
## 25         Remote Sensing Scientists and Technologists  19-2099.01 1
## 26            Computer Science Teachers, Postsecondary  25-1021.00 1
## 27 Audio_Visual and Multimedia Collections Specialists  25-9011.00 0
## 28              Adudio and Video Equipment Technicians  27-4011.00 0
## 29                               Broadcast Technicians  27-4012.00 0
## 30                                  Computer Operators  43-9011.00 0
## 31                          Computer, Automated Teller  49-2011.00 0
## 32                                 Computer Programmer  15-1131.00 1
## 33                                      Web Developers  15-1134.00 1
##    Django JavaScript MongoDB MySQL NoSQL PostgreSQL R Ruby SAS STATISTICA
## 1       0          0       0     1     0          0 1    0   1          1
## 2       0          0       0     0     0          0 1    0   1          0
## 3       1          1       1     1     1          1 1    1   1          0
## 4       1          1       1     1     1          1 1    1   1          0
## 5       0          0       0     0     0          0 0    0   1          0
## 6       0          1       0     1     0          1 0    1   0          0
## 7       0          0       0     0     0          0 0    0   0          0
## 8       0          1       0     0     0          0 1    0   0          0
## 9       0          0       0     0     0          0 1    0   1          0
## 10      0          0       0     0     0          0 1    0   1          0
## 11      0          0       0     0     0          0 1    1   1          0
## 12      0          1       0     1     0          0 1    0   1          0
## 13      0          0       0     1     0          0 1    0   1          0
## 14      0          0       0     0     0          0 1    0   1          1
## 15      0          0       0     1     0          0 0    0   1          0
## 16      0          1       0     0     0          0 0    0   1          0
## 17      0          0       0     0     0          0 0    0   0          0
## 18      0          0       0     0     0          0 0    0   1          0
## 19      0          0       0     0     0          0 0    0   1          0
## 20      0          0       0     0     0          0 0    0   0          0
## 21      0          0       0     0     0          0 0    0   1          0
## 22      0          0       1     1     0          1 0    0   1          0
## 23      0          1       1     0     1          1 0    0   1          0
## 24      0          0       1     0     0          1 0    0   1          0
## 25      0          0       0     0     0          0 0    0   0          0
## 26      0          0       0     0     0          0 0    0   0          0
## 27      0          0       0     0     0          0 0    0   0          0
## 28      0          0       0     0     0          0 0    0   0          0
## 29      0          0       0     0     0          0 0    0   0          0
## 30      0          0       0     0     0          0 0    0   0          0
## 31      0          0       0     0     0          0 0    0   0          0
## 32      0          0       1     0     0          1 0    0   1          0
## 33      0          1       1     0     1          1 0    0   0          0
##    SuperANOVA Tableau
## 1           0       0
## 2           0       0
## 3           0       1
## 4           0       1
## 5           0       0
## 6           0       1
## 7           0       0
## 8           0       0
## 9           0       1
## 10          0       1
## 11          0       1
## 12          0       0
## 13          0       1
## 14          1       1
## 15          0       0
## 16          0       1
## 17          0       0
## 18          0       0
## 19          0       0
## 20          0       0
## 21          0       0
## 22          0       0
## 23          0       0
## 24          0       1
## 25          0       0
## 26          0       0
## 27          0       0
## 28          0       0
## 29          0       0
## 30          0       0
## 31          0       0
## 32          0       1
## 33          0       1
head(skills_sum)
##   X                                      jobNoNA jobcodeNoNA
## 1 1                             Biostatisticians  15-2041.01
## 2 2                                     Acturies  15-2011.00
## 3 3 Computer and Information Research Scientists  15-1111.00
## 4 4                           Data Administrator  15-1141.00
## 5 5                        Clinical Data Manager  15-2041.02
## 6 6            Computer User Support Specialists  15-1151.00
##   Apache.Cassandra Apache.Hadoop Apache.Hive Apache.Pig C C.. C...
## 1                0             0           0          0 0   0    0
## 2                0             0           0          0 0   0    0
## 3                1             1           1          1 1   0    0
## 4                1             1           1          1 1   0    0
## 5                0             0           0          0 0   0    0
## 6                0             0           0          0 1   0    0
##   Computational.statistics.software Django Hypertext.markup.language.HTML
## 1                                 0      0                              0
## 2                                 0      0                              0
## 3                                 1      1                              0
## 4                                 0      1                              1
## 5                                 0      0                              0
## 6                                 0      0                              1
##   IBM.SPSS.Statistics JavaScript JavaScript.Object.Notation.JSON
## 1                   1          0                               0
## 2                   1          0                               0
## 3                   1          1                               0
## 4                   1          1                               1
## 5                   1          0                               0
## 6                   0          1                               0
##   Micosoft.SQL.Server.Analysis.Services.SSAS Microsoft.SQL.Server
## 1                                          0                    1
## 2                                          0                    0
## 3                                          0                    1
## 4                                          1                    1
## 5                                          0                    1
## 6                                          0                    1
##   Microsoft.SQL.Server.Integration.Services.SSIS MongoDB MySQL
## 1                                              0       0     1
## 2                                              0       0     0
## 3                                              0       1     1
## 4                                              1       1     1
## 5                                              0       0     0
## 6                                              0       0     1
##   NCR.Teradata.Warehouse.Miner NeuroSolutions.for.MatLab NoSQL Oracle.Java
## 1                            0                         0     0           1
## 2                            0                         0     0           0
## 3                            0                         0     1           1
## 4                            0                         0     1           1
## 5                            0                         0     0           1
## 6                            0                         0     0           1
##   Oracle.JavaServer.Pages.JSP Oracle.PL.SQL Oracle.SQL.Loader
## 1                           0             0                 0
## 2                           0             0                 0
## 3                           0             0                 0
## 4                           1             1                 1
## 5                           0             0                 1
## 6                           1             1                 0
##   Oracle.SQL.Plus PostgreSQL
## 1               0          0
## 2               0          0
## 3               0          1
## 4               1          1
## 5               0          0
## 6               0          1
##   Python............................................................... R
## 1                                                                     0 1
## 2                                                                     0 1
## 3                                                                     0 1
## 4                                                                     0 1
## 5                                                                     0 0
## 6                                                                     0 0
##   Redgate.SQL.Server Ruby Ruby.on.Rails SAS SAS.Enterprise.Miner SAS.JMP
## 1                  0    0             0   1                    0       0
## 2                  0    0             0   1                    0       0
## 3                  0    1             0   1                    0       0
## 4                  1    1             1   1                    0       0
## 5                  0    0             0   1                    0       1
## 6                  0    1             0   0                    0       0
##   SAS.CONNECT STATISTICA Statistical.software Statistical.Solutions.BMDP
## 1           0          1                    0                          0
## 2           0          0                    1                          0
## 3           0          0                    0                          0
## 4           0          0                    0                          0
## 5           0          0                    0                          0
## 6           0          0                    0                          0
##   Structured.query.language.SQL
## 1                             1
## 2                             1
## 3                             1
## 4                             1
## 5                             0
## 6                             1
##   Sun.Microsystems.Java.2.Platform.Enterprise.Edition.J2EE SuperANOVA
## 1                                                        0          0
## 2                                                        0          0
## 3                                                        1          0
## 4                                                        0          0
## 5                                                        0          0
## 6                                                        0          0
##   Tableau Teradata.Enterprise.Data.Warehouse The.MathWorks.MATLAB
## 1       0                                  0                    1
## 2       0                                  0                    0
## 3       1                                  0                    1
## 4       1                                  0                    1
## 5       0                                  0                    0
## 6       1                                  0                    0
##   UNISTAT.Statistical.Package
## 1                           0
## 2                           0
## 3                           0
## 4                           0
## 5                           0
## 6                           0
#str(skills_sum)

Connect to the database ## we create a temporary SQLite database called Proj3_67 (changed name to mydb)

mydb <- dbConnect(RSQLite::SQLite(), "Proj3_607.db")
# List out how many tables are in this object
 # dbListTables(mydb)
 
# TEST, to get query from this database (ONET Characteristic table only for now)
dbGetQuery  (mydb,"Select* FROM C") 
##                                           Title  SOC OccupationType
## 1  Computer and information research scientists 1111      Line item
## 2                     Computer systems analysts 1121      Line item
## 3                 Information security analysts 1122      Line item
## 4                          Computer programmers 1131      Line item
## 5             Software developers, applications 1132      Line item
## 6         Software developers, systems software 1133      Line item
## 7                                Web developers 1134      Line item
## 8                       Database administrators 1141      Line item
## 9   Network and computer systems administrators 1142      Line item
## 10                  Computer network architects 1143      Line item
## 11            Computer user support specialists 1151      Line item
## 12         Computer network support specialists 1152      Line item
## 13              Computer occupations, all other 1199      Line item
## 14                                    Actuaries 2011      Line item
## 15                               Mathematicians 2021      Line item
## 16                 Operations research analysts 2031      Line item
## 17                                Statisticians 2041      Line item
## 18                     Mathematical technicians 2091      Line item
## 19  Mathematical science occupations, all other 2099      Line item
##        TypicalEntryLvlEduc PreEmplExperience              PostEmplTraining
## 1          Master's degree              None                          None
## 2        Bachelor's degree              None                          None
## 3        Bachelor's degree Less than 5 years                          None
## 4        Bachelor's degree              None                          None
## 5        Bachelor's degree              None                          None
## 6        Bachelor's degree              None                          None
## 7       Associate's degree              None                          None
## 8        Bachelor's degree              None                          None
## 9        Bachelor's degree              None                          None
## 10       Bachelor's degree   5 years or more                          None
## 11 Some college, no degree              None                          None
## 12      Associate's degree              None                          None
## 13       Bachelor's degree              None                          None
## 14       Bachelor's degree              None Long-term on-the-job training
## 15         Master's degree              None                          None
## 16       Bachelor's degree              None                          None
## 17         Master's degree              None                          None
## 18       Bachelor's degree              None                          None
## 19       Bachelor's degree              None                          None

Relational Database 1st set # Two relational Tables within Indeed Web Scraping, linked by Primary Key and Foreigh Key

# Create/Populate tables with data from the dataframes
dbWriteTable(mydb, "webscrapes", Keyword_Search_csv_df, overwrite = TRUE)
dbWriteTable(mydb, "keywords", keywords_csv_df, overwrite = TRUE)

# Join the two tables using the primary and foreing key(relational database)
join_tb_indeed <- "select keywords.Keyword, webscrapes.Company, webscrapes.Web_scrape_date,webscrapes.state 
from webscrapes 
inner join keywords
on keywords.ID = webscrapes.Data_Scientist_Skills_ID"

head(dbGetQuery(mydb, join_tb_indeed))
##     Keyword           Company Web_scrape_date state
## 1   Hadoop   Quest Groups LLC      2018-10-30    NY
## 2   Python           Deloitte      2018-10-30    NY
## 3   Hadoop  Rockstar New York      2018-10-30    NY
## 4  Tableau  Rockstar New York      2018-10-30    NY
## 5      SQL  Rockstar New York      2018-10-30    NY
## 6   Python  Rockstar New York      2018-10-30    NY
tail(dbGetQuery(mydb, join_tb_indeed))
##                 Keyword         Company Web_scrape_date state
## 1225               SQL  Stairs Alliance      2018-10-30    CA
## 1226  Machine Learning       Amazon.com      2018-10-30    CA
## 1227           SparkML       Amazon.com      2018-10-30    CA
## 1228                AI       Amazon.com      2018-10-30    CA
## 1229            Python       Amazon.com      2018-10-30    CA
## 1230               AWS       Amazon.com      2018-10-30    CA

Relational Database 2nd set:
#Two relational Tables within ONET (Labor dept) Web Scraping, linked by Primary Key and Foreigh Key

# ONET Characteristics table is the main table from ONET database,
dbWriteTable(mydb,"C", characteristics, overwrite = TRUE)  
dbWriteTable(mydb,"salary", salary, overwrite = TRUE)

join_tb_onetsalary <- "select C.title, C.SOC,   C.OccupationType,   C.TypicalEntryLvlEduc,  
           C.PreEmplExperience, C.PostEmplTraining,
            salary.SOC,  RSE
from C
inner join salary
on C.SOC = salary.SOC"

head (dbGetQuery(mydb,join_tb_onetsalary))
##                                         Title  SOC OccupationType
## 1       Software developers, systems software 1133      Line item
## 2                              Web developers 1134      Line item
## 3                     Database administrators 1141      Line item
## 4 Network and computer systems administrators 1142      Line item
## 5                 Computer network architects 1143      Line item
## 6           Computer user support specialists 1151      Line item
##       TypicalEntryLvlEduc PreEmplExperience PostEmplTraining  SOC   RSE
## 1       Bachelor's degree              None             None 1133 1.3 %
## 2      Associate's degree              None             None 1134 1.3 %
## 3       Bachelor's degree              None             None 1141 1.1 %
## 4       Bachelor's degree              None             None 1142 0.8 %
## 5       Bachelor's degree   5 years or more             None 1143 1.6 %
## 6 Some college, no degree              None             None 1151 0.7 %

Relational Database 3rd set: Link INDEED web scraping and Labor Dept ONET webscraping

# Linkage between those two sets: using the Technical Skills 
# Tech Skills linked with ONET Database
join_tb_ONETTechSkl <- 
  "select C.title, C.SOC,   C.OccupationType,   C.TypicalEntryLvlEduc,  
           C.PreEmplExperience, C.PostEmplTraining,
            TechSkl.job, TechSkl.jobcode,TechSkl.Example
from C
inner join  TechSkl
on C.Title = TechSkl.job"
head (dbGetQuery(mydb,join_tb_ONETTechSkl))
## [1] Title               SOC                 OccupationType     
## [4] TypicalEntryLvlEduc PreEmplExperience   PostEmplTraining   
## [7] job                 jobcode             Example            
## <0 rows> (or 0-length row.names)

Tech Skills linked with Indeed Database

join_tb_IndeedTechSkl <- "select keywords.Keyword, 
 TechSkl.job, TechSkl.jobcode,TechSkl.Example
from TechSkl
inner join keywords
on keywords.Keyword = TechSkl.Example"
head (dbGetQuery(mydb,join_tb_IndeedTechSkl))
## [1] Keyword job     jobcode Example
## <0 rows> (or 0-length row.names)

List all the relational tables within the SQlite database

dbListTables(mydb)
## [1] "C"          "TechSkl"    "keywords"   "salary"     "webscrapes"
# all tables are displayed here, 6 tables (all relational in total)

List the column fields within each individual tables within the SQLite Database

dbListFields(mydb,"C")
## [1] "Title"               "SOC"                 "OccupationType"     
## [4] "TypicalEntryLvlEduc" "PreEmplExperience"   "PostEmplTraining"
# dbListFields(mydb,"SftSkl")
# dbListFields(mydb,"TechSkillswOccupation")
dbListFields(mydb,"salary")
## [1] "SOC"              "No.Employees"     "RSE"             
## [4] "Mean.Hourly.Wage" "Mean.Annual.Wage" "Wage.RSE"
dbListFields(mydb,"TechSkl")
## [1] "X"        "Section"  "Category" "Example"  "job"      "jobcode"
dbListFields(mydb,"keywords")
## [1] "ID"      "Keyword"
dbListFields(mydb,"webscrapes")
## [1] "Data_Scientist_Skills_ID" "Company"                 
## [3] "Web_scrape_date"          "state"                   
## [5] "job_post_link"
setwd('..')