MYSQL Database integration with R

Additional packages installed

Please note below additional packages installed. The formattable package is used to transform vectors and data frames into more readable and impactful tabular formats. Package ‘conflicted’ solved an issue with the rename func as it pertains to knit/html.

install.packages(“formattable”) install.packages(“conflicted”)

library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(RMySQL)
## Loading required package: DBI
library(formattable)
##DBconnect was utilized in this assignment to authenticate/pull in the flight delay information into R
my.database10 = dbConnect(MySQL(), user='root', password = 'Password1', dbname='jobs', host='localhost')
dbListTables(my.database10)
## [1] "general_skills" "indeed_jobs"
generalskills_list <- fetch(dbSendQuery(my.database10, "SELECT * FROM general_skills"))
dim(generalskills_list)
## [1] 30  5
colnames(generalskills_list)
## [1] "Keyword"     "LinkedIn"    "Indeed"      "SimplyHired" "Monster"
summary(generalskills_list)
##    Keyword            LinkedIn            Indeed          SimplyHired       
##  Length:30          Length:30          Length:30          Length:30         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Monster         
##  Length:30         
##  Class :character  
##  Mode  :character
##DBconnect was utilized in this assignment to authenticate/pull in the flight delay information into R
my.database10 = dbConnect(MySQL(), user='root', password = 'Password1', dbname='jobs', host='localhost')
dbListTables(my.database10)
## [1] "general_skills" "indeed_jobs"
indeed_list <- fetch(dbSendQuery(my.database10, "SELECT * FROM indeed_jobs"))
dim(indeed_list)
## [1] 500  43
colnames(indeed_list)
##  [1] "MyUnknownColumn"                  "Job_Title"                       
##  [3] "Link"                             "Queried_Salary"                  
##  [5] "Job_Type"                         "Skill"                           
##  [7] "No_of_Skills"                     "Company"                         
##  [9] "No_of_Reviews"                    "No_of_Stars"                     
## [11] "Date_Since_Posted"                "Description"                     
## [13] "Location"                         "Company_Revenue"                 
## [15] "Company_Employees"                "Company_Industry"                
## [17] "python"                           "sql"                             
## [19] "machine learning"                 "r"                               
## [21] "hadoop"                           "tableau"                         
## [23] "sas"                              "spark"                           
## [25] "java"                             "Others"                          
## [27] "CA"                               "NY"                              
## [29] "VA"                               "TX"                              
## [31] "MA"                               "IL"                              
## [33] "WA"                               "MD"                              
## [35] "DC"                               "NC"                              
## [37] "Other_states"                     "Consulting and Business Services"
## [39] "Internet and Software"            "Banks and Financial Services"    
## [41] "Health Care"                      "Insurance"                       
## [43] "Other_industries"

Experimented with the tibble func below for the first time. Tibbles have a refined print method that shows only the first 10 rows, and all the columns that fit on screen. This makes it much easier to work with large data.

library(tidyverse)
library(dplyr)


indeed_listinfo <- as_tibble(indeed_list)
indeed_listinfo
## # A tibble: 500 x 43
##    MyUnknownColumn Job_Title Link  Queried_Salary Job_Type Skill No_of_Skills
##    <chr>           <chr>     <chr> <chr>          <chr>    <chr>        <int>
##  1 1               Data Sci~ http~ <80000         data_sc~ ['Ma~            5
##  2 2               Data Sci~ http~ <80000         data_sc~ ['Da~            9
##  3 3               Graduate~ http~ <80000         data_sc~ ['Ce~            1
##  4 4               Data Sci~ http~ <80000         data_sc~ ['St~            7
##  5 5               Data Sci~ http~ <80000         data_sc~ ['AI~            6
##  6 6               Geospati~ http~ <80000         data_sc~ ['St~           10
##  7 7               Data Sci~ http~ <80000         data_sc~ ['Ma~            3
##  8 8               Bioinfor~ http~ <80000         data_sc~ ['Li~            4
##  9 10              Data Sci~ http~ <80000         data_sc~ ['Ma~            8
## 10 11              Data Sci~ http~ <80000         data_sc~ ['Ma~            8
## # ... with 490 more rows, and 36 more variables: Company <chr>,
## #   No_of_Reviews <chr>, No_of_Stars <chr>, Date_Since_Posted <int>,
## #   Description <chr>, Location <chr>, Company_Revenue <chr>,
## #   Company_Employees <chr>, Company_Industry <chr>, python <chr>, sql <chr>,
## #   `machine learning` <chr>, r <chr>, hadoop <chr>, tableau <chr>, sas <chr>,
## #   spark <chr>, java <int>, Others <int>, CA <int>, NY <int>, VA <int>,
## #   TX <int>, MA <int>, IL <int>, WA <int>, MD <int>, DC <int>, NC <int>,
## #   Other_states <int>, `Consulting and Business Services` <int>, `Internet and
## #   Software` <int>, `Banks and Financial Services` <int>, `Health Care` <int>,
## #   Insurance <int>, Other_industries <int>
generalskills_listinfo <- as_tibble(generalskills_list)
generalskills_listinfo
## # A tibble: 30 x 5
##    Keyword          LinkedIn Indeed SimplyHired Monster
##    <chr>            <chr>    <chr>  <chr>       <chr>  
##  1 machine learning 5,701    3,439  2,561       2,340  
##  2 analysis         5,168    3,500  2,668       3,306  
##  3 statistics       4,893    2,992  2,308       2,399  
##  4 computer science 4,517    2,739  2,093       1,900  
##  5 communication    3,404    2,344  1,791       2,053  
##  6 mathematics      2,605    1,961  1,497       1,815  
##  7 visualization    1,879    1,413  1,153       1,207  
##  8 AI composite     1,568    1,125  811         687    
##  9 deep learning    1,310    979    675         606    
## 10 NLP composite    1,212    910    660         582    
## # ... with 20 more rows
indeed_listinfo2 <- data.frame(indeed_listinfo$MyUnknownColumn,
                               indeed_listinfo$Job_Title,
                               indeed_listinfo$Job_Type,
                               indeed_listinfo$Skill,
                               indeed_listinfo$No_of_Skills,
                               indeed_listinfo$Description,
                               indeed_listinfo$Location,
                               indeed_listinfo$Company_Industry,
                               indeed_listinfo$python,
                               indeed_listinfo$sql,
                               indeed_listinfo$r,
                               indeed_listinfo$hadoop,
                               indeed_listinfo$tableau,
                               indeed_listinfo$sas,
                               indeed_listinfo$spark,
                               indeed_listinfo$java,
                               indeed_listinfo$Others,
                               indeed_listinfo$CA,
                               indeed_listinfo$NY,
                               indeed_listinfo$VA,
                               indeed_listinfo$TX,
                               indeed_listinfo$MA,
                               indeed_listinfo$IL,
                               indeed_listinfo$WA,
                               indeed_listinfo$MD,
                               indeed_listinfo$DC,
                               indeed_listinfo$NC,
                               indeed_listinfo$Other_states,
                               indeed_listinfo$`Consulting and Business Services`,
                               indeed_listinfo$`Internet and Software`,
                               indeed_listinfo$`Banks and Financial Services`,
                               indeed_listinfo$`Health Care`,
                               indeed_listinfo$Insurance,
                               indeed_listinfo$Other_industries)

(colnames(indeed_listinfo2))
##  [1] "indeed_listinfo.MyUnknownColumn"                   
##  [2] "indeed_listinfo.Job_Title"                         
##  [3] "indeed_listinfo.Job_Type"                          
##  [4] "indeed_listinfo.Skill"                             
##  [5] "indeed_listinfo.No_of_Skills"                      
##  [6] "indeed_listinfo.Description"                       
##  [7] "indeed_listinfo.Location"                          
##  [8] "indeed_listinfo.Company_Industry"                  
##  [9] "indeed_listinfo.python"                            
## [10] "indeed_listinfo.sql"                               
## [11] "indeed_listinfo.r"                                 
## [12] "indeed_listinfo.hadoop"                            
## [13] "indeed_listinfo.tableau"                           
## [14] "indeed_listinfo.sas"                               
## [15] "indeed_listinfo.spark"                             
## [16] "indeed_listinfo.java"                              
## [17] "indeed_listinfo.Others"                            
## [18] "indeed_listinfo.CA"                                
## [19] "indeed_listinfo.NY"                                
## [20] "indeed_listinfo.VA"                                
## [21] "indeed_listinfo.TX"                                
## [22] "indeed_listinfo.MA"                                
## [23] "indeed_listinfo.IL"                                
## [24] "indeed_listinfo.WA"                                
## [25] "indeed_listinfo.MD"                                
## [26] "indeed_listinfo.DC"                                
## [27] "indeed_listinfo.NC"                                
## [28] "indeed_listinfo.Other_states"                      
## [29] "indeed_listinfo..Consulting.and.Business.Services."
## [30] "indeed_listinfo..Internet.and.Software."           
## [31] "indeed_listinfo..Banks.and.Financial.Services."    
## [32] "indeed_listinfo..Health.Care."                     
## [33] "indeed_listinfo.Insurance"                         
## [34] "indeed_listinfo.Other_industries"
library(dplyr)

indeed_listinfo3 <- dplyr::rename(indeed_listinfo2,
          "ID"="indeed_listinfo.MyUnknownColumn",
          "Title" = "indeed_listinfo.Job_Title",
          "Type" = "indeed_listinfo.Job_Type",
          "Skill" = "indeed_listinfo.Skill",
          "Skills" = "indeed_listinfo.No_of_Skills",
          "Description" = "indeed_listinfo.Description",
          "Location" = "indeed_listinfo.Location",
          "Industry" = "indeed_listinfo.Company_Industry",
          "Python" = "indeed_listinfo.python",
          "SQL" = "indeed_listinfo.sql",
          "R" = "indeed_listinfo.r",
          "Hadoop" = "indeed_listinfo.hadoop",
          "Tableau" = "indeed_listinfo.tableau",
          "SAS" = "indeed_listinfo.sas",
          "Spark" = "indeed_listinfo.spark",
          "Java" = "indeed_listinfo.java",
          "Others" = "indeed_listinfo.Others",
          "CA" = "indeed_listinfo.CA",
          "NY" = "indeed_listinfo.NY",
          "VA" = "indeed_listinfo.VA",
          "TX" = "indeed_listinfo.TX",
          "MA" = "indeed_listinfo.MA",
          "IL" = "indeed_listinfo.IL",
          "WA" = "indeed_listinfo.WA",
          "MD" = "indeed_listinfo.MD",
          "DC" = "indeed_listinfo.DC",
          "NC" = "indeed_listinfo.NC",
          "Other" = "indeed_listinfo.Other_states",
          "Consulting" = "indeed_listinfo..Consulting.and.Business.Services.",
          "Internet" = "indeed_listinfo..Internet.and.Software.",
          "Financial" = "indeed_listinfo..Banks.and.Financial.Services.",
          "Healthcare" = "indeed_listinfo..Health.Care.",
          "Insurance" = "indeed_listinfo.Insurance",
          "Varied" = "indeed_listinfo.Other_industries"

)



table(indeed_listinfo3$Python)
## 
##   0   1 
## 165 335
table(indeed_listinfo3$SQL)
## 
##   0   1 
## 253 247
table(indeed_listinfo3$R)
## 
##   0   1 
## 206 294
table(indeed_listinfo3$Hadoop)
## 
##   0   1 
## 419  81
table(indeed_listinfo3$Tableau)
## 
##   0   1 
## 429  71
table(indeed_listinfo3$SAS)
## 
##   0   1 
## 388 112
table(indeed_listinfo3$Spark)
## 
##   0   1 
## 427  73
table(indeed_listinfo3$Java)
## 
##   0   1 
## 408  92
table(indeed_listinfo3$Others)
## 
##   0   1 
##  43 457
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
setDT(indeed_listinfo3)[, .N, by=Python]
##    Python   N
## 1:      1 335
## 2:      0 165
setDT(indeed_listinfo3)[, .N, by=SQL]
##    SQL   N
## 1:   1 247
## 2:   0 253
setDT(indeed_listinfo3)[, .N, by=R]
##    R   N
## 1: 1 294
## 2: 0 206
setDT(indeed_listinfo3)[, .N, by=Hadoop]
##    Hadoop   N
## 1:      0 419
## 2:      1  81
setDT(indeed_listinfo3)[, .N, by=Tableau]
##    Tableau   N
## 1:       0 429
## 2:       1  71
setDT(indeed_listinfo3)[, .N, by=SAS]
##    SAS   N
## 1:   1 112
## 2:   0 388
setDT(indeed_listinfo3)[, .N, by=Spark]
##    Spark   N
## 1:     0 427
## 2:     1  73
setDT(indeed_listinfo3)[, .N, by=Python]
##    Python   N
## 1:      1 335
## 2:      0 165
setDT(indeed_listinfo3)[, .N, by=Java]
##    Java   N
## 1:    0 408
## 2:    1  92