Project 3

MYSQL Database integration with R

Additional packages installed

Please note below additional packages installed. The formattable package is used to transform vectors and data frames into more readable and impactful tabular formats. Package ‘conflicted’ solved an issue with the rename func as it pertains to knit/html.

install.packages(“formattable”) install.packages(“conflicted”)

library(tidyverse)

## -- Attaching packages ---------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(RMySQL)

## Loading required package: DBI

library(formattable)

##DBconnect was utilized in this assignment to authenticate/pull in the flight delay information into R
my.database10 = dbConnect(MySQL(), user='root', password = 'Password1', dbname='jobs', host='localhost')
dbListTables(my.database10)

## [1] "general_skills" "indeed_jobs"

generalskills_list <- fetch(dbSendQuery(my.database10, "SELECT * FROM general_skills"))
dim(generalskills_list)

## [1] 30  5

colnames(generalskills_list)

## [1] "Keyword"     "LinkedIn"    "Indeed"      "SimplyHired" "Monster"

summary(generalskills_list)

##    Keyword            LinkedIn            Indeed          SimplyHired       
##  Length:30          Length:30          Length:30          Length:30         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Monster         
##  Length:30         
##  Class :character  
##  Mode  :character

##DBconnect was utilized in this assignment to authenticate/pull in the flight delay information into R
my.database10 = dbConnect(MySQL(), user='root', password = 'Password1', dbname='jobs', host='localhost')
dbListTables(my.database10)

## [1] "general_skills" "indeed_jobs"

indeed_list <- fetch(dbSendQuery(my.database10, "SELECT * FROM indeed_jobs"))
dim(indeed_list)

## [1] 500  43

colnames(indeed_list)

##  [1] "MyUnknownColumn"                  "Job_Title"                       
##  [3] "Link"                             "Queried_Salary"                  
##  [5] "Job_Type"                         "Skill"                           
##  [7] "No_of_Skills"                     "Company"                         
##  [9] "No_of_Reviews"                    "No_of_Stars"                     
## [11] "Date_Since_Posted"                "Description"                     
## [13] "Location"                         "Company_Revenue"                 
## [15] "Company_Employees"                "Company_Industry"                
## [17] "python"                           "sql"                             
## [19] "machine learning"                 "r"                               
## [21] "hadoop"                           "tableau"                         
## [23] "sas"                              "spark"                           
## [25] "java"                             "Others"                          
## [27] "CA"                               "NY"                              
## [29] "VA"                               "TX"                              
## [31] "MA"                               "IL"                              
## [33] "WA"                               "MD"                              
## [35] "DC"                               "NC"                              
## [37] "Other_states"                     "Consulting and Business Services"
## [39] "Internet and Software"            "Banks and Financial Services"    
## [41] "Health Care"                      "Insurance"                       
## [43] "Other_industries"

Experimented with the tibble func below for the first time. Tibbles have a refined print method that shows only the first 10 rows, and all the columns that fit on screen. This makes it much easier to work with large data.

library(tidyverse)
library(dplyr)


indeed_listinfo <- as_tibble(indeed_list)
indeed_listinfo

## # A tibble: 500 x 43
##    MyUnknownColumn Job_Title Link  Queried_Salary Job_Type Skill No_of_Skills
##    <chr>           <chr>     <chr> <chr>          <chr>    <chr>        <int>
##  1 1               Data Sci~ http~ <80000         data_sc~ ['Ma~            5
##  2 2               Data Sci~ http~ <80000         data_sc~ ['Da~            9
##  3 3               Graduate~ http~ <80000         data_sc~ ['Ce~            1
##  4 4               Data Sci~ http~ <80000         data_sc~ ['St~            7
##  5 5               Data Sci~ http~ <80000         data_sc~ ['AI~            6
##  6 6               Geospati~ http~ <80000         data_sc~ ['St~           10
##  7 7               Data Sci~ http~ <80000         data_sc~ ['Ma~            3
##  8 8               Bioinfor~ http~ <80000         data_sc~ ['Li~            4
##  9 10              Data Sci~ http~ <80000         data_sc~ ['Ma~            8
## 10 11              Data Sci~ http~ <80000         data_sc~ ['Ma~            8
## # ... with 490 more rows, and 36 more variables: Company <chr>,
## #   No_of_Reviews <chr>, No_of_Stars <chr>, Date_Since_Posted <int>,
## #   Description <chr>, Location <chr>, Company_Revenue <chr>,
## #   Company_Employees <chr>, Company_Industry <chr>, python <chr>, sql <chr>,
## #   `machine learning` <chr>, r <chr>, hadoop <chr>, tableau <chr>, sas <chr>,
## #   spark <chr>, java <int>, Others <int>, CA <int>, NY <int>, VA <int>,
## #   TX <int>, MA <int>, IL <int>, WA <int>, MD <int>, DC <int>, NC <int>,
## #   Other_states <int>, `Consulting and Business Services` <int>, `Internet and
## #   Software` <int>, `Banks and Financial Services` <int>, `Health Care` <int>,
## #   Insurance <int>, Other_industries <int>

generalskills_listinfo <- as_tibble(generalskills_list)
generalskills_listinfo

## # A tibble: 30 x 5
##    Keyword          LinkedIn Indeed SimplyHired Monster
##    <chr>            <chr>    <chr>  <chr>       <chr>  
##  1 machine learning 5,701    3,439  2,561       2,340  
##  2 analysis         5,168    3,500  2,668       3,306  
##  3 statistics       4,893    2,992  2,308       2,399  
##  4 computer science 4,517    2,739  2,093       1,900  
##  5 communication    3,404    2,344  1,791       2,053  
##  6 mathematics      2,605    1,961  1,497       1,815  
##  7 visualization    1,879    1,413  1,153       1,207  
##  8 AI composite     1,568    1,125  811         687    
##  9 deep learning    1,310    979    675         606    
## 10 NLP composite    1,212    910    660         582    
## # ... with 20 more rows

indeed_listinfo2 <- data.frame(indeed_listinfo$MyUnknownColumn,
                               indeed_listinfo$Job_Title,
                               indeed_listinfo$Job_Type,
                               indeed_listinfo$Skill,
                               indeed_listinfo$No_of_Skills,
                               indeed_listinfo$Description,
                               indeed_listinfo$Location,
                               indeed_listinfo$Company_Industry,
                               indeed_listinfo$python,
                               indeed_listinfo$sql,
                               indeed_listinfo$r,
                               indeed_listinfo$hadoop,
                               indeed_listinfo$tableau,
                               indeed_listinfo$sas,
                               indeed_listinfo$spark,
                               indeed_listinfo$java,
                               indeed_listinfo$Others,
                               indeed_listinfo$CA,
                               indeed_listinfo$NY,
                               indeed_listinfo$VA,
                               indeed_listinfo$TX,
                               indeed_listinfo$MA,
                               indeed_listinfo$IL,
                               indeed_listinfo$WA,
                               indeed_listinfo$MD,
                               indeed_listinfo$DC,
                               indeed_listinfo$NC,
                               indeed_listinfo$Other_states,
                               indeed_listinfo$`Consulting and Business Services`,
                               indeed_listinfo$`Internet and Software`,
                               indeed_listinfo$`Banks and Financial Services`,
                               indeed_listinfo$`Health Care`,
                               indeed_listinfo$Insurance,
                               indeed_listinfo$Other_industries)

(colnames(indeed_listinfo2))

##  [1] "indeed_listinfo.MyUnknownColumn"                   
##  [2] "indeed_listinfo.Job_Title"                         
##  [3] "indeed_listinfo.Job_Type"                          
##  [4] "indeed_listinfo.Skill"                             
##  [5] "indeed_listinfo.No_of_Skills"                      
##  [6] "indeed_listinfo.Description"                       
##  [7] "indeed_listinfo.Location"                          
##  [8] "indeed_listinfo.Company_Industry"                  
##  [9] "indeed_listinfo.python"                            
## [10] "indeed_listinfo.sql"                               
## [11] "indeed_listinfo.r"                                 
## [12] "indeed_listinfo.hadoop"                            
## [13] "indeed_listinfo.tableau"                           
## [14] "indeed_listinfo.sas"                               
## [15] "indeed_listinfo.spark"                             
## [16] "indeed_listinfo.java"                              
## [17] "indeed_listinfo.Others"                            
## [18] "indeed_listinfo.CA"                                
## [19] "indeed_listinfo.NY"                                
## [20] "indeed_listinfo.VA"                                
## [21] "indeed_listinfo.TX"                                
## [22] "indeed_listinfo.MA"                                
## [23] "indeed_listinfo.IL"                                
## [24] "indeed_listinfo.WA"                                
## [25] "indeed_listinfo.MD"                                
## [26] "indeed_listinfo.DC"                                
## [27] "indeed_listinfo.NC"                                
## [28] "indeed_listinfo.Other_states"                      
## [29] "indeed_listinfo..Consulting.and.Business.Services."
## [30] "indeed_listinfo..Internet.and.Software."           
## [31] "indeed_listinfo..Banks.and.Financial.Services."    
## [32] "indeed_listinfo..Health.Care."                     
## [33] "indeed_listinfo.Insurance"                         
## [34] "indeed_listinfo.Other_industries"

library(dplyr)

indeed_listinfo3 <- dplyr::rename(indeed_listinfo2,
          "ID"="indeed_listinfo.MyUnknownColumn",
          "Title" = "indeed_listinfo.Job_Title",
          "Type" = "indeed_listinfo.Job_Type",
          "Skill" = "indeed_listinfo.Skill",
          "Skills" = "indeed_listinfo.No_of_Skills",
          "Description" = "indeed_listinfo.Description",
          "Location" = "indeed_listinfo.Location",
          "Industry" = "indeed_listinfo.Company_Industry",
          "Python" = "indeed_listinfo.python",
          "SQL" = "indeed_listinfo.sql",
          "R" = "indeed_listinfo.r",
          "Hadoop" = "indeed_listinfo.hadoop",
          "Tableau" = "indeed_listinfo.tableau",
          "SAS" = "indeed_listinfo.sas",
          "Spark" = "indeed_listinfo.spark",
          "Java" = "indeed_listinfo.java",
          "Others" = "indeed_listinfo.Others",
          "CA" = "indeed_listinfo.CA",
          "NY" = "indeed_listinfo.NY",
          "VA" = "indeed_listinfo.VA",
          "TX" = "indeed_listinfo.TX",
          "MA" = "indeed_listinfo.MA",
          "IL" = "indeed_listinfo.IL",
          "WA" = "indeed_listinfo.WA",
          "MD" = "indeed_listinfo.MD",
          "DC" = "indeed_listinfo.DC",
          "NC" = "indeed_listinfo.NC",
          "Other" = "indeed_listinfo.Other_states",
          "Consulting" = "indeed_listinfo..Consulting.and.Business.Services.",
          "Internet" = "indeed_listinfo..Internet.and.Software.",
          "Financial" = "indeed_listinfo..Banks.and.Financial.Services.",
          "Healthcare" = "indeed_listinfo..Health.Care.",
          "Insurance" = "indeed_listinfo.Insurance",
          "Varied" = "indeed_listinfo.Other_industries"

)



table(indeed_listinfo3$Python)

## 
##   0   1 
## 165 335

table(indeed_listinfo3$SQL)

## 
##   0   1 
## 253 247

table(indeed_listinfo3$R)

## 
##   0   1 
## 206 294

table(indeed_listinfo3$Hadoop)

## 
##   0   1 
## 419  81

table(indeed_listinfo3$Tableau)

## 
##   0   1 
## 429  71

table(indeed_listinfo3$SAS)

## 
##   0   1 
## 388 112

table(indeed_listinfo3$Spark)

## 
##   0   1 
## 427  73

table(indeed_listinfo3$Java)

## 
##   0   1 
## 408  92

table(indeed_listinfo3$Others)

## 
##   0   1 
##  43 457

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

setDT(indeed_listinfo3)[, .N, by=Python]

##    Python   N
## 1:      1 335
## 2:      0 165

setDT(indeed_listinfo3)[, .N, by=SQL]

##    SQL   N
## 1:   1 247
## 2:   0 253

setDT(indeed_listinfo3)[, .N, by=R]

##    R   N
## 1: 1 294
## 2: 0 206

setDT(indeed_listinfo3)[, .N, by=Hadoop]

##    Hadoop   N
## 1:      0 419
## 2:      1  81

setDT(indeed_listinfo3)[, .N, by=Tableau]

##    Tableau   N
## 1:       0 429
## 2:       1  71

setDT(indeed_listinfo3)[, .N, by=SAS]

##    SAS   N
## 1:   1 112
## 2:   0 388

setDT(indeed_listinfo3)[, .N, by=Spark]

##    Spark   N
## 1:     0 427
## 2:     1  73

setDT(indeed_listinfo3)[, .N, by=Python]

##    Python   N
## 1:      1 335
## 2:      0 165

setDT(indeed_listinfo3)[, .N, by=Java]

##    Java   N
## 1:    0 408
## 2:    1  92

Project 3

John Mazon

9/26/2020

MYSQL Database integration with R

Additional packages installed