Please note below additional packages installed. The formattable package is used to transform vectors and data frames into more readable and impactful tabular formats. Package ‘conflicted’ solved an issue with the rename func as it pertains to knit/html.
install.packages(“formattable”) install.packages(“conflicted”)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(RMySQL)
## Loading required package: DBI
library(formattable)
##DBconnect was utilized in this assignment to authenticate/pull in the flight delay information into R
my.database10 = dbConnect(MySQL(), user='root', password = 'Password1', dbname='jobs', host='localhost')
dbListTables(my.database10)
## [1] "general_skills" "indeed_jobs"
generalskills_list <- fetch(dbSendQuery(my.database10, "SELECT * FROM general_skills"))
dim(generalskills_list)
## [1] 30 5
colnames(generalskills_list)
## [1] "Keyword" "LinkedIn" "Indeed" "SimplyHired" "Monster"
summary(generalskills_list)
## Keyword LinkedIn Indeed SimplyHired
## Length:30 Length:30 Length:30 Length:30
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Monster
## Length:30
## Class :character
## Mode :character
##DBconnect was utilized in this assignment to authenticate/pull in the flight delay information into R
my.database10 = dbConnect(MySQL(), user='root', password = 'Password1', dbname='jobs', host='localhost')
dbListTables(my.database10)
## [1] "general_skills" "indeed_jobs"
indeed_list <- fetch(dbSendQuery(my.database10, "SELECT * FROM indeed_jobs"))
dim(indeed_list)
## [1] 500 43
colnames(indeed_list)
## [1] "MyUnknownColumn" "Job_Title"
## [3] "Link" "Queried_Salary"
## [5] "Job_Type" "Skill"
## [7] "No_of_Skills" "Company"
## [9] "No_of_Reviews" "No_of_Stars"
## [11] "Date_Since_Posted" "Description"
## [13] "Location" "Company_Revenue"
## [15] "Company_Employees" "Company_Industry"
## [17] "python" "sql"
## [19] "machine learning" "r"
## [21] "hadoop" "tableau"
## [23] "sas" "spark"
## [25] "java" "Others"
## [27] "CA" "NY"
## [29] "VA" "TX"
## [31] "MA" "IL"
## [33] "WA" "MD"
## [35] "DC" "NC"
## [37] "Other_states" "Consulting and Business Services"
## [39] "Internet and Software" "Banks and Financial Services"
## [41] "Health Care" "Insurance"
## [43] "Other_industries"
Experimented with the tibble func below for the first time. Tibbles have a refined print method that shows only the first 10 rows, and all the columns that fit on screen. This makes it much easier to work with large data.
library(tidyverse)
library(dplyr)
indeed_listinfo <- as_tibble(indeed_list)
indeed_listinfo
## # A tibble: 500 x 43
## MyUnknownColumn Job_Title Link Queried_Salary Job_Type Skill No_of_Skills
## <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 1 Data Sci~ http~ <80000 data_sc~ ['Ma~ 5
## 2 2 Data Sci~ http~ <80000 data_sc~ ['Da~ 9
## 3 3 Graduate~ http~ <80000 data_sc~ ['Ce~ 1
## 4 4 Data Sci~ http~ <80000 data_sc~ ['St~ 7
## 5 5 Data Sci~ http~ <80000 data_sc~ ['AI~ 6
## 6 6 Geospati~ http~ <80000 data_sc~ ['St~ 10
## 7 7 Data Sci~ http~ <80000 data_sc~ ['Ma~ 3
## 8 8 Bioinfor~ http~ <80000 data_sc~ ['Li~ 4
## 9 10 Data Sci~ http~ <80000 data_sc~ ['Ma~ 8
## 10 11 Data Sci~ http~ <80000 data_sc~ ['Ma~ 8
## # ... with 490 more rows, and 36 more variables: Company <chr>,
## # No_of_Reviews <chr>, No_of_Stars <chr>, Date_Since_Posted <int>,
## # Description <chr>, Location <chr>, Company_Revenue <chr>,
## # Company_Employees <chr>, Company_Industry <chr>, python <chr>, sql <chr>,
## # `machine learning` <chr>, r <chr>, hadoop <chr>, tableau <chr>, sas <chr>,
## # spark <chr>, java <int>, Others <int>, CA <int>, NY <int>, VA <int>,
## # TX <int>, MA <int>, IL <int>, WA <int>, MD <int>, DC <int>, NC <int>,
## # Other_states <int>, `Consulting and Business Services` <int>, `Internet and
## # Software` <int>, `Banks and Financial Services` <int>, `Health Care` <int>,
## # Insurance <int>, Other_industries <int>
generalskills_listinfo <- as_tibble(generalskills_list)
generalskills_listinfo
## # A tibble: 30 x 5
## Keyword LinkedIn Indeed SimplyHired Monster
## <chr> <chr> <chr> <chr> <chr>
## 1 machine learning 5,701 3,439 2,561 2,340
## 2 analysis 5,168 3,500 2,668 3,306
## 3 statistics 4,893 2,992 2,308 2,399
## 4 computer science 4,517 2,739 2,093 1,900
## 5 communication 3,404 2,344 1,791 2,053
## 6 mathematics 2,605 1,961 1,497 1,815
## 7 visualization 1,879 1,413 1,153 1,207
## 8 AI composite 1,568 1,125 811 687
## 9 deep learning 1,310 979 675 606
## 10 NLP composite 1,212 910 660 582
## # ... with 20 more rows
indeed_listinfo2 <- data.frame(indeed_listinfo$MyUnknownColumn,
indeed_listinfo$Job_Title,
indeed_listinfo$Job_Type,
indeed_listinfo$Skill,
indeed_listinfo$No_of_Skills,
indeed_listinfo$Description,
indeed_listinfo$Location,
indeed_listinfo$Company_Industry,
indeed_listinfo$python,
indeed_listinfo$sql,
indeed_listinfo$r,
indeed_listinfo$hadoop,
indeed_listinfo$tableau,
indeed_listinfo$sas,
indeed_listinfo$spark,
indeed_listinfo$java,
indeed_listinfo$Others,
indeed_listinfo$CA,
indeed_listinfo$NY,
indeed_listinfo$VA,
indeed_listinfo$TX,
indeed_listinfo$MA,
indeed_listinfo$IL,
indeed_listinfo$WA,
indeed_listinfo$MD,
indeed_listinfo$DC,
indeed_listinfo$NC,
indeed_listinfo$Other_states,
indeed_listinfo$`Consulting and Business Services`,
indeed_listinfo$`Internet and Software`,
indeed_listinfo$`Banks and Financial Services`,
indeed_listinfo$`Health Care`,
indeed_listinfo$Insurance,
indeed_listinfo$Other_industries)
(colnames(indeed_listinfo2))
## [1] "indeed_listinfo.MyUnknownColumn"
## [2] "indeed_listinfo.Job_Title"
## [3] "indeed_listinfo.Job_Type"
## [4] "indeed_listinfo.Skill"
## [5] "indeed_listinfo.No_of_Skills"
## [6] "indeed_listinfo.Description"
## [7] "indeed_listinfo.Location"
## [8] "indeed_listinfo.Company_Industry"
## [9] "indeed_listinfo.python"
## [10] "indeed_listinfo.sql"
## [11] "indeed_listinfo.r"
## [12] "indeed_listinfo.hadoop"
## [13] "indeed_listinfo.tableau"
## [14] "indeed_listinfo.sas"
## [15] "indeed_listinfo.spark"
## [16] "indeed_listinfo.java"
## [17] "indeed_listinfo.Others"
## [18] "indeed_listinfo.CA"
## [19] "indeed_listinfo.NY"
## [20] "indeed_listinfo.VA"
## [21] "indeed_listinfo.TX"
## [22] "indeed_listinfo.MA"
## [23] "indeed_listinfo.IL"
## [24] "indeed_listinfo.WA"
## [25] "indeed_listinfo.MD"
## [26] "indeed_listinfo.DC"
## [27] "indeed_listinfo.NC"
## [28] "indeed_listinfo.Other_states"
## [29] "indeed_listinfo..Consulting.and.Business.Services."
## [30] "indeed_listinfo..Internet.and.Software."
## [31] "indeed_listinfo..Banks.and.Financial.Services."
## [32] "indeed_listinfo..Health.Care."
## [33] "indeed_listinfo.Insurance"
## [34] "indeed_listinfo.Other_industries"
library(dplyr)
indeed_listinfo3 <- dplyr::rename(indeed_listinfo2,
"ID"="indeed_listinfo.MyUnknownColumn",
"Title" = "indeed_listinfo.Job_Title",
"Type" = "indeed_listinfo.Job_Type",
"Skill" = "indeed_listinfo.Skill",
"Skills" = "indeed_listinfo.No_of_Skills",
"Description" = "indeed_listinfo.Description",
"Location" = "indeed_listinfo.Location",
"Industry" = "indeed_listinfo.Company_Industry",
"Python" = "indeed_listinfo.python",
"SQL" = "indeed_listinfo.sql",
"R" = "indeed_listinfo.r",
"Hadoop" = "indeed_listinfo.hadoop",
"Tableau" = "indeed_listinfo.tableau",
"SAS" = "indeed_listinfo.sas",
"Spark" = "indeed_listinfo.spark",
"Java" = "indeed_listinfo.java",
"Others" = "indeed_listinfo.Others",
"CA" = "indeed_listinfo.CA",
"NY" = "indeed_listinfo.NY",
"VA" = "indeed_listinfo.VA",
"TX" = "indeed_listinfo.TX",
"MA" = "indeed_listinfo.MA",
"IL" = "indeed_listinfo.IL",
"WA" = "indeed_listinfo.WA",
"MD" = "indeed_listinfo.MD",
"DC" = "indeed_listinfo.DC",
"NC" = "indeed_listinfo.NC",
"Other" = "indeed_listinfo.Other_states",
"Consulting" = "indeed_listinfo..Consulting.and.Business.Services.",
"Internet" = "indeed_listinfo..Internet.and.Software.",
"Financial" = "indeed_listinfo..Banks.and.Financial.Services.",
"Healthcare" = "indeed_listinfo..Health.Care.",
"Insurance" = "indeed_listinfo.Insurance",
"Varied" = "indeed_listinfo.Other_industries"
)
table(indeed_listinfo3$Python)
##
## 0 1
## 165 335
table(indeed_listinfo3$SQL)
##
## 0 1
## 253 247
table(indeed_listinfo3$R)
##
## 0 1
## 206 294
table(indeed_listinfo3$Hadoop)
##
## 0 1
## 419 81
table(indeed_listinfo3$Tableau)
##
## 0 1
## 429 71
table(indeed_listinfo3$SAS)
##
## 0 1
## 388 112
table(indeed_listinfo3$Spark)
##
## 0 1
## 427 73
table(indeed_listinfo3$Java)
##
## 0 1
## 408 92
table(indeed_listinfo3$Others)
##
## 0 1
## 43 457
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
setDT(indeed_listinfo3)[, .N, by=Python]
## Python N
## 1: 1 335
## 2: 0 165
setDT(indeed_listinfo3)[, .N, by=SQL]
## SQL N
## 1: 1 247
## 2: 0 253
setDT(indeed_listinfo3)[, .N, by=R]
## R N
## 1: 1 294
## 2: 0 206
setDT(indeed_listinfo3)[, .N, by=Hadoop]
## Hadoop N
## 1: 0 419
## 2: 1 81
setDT(indeed_listinfo3)[, .N, by=Tableau]
## Tableau N
## 1: 0 429
## 2: 1 71
setDT(indeed_listinfo3)[, .N, by=SAS]
## SAS N
## 1: 1 112
## 2: 0 388
setDT(indeed_listinfo3)[, .N, by=Spark]
## Spark N
## 1: 0 427
## 2: 1 73
setDT(indeed_listinfo3)[, .N, by=Python]
## Python N
## 1: 1 335
## 2: 0 165
setDT(indeed_listinfo3)[, .N, by=Java]
## Java N
## 1: 0 408
## 2: 1 92