library(arules)
## Warning: package 'arules' was built under R version 4.2.1
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(tidygraph)
## Warning: package 'tidygraph' was built under R version 4.2.1
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:stats':
##
## filter
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.2.1
# Load the job listings as basket-format transactions: one CSV row becomes one
# transaction and every comma-separated field becomes an item.
# header = TRUE drops the first line of the file. That line holds the column
# names (company_location, company_size, ...), and without this flag it is
# mined as a bogus transaction of its own.
# NOTE(review): captured output in this file that reports 246 transactions or
# lists the column names as transaction [1] was produced before this fix and
# has to be regenerated.
# NOTE(review): the absolute Windows path only resolves on the machine the
# analysis was written on -- consider a project-relative path.
DSSalary <- read.transactions(
  "D:/archive (2)/SM_DSJobs_2.csv",
  sep = ",",
  header = TRUE
)
## Warning in asMethod(object): removing duplicated items in transactions
# Overview of the transaction set: dimensions, density, most frequent items
# and the distribution of transaction lengths.
summary(object = DSSalary)
## transactions as itemMatrix in sparse format with
## 246 rows (elements/itemsets/transactions) and
## 306 columns (items) and a density of 0.02322121
##
## most frequent items:
## 100 L US MI SE (Other)
## 134 132 110 103 77 1192
##
## element (itemset/transaction) length distribution:
## sizes
## 6 7 8
## 10 200 36
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 7.000 7.000 7.106 7.000 8.000
##
## includes extended item information - examples:
## labels
## 1 <100K
## 2 <10K
## 3 <15K
# Print the first 20 transactions to eyeball how the rows were parsed.
# If the CSV header line was read as data it shows up here as transaction [1].
inspect(DSSalary[seq_len(20)])
## items
## [1] {company_location,
## company_size,
## employee_residence,
## experience_level,
## job_title,
## remote_ratio,
## Salary bin,
## salary_in_usd}
## [2] {<10K,
## 0,
## 2876,
## Data Scientist,
## MI,
## MX,
## S}
## [3] {<10K,
## 100,
## 4000,
## Data Engineer,
## IR,
## M,
## MI}
## [4] {<10K,
## 0,
## 4000,
## Data Scientist,
## EN,
## M,
## VN}
## [5] {<10K,
## 3D Computer Vision Researcher,
## 50,
## 5423,
## IN,
## M,
## MI}
## [6] {<10K,
## 100,
## 5695,
## Data Scientist,
## IN,
## MI,
## S,
## US}
## [7] {<10K,
## 50,
## 5707,
## Data Science Consultant,
## EN,
## IN,
## M}
## [8] {<10K,
## 0,
## 5898,
## Big Data Engineer,
## CH,
## EN,
## IN,
## L}
## [9] {<10K,
## 0,
## 6072,
## Data Analyst,
## EN,
## IN,
## S}
## [10] {<10K,
## 100,
## 6072,
## IN,
## L,
## MI,
## Product Data Analyst}
## [11] {<10K,
## 50,
## 8000,
## Data Analyst,
## L,
## MI,
## PK}
## [12] {<10K,
## 100,
## 9272,
## BI Data Analyst,
## EN,
## KE,
## S}
## [13] {<15K,
## 100,
## 10000,
## Data Analyst,
## EN,
## NG,
## S}
## [14] {<15K,
## 12000,
## 50,
## M,
## Machine Learning Scientist,
## MI,
## PK}
## [15] {<15K,
## 100,
## 12000,
## AI Scientist,
## EN,
## M,
## PK,
## US}
## [16] {<15K,
## 100,
## 12000,
## AI Scientist,
## BR,
## EN,
## S,
## US}
## [17] {<15K,
## 0,
## 13000,
## BR,
## Data Scientist,
## MI,
## S}
## [18] {<15K,
## 0,
## 13105,
## Data Engineer,
## M,
## MI,
## TR}
## [19] {<15K,
## 100,
## 13400,
## Data Scientist,
## EN,
## L,
## UA}
## [20] {<20K,
## 100,
## 15966,
## DE,
## EN,
## ML Engineer,
## S}
# Relative frequency (share of transactions containing the item) for the
# first 20 items, taken in the column order of the item matrix.
itemFrequency(DSSalary[, seq_len(20)], type = "relative")
## <100K <10K <15K <200K <20K <30K
## 0.211382114 0.044715447 0.028455285 0.300813008 0.028455285 0.065040650
## <50K <70K >200K 0 100 10000
## 0.060975610 0.126016260 0.085365854 0.162601626 0.544715447 0.004065041
## 100000 103000 103750 103954 105000 106000
## 0.016260163 0.004065041 0.004065041 0.004065041 0.012195122 0.004065041
## 109024 110000
## 0.004065041 0.016260163
# Bar charts of item frequencies: first every item that reaches a support of
# 0.1, then the looser 0.05 cut-off, and finally the 20 most common items
# drawn as absolute counts instead of proportions.
itemFrequencyPlot(
  DSSalary,
  support = 0.1,
  main = "Items with 0.1 support"
)

itemFrequencyPlot(
  DSSalary,
  support = 0.05,
  main = "Items with 0.05 support"
)

itemFrequencyPlot(
  DSSalary,
  topN = 20,
  type = "absolute",
  main = "top 20 most frequent Items"
)

# Mine association rules with Apriori. A rule is kept when it reaches a
# support of at least 0.03 and a confidence of at least 0.5.
# The call is kept argument-for-argument because apriori() records it in the
# rule set's mining info.
# NOTE(review): minlen = 1 also admits one-item rules, i.e. rules with an
# empty left-hand side; raise it to 2 if those are not wanted.
DSSalRules <- apriori(
  data = DSSalary,
  parameter = list(
    support = 0.03,
    confidence = 0.5,
    minlen = 1
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.03 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 7
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[306 item(s), 246 transaction(s)] done [0.00s].
## sorting and recoding items ... [29 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [263 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Overview of the mined rules: how many there are, their length distribution
# and the spread of the quality measures (support, confidence, lift, ...).
summary(object = DSSalRules)
## set of 263 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5
## 2 46 132 75 8
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 3.000 3.156 4.000 5.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.03252 Min. :0.5000 Min. :0.03252 Min. :0.9179
## 1st Qu.:0.03659 1st Qu.:0.5714 1st Qu.:0.05691 1st Qu.:1.2087
## Median :0.05285 Median :0.6667 Median :0.08130 Median :1.4330
## Mean :0.07730 Mean :0.6716 Mean :0.12305 Mean :1.5445
## 3rd Qu.:0.08537 3rd Qu.:0.7500 3rd Qu.:0.13415 3rd Qu.:1.7508
## Max. :0.54472 Max. :1.0000 Max. :1.00000 Max. :6.5600
## count
## Min. : 8.00
## 1st Qu.: 9.00
## Median : 13.00
## Mean : 19.02
## 3rd Qu.: 21.00
## Max. :134.00
##
## mining info:
## data ntransactions support confidence
## DSSalary 246 0.03 0.5
## call
## apriori(data = DSSalary, parameter = list(support = 0.03, confidence = 0.5, minlen = 1))
# Show the ten strongest rules, ranked by lift and then by confidence
# (both descending).
# head() is used instead of `[1:10]` so the call still works when fewer than
# ten rules were mined; with ten or more rules the output is the same.
inspect(
  head(
    sort(DSSalRules, by = c("lift", "confidence"), decreasing = TRUE),
    n = 10
  )
)
## lhs rhs support confidence
## [1] {<100K, 100, US} => {Data Analyst} 0.03252033 0.5333333
## [2] {100, Data Analyst, US} => {<100K} 0.03252033 0.8000000
## [3] {Data Analyst, US} => {<100K} 0.03658537 0.7500000
## [4] {100, Data Analyst} => {<100K} 0.03658537 0.7500000
## [5] {GB, L} => {50} 0.03658537 1.0000000
## [6] {100, M, SE} => {<200K} 0.03658537 0.8181818
## [7] {100, Data Engineer, US} => {<200K} 0.04065041 0.7692308
## [8] {100, M, US} => {<200K} 0.04878049 0.7500000
## [9] {GB} => {50} 0.04878049 0.7058824
## [10] {<100K, 100, US} => {EN} 0.03252033 0.5333333
## coverage lift count
## [1] 0.06097561 6.560000 8
## [2] 0.04065041 3.784615 8
## [3] 0.04878049 3.548077 9
## [4] 0.04878049 3.548077 9
## [5] 0.03658537 3.464789 9
## [6] 0.04471545 2.719902 9
## [7] 0.05284553 2.557173 10
## [8] 0.06504065 2.493243 12
## [9] 0.06910569 2.445733 12
## [10] 0.06097561 2.429630 8
# Keep only the rules whose left-hand side contains the "<100K" or the
# "<200K" item (%in% matches a rule as soon as any listed item is present).
DataSSalaryRules <- subset(
  DSSalRules,
  subset = lhs %in% c("<100K", "<200K")
)

# Show the ten strongest of those rules, ranked by lift and then confidence.
# head() instead of `[1:10]`: the filtered set may hold fewer than ten rules,
# in which case the fixed index range would fail.
inspect(
  head(
    sort(DataSSalaryRules, by = c("lift", "confidence"), decreasing = TRUE),
    n = 10
  )
)
## lhs rhs support confidence
## [1] {<100K, 100, US} => {Data Analyst} 0.03252033 0.5333333
## [2] {<100K, 100, US} => {EN} 0.03252033 0.5333333
## [3] {<200K, 100, Data Engineer} => {US} 0.04065041 1.0000000
## [4] {<200K, S} => {SE} 0.03252033 0.6666667
## [5] {<200K, Data Engineer} => {US} 0.04878049 0.9230769
## [6] {<200K, 100, L, SE} => {US} 0.04878049 0.9230769
## [7] {<200K, 100, L, MI} => {US} 0.04878049 0.9230769
## [8] {<100K, Data Analyst} => {US} 0.03658537 0.9000000
## [9] {<200K, 100, L} => {US} 0.10569106 0.8965517
## [10] {<200K, L, SE} => {US} 0.06910569 0.8947368
## coverage lift count
## [1] 0.06097561 6.560000 8
## [2] 0.06097561 2.429630 8
## [3] 0.04065041 2.236364 10
## [4] 0.04878049 2.129870 8
## [5] 0.05284553 2.064336 12
## [6] 0.05284553 2.064336 12
## [7] 0.05284553 2.064336 12
## [8] 0.04065041 2.012727 9
## [9] 0.11788618 2.005016 26
## [10] 0.07723577 2.000957 17
# Keep only the rules whose left-hand side contains the item "L"
# (presumably company_size = large, going by the variable name -- confirm
# against the data dictionary).
LargeCompanyDSRules <- subset(DSSalRules, subset = lhs %in% "L")

# Store the subset ranked by lift and then confidence, both descending, so
# later code sees the strongest rules first.
LargeCompanyDSRules <- sort(
  LargeCompanyDSRules,
  by = c("lift", "confidence"),
  decreasing = TRUE
)

# Show the ten strongest rules. head() instead of `[1:10]` keeps this working
# when fewer than ten rules mention "L" on the left-hand side.
inspect(head(LargeCompanyDSRules, n = 10))
## lhs rhs support confidence coverage
## [1] {GB, L} => {50} 0.03658537 1.0000000 0.03658537
## [2] {>200K, 100, L} => {US} 0.05284553 1.0000000 0.05284553
## [3] {Data Scientist, L, US} => {<200K} 0.03252033 0.6666667 0.04878049
## [4] {L, SE, US} => {<200K} 0.06910569 0.6538462 0.10569106
## [5] {<70K, L} => {50} 0.04065041 0.6250000 0.06504065
## [6] {>200K, L} => {US} 0.06504065 0.9411765 0.06910569
## [7] {100, L, SE, US} => {<200K} 0.04878049 0.6315789 0.07723577
## [8] {<200K, 100, L, SE} => {US} 0.04878049 0.9230769 0.05284553
## [9] {<200K, 100, L, MI} => {US} 0.04878049 0.9230769 0.05284553
## [10] {<200K, 100, L} => {US} 0.10569106 0.8965517 0.11788618
## lift count
## [1] 3.464789 9
## [2] 2.236364 13
## [3] 2.216216 8
## [4] 2.173597 17
## [5] 2.165493 10
## [6] 2.104813 16
## [7] 2.099573 12
## [8] 2.064336 12
## [9] 2.064336 12
## [10] 2.005016 26