R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot. ###### Code developed by Ahmed Hossain (adapted for ATP W152 dataset) #######

Topic: Association Rules Mining on ATP W152 Survey Data

Target Variable: DrCell (Distracted Driving - Cell Phone use as a problem)

Database: ATP W152 [cleaned & filtered]

Model: Association Rules Mining (ARM)

———————————————————————–

STEP 0: Setup

———————————————————————–

install.packages(“rmarkdown”) install.packages(“knitr”) rm(list = ls()) getwd() setwd(“C:/Users/asr171/OneDrive - Texas State University/Das, Subasish’s files - PewSurvey/W152/Arka/V3”)

Load libraries

library(ggplot2) library(base) library(randomForest) library(tree) library(stats) library(readr) library(magrittr) library(dplyr) library(arules) library(arulesViz) library(RColorBrewer)

———————————————————————–

STEP 1: Read and prepare data

———————————————————————–

Columns: Area, Regn, Land, Age, Gender, Edu, Race, Born, Marr,

Politics, Income, Ideology, IntUse, Wght (col 14 - numeric), DrCell (col 15 - target)

‘Wght’ (col 14) is a continuous numeric survey weight -> excluded from all transactions

‘DrCell’ (col 15) is the target variable with values:

“Major problem”, “Minor problem”, “Not a problem”, “No Response”, “Refused”

data1 <- read.csv( “C:/Users/asr171/OneDrive - Texas State University/Das, Subasish’s files - PewSurvey/W152/Arka/V3/ATP W152 Data_cleanedFin_AC.csv”, header = TRUE, sep = “,” )

colnames(data1) dim(data1) str(data1)

Convert all character columns to factors

data2 <- data1 %>% mutate_if(is.character, as.factor) str(data2) colnames(data2) attach(data2) summary(data2)

———————————————————————–

STEP 2: Full-dataset ARM (exploratory)

Exclude only Wght (col 14); keep DrCell in transactions

———————————————————————–

data3 <- as(data2[, -14], “transactions”) class(data3)

Run apriori with default settings

rules <- apriori(data3) summary(rules) print(length(rules))

Inspect top 100 rules by lift

options(digits = 2) inspect(head(rules, n = 100, by = “lift”))

Visualize rules

plot(rules, jitter = 0) plot(rules, method = “grouped”, control = list(k = 10)) plot(rules[1:50], method = “graph”)

Most frequent items - Absolute

itemLabels(data3) items_abs <- itemFrequency(data3, type = “absolute”) a <- head(sort(items_abs, decreasing = TRUE), n = 20) a write.csv(a, “absolute_frequency.csv”)

Most frequent items - Relative

items_rel <- itemFrequency(data3, type = “relative”) r <- head(sort(items_rel, decreasing = TRUE), n = 20) r write.csv(r, “relative_frequency.csv”)

Horizontal bar plot - Absolute frequency (top 20)

par(mar = c(3, 5, 2, 2) + .1) itemFrequencyPlot(data3, topN = 20, cex.names = 0.8, las = 2, cex.axis = 0.9, cex.lab = 0.9, ylab = ““, xlab =”Absolute frequency”, main = “Absolute Item Frequency Plot (Top 20 items)”, type = “absolute”, col = brewer.pal(8, ‘Pastel2’), horiz = TRUE)

Vertical bar plot - Relative frequency (top 20)

iF20 <- rev(tail(sort(itemFrequency(data3)), 20)) par(mar = c(12, 4, 1, 1)) barplot(iF20, las = 2, cex.names = 0.9, cex.axis = 0.9, cex.lab = 0.9, xlab = ““, ylab =”Relative frequency”, main = “Relative Item Frequency Plot (Top 20 items)”, col = brewer.pal(8, ‘Pastel2’), ylim = c(0, 1))

———————————————————————–

STEP 3: APPROACH A - Subset-based ARM

Mirrors original code structure exactly:

Subset rows by DrCell category -> remove DrCell from transactions

-> find co-occurring factor patterns WITHIN each DrCell group

———————————————————————–

set.seed(1234)

—- Subset 1: DrCell = “Major problem” —-

data4_major <- subset(data2, DrCell == “Major problem”) dim(data4_major)

Exclude Wght (col 14) AND DrCell (col 15) from transaction matrix

data5_major <- as(data4_major[, c(-14, -15)], “transactions”) class(data5_major) itemLabels(data5_major) summary(data5_major)

2-itemset

rules_major_2 <- apriori(data5_major, parameter = list(minlen = 2, maxlen = 2, supp = 0.05, conf = 0.3, target = “rules”)) rules_major_2.sorted <- sort(subset(rules_major_2, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_major_2.sorted) write(rules_major_2.sorted, file = “result_Major_2itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

Visualize

plot(rules_major_2.sorted, jitter = 0) plot(rules_major_2.sorted, method = “grouped”, control = list(k = 10)) plot(rules_major_2.sorted, method = “graph”)

3-itemset

rules_major_3 <- apriori(data5_major, parameter = list(minlen = 3, maxlen = 3, supp = 0.05, conf = 0.3, target = “rules”)) rules_major_3.sorted <- sort(subset(rules_major_3, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_major_3.sorted[1:100]) write(rules_major_3.sorted, file = “result_Major_3itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

4-itemset

rules_major_4 <- apriori(data5_major, parameter = list(minlen = 4, maxlen = 4, supp = 0.05, conf = 0.3, target = “rules”)) rules_major_4.sorted <- sort(subset(rules_major_4, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_major_4.sorted[1:100]) write(rules_major_4.sorted, file = “result_Major_4itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

—- Subset 2: DrCell = “Minor problem” —-

data4_minor <- subset(data2, DrCell == “Minor problem”) dim(data4_minor)

data5_minor <- as(data4_minor[, c(-14, -15)], “transactions”) class(data5_minor) itemLabels(data5_minor) summary(data5_minor)

2-itemset

rules_minor_2 <- apriori(data5_minor, parameter = list(minlen = 2, maxlen = 2, supp = 0.05, conf = 0.3, target = “rules”)) rules_minor_2.sorted <- sort(subset(rules_minor_2, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_minor_2.sorted) write(rules_minor_2.sorted, file = “result_Minor_2itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

Visualize

plot(rules_minor_2.sorted, jitter = 0) plot(rules_minor_2.sorted, method = “grouped”, control = list(k = 10)) plot(rules_minor_2.sorted, method = “graph”)

3-itemset

rules_minor_3 <- apriori(data5_minor, parameter = list(minlen = 3, maxlen = 3, supp = 0.05, conf = 0.3, target = “rules”)) rules_minor_3.sorted <- sort(subset(rules_minor_3, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_minor_3.sorted[1:100]) write(rules_minor_3.sorted, file = “result_Minor_3itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

4-itemset

rules_minor_4 <- apriori(data5_minor, parameter = list(minlen = 4, maxlen = 4, supp = 0.05, conf = 0.3, target = “rules”)) rules_minor_4.sorted <- sort(subset(rules_minor_4, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_minor_4.sorted[1:100]) write(rules_minor_4.sorted, file = “result_Minor_4itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

—- Subset 3: DrCell = “Not a problem” —-

data4_not <- subset(data2, DrCell == “Not a problem”) dim(data4_not)

data5_not <- as(data4_not[, c(-14, -15)], “transactions”) class(data5_not) itemLabels(data5_not) summary(data5_not)

2-itemset

rules_not_2 <- apriori(data5_not, parameter = list(minlen = 2, maxlen = 2, supp = 0.05, conf = 0.3, target = “rules”)) rules_not_2.sorted <- sort(subset(rules_not_2, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_not_2.sorted) write(rules_not_2.sorted, file = “result_NotProblem_2itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

Visualize

plot(rules_not_2.sorted, jitter = 0) plot(rules_not_2.sorted, method = “grouped”, control = list(k = 10)) plot(rules_not_2.sorted, method = “graph”)

3-itemset

rules_not_3 <- apriori(data5_not, parameter = list(minlen = 3, maxlen = 3, supp = 0.05, conf = 0.3, target = “rules”)) rules_not_3.sorted <- sort(subset(rules_not_3, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_not_3.sorted[1:100]) write(rules_not_3.sorted, file = “result_NotProblem_3itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

4-itemset

rules_not_4 <- apriori(data5_not, parameter = list(minlen = 4, maxlen = 4, supp = 0.05, conf = 0.3, target = “rules”)) rules_not_4.sorted <- sort(subset(rules_not_4, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_not_4.sorted[1:100]) write(rules_not_4.sorted, file = “result_NotProblem_4itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

———————————————————————–

STEP 4: APPROACH B - Predictive ARM using DrCell as RHS target

Uses ALL rows; keeps DrCell in transactions

Finds rules: {factor combinations} => {DrCell = X}

———————————————————————–

Full transaction matrix excluding only Wght (col 14); DrCell stays in

data3_full <- as(data2[, -14], “transactions”) summary(data3_full)

set.seed(1234)

—- Predict DrCell = “Major problem” —-

2-itemset

rules_pred_maj_2 <- apriori(data3_full, parameter = list(minlen = 2, maxlen = 2, supp = 0.001, conf = 0.05, target = “rules”), appearance = list(default = “lhs”, rhs = “DrCell=Major problem”)) rules_pred_maj_2.sorted <- sort(subset(rules_pred_maj_2, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_pred_maj_2.sorted) write(rules_pred_maj_2.sorted, file = “result_Pred_Major_2itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

Visualize

plot(rules_pred_maj_2.sorted, jitter = 0) plot(rules_pred_maj_2.sorted, method = “grouped”, control = list(k = 10)) plot(rules_pred_maj_2.sorted, method = “graph”)

3-itemset

rules_pred_maj_3 <- apriori(data3_full, parameter = list(minlen = 3, maxlen = 3, supp = 0.001, conf = 0.1, target = “rules”), appearance = list(default = “lhs”, rhs = “DrCell=Major problem”)) rules_pred_maj_3.sorted <- sort(subset(rules_pred_maj_3, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_pred_maj_3.sorted[1:100]) write(rules_pred_maj_3.sorted, file = “result_Pred_Major_3itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

4-itemset

rules_pred_maj_4 <- apriori(data3_full, parameter = list(minlen = 4, maxlen = 4, supp = 0.001, conf = 0.1, target = “rules”), appearance = list(default = “lhs”, rhs = “DrCell=Major problem”)) rules_pred_maj_4.sorted <- sort(subset(rules_pred_maj_4, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_pred_maj_4.sorted[1:100]) write(rules_pred_maj_4.sorted, file = “result_Pred_Major_4itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

—- Predict DrCell = “Minor problem” —-

2-itemset

rules_pred_min_2 <- apriori(data3_full, parameter = list(minlen = 2, maxlen = 2, supp = 0.001, conf = 0.05, target = “rules”), appearance = list(default = “lhs”, rhs = “DrCell=Minor problem”)) rules_pred_min_2.sorted <- sort(subset(rules_pred_min_2, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_pred_min_2.sorted) write(rules_pred_min_2.sorted, file = “result_Pred_Minor_2itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

Visualize

plot(rules_pred_min_2.sorted, jitter = 0) plot(rules_pred_min_2.sorted, method = “grouped”, control = list(k = 10)) plot(rules_pred_min_2.sorted, method = “graph”)

3-itemset

rules_pred_min_3 <- apriori(data3_full, parameter = list(minlen = 3, maxlen = 3, supp = 0.001, conf = 0.1, target = “rules”), appearance = list(default = “lhs”, rhs = “DrCell=Minor problem”)) rules_pred_min_3.sorted <- sort(subset(rules_pred_min_3, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_pred_min_3.sorted[1:100]) write(rules_pred_min_3.sorted, file = “result_Pred_Minor_3itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

4-itemset

rules_pred_min_4 <- apriori(data3_full, parameter = list(minlen = 4, maxlen = 4, supp = 0.001, conf = 0.1, target = “rules”), appearance = list(default = “lhs”, rhs = “DrCell=Minor problem”)) rules_pred_min_4.sorted <- sort(subset(rules_pred_min_4, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_pred_min_4.sorted[1:100]) write(rules_pred_min_4.sorted, file = “result_Pred_Minor_4itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

—- Predict DrCell = “Not a problem” —-

2-itemset

rules_pred_not_2 <- apriori(data3_full, parameter = list(minlen = 2, maxlen = 2, supp = 0.001, conf = 0.05, target = “rules”), appearance = list(default = “lhs”, rhs = “DrCell=Not a problem”)) rules_pred_not_2.sorted <- sort(subset(rules_pred_not_2, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_pred_not_2.sorted) write(rules_pred_not_2.sorted, file = “result_Pred_NotProblem_2itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

Visualize

plot(rules_pred_not_2.sorted, jitter = 0) plot(rules_pred_not_2.sorted, method = “grouped”, control = list(k = 10)) plot(rules_pred_not_2.sorted, method = “graph”)

3-itemset

rules_pred_not_3 <- apriori(data3_full, parameter = list(minlen = 3, maxlen = 3, supp = 0.001, conf = 0.1, target = “rules”), appearance = list(default = “lhs”, rhs = “DrCell=Not a problem”)) rules_pred_not_3.sorted <- sort(subset(rules_pred_not_3, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_pred_not_3.sorted[1:100]) write(rules_pred_not_3.sorted, file = “result_Pred_NotProblem_3itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)

4-itemset

rules_pred_not_4 <- apriori(data3_full, parameter = list(minlen = 4, maxlen = 4, supp = 0.001, conf = 0.1, target = “rules”), appearance = list(default = “lhs”, rhs = “DrCell=Not a problem”)) rules_pred_not_4.sorted <- sort(subset(rules_pred_not_4, subset = lift > 1), by = “lift”, decreasing = TRUE) inspect(rules_pred_not_4.sorted[1:100]) write(rules_pred_not_4.sorted, file = “result_Pred_NotProblem_4itemset.csv”, sep = “,”, quote = TRUE, row.names = FALSE)