We use a dataset consists of detailed inventor addresses and technological classifications from the UKIPO for UK patent applications with citation and inventor data from the EPO’s PATSTAT database (MacDonald & Salmanovic, 2023). The main dataset consists of 1’517’797 patents citation information between the years 1980 - 2015. For analysing patents, we grouped data by cited patents and obtained 26’976 original patents. The number of patents decreased because we analysed only original patents (cited by others). Using original patents provides more valuable insights because they are the most impactful ones. Besides date information, we used patent types to analyse the behaviours of different technologies. In this dataset, there are 8 main categories and 313 subcategories. In this research, we focused on differences between high-tech on low-tech patents. For this classification we used EUROSTAT documents for patent classification which has three categories, high-tech, low-tech and biotechnology. We conducted some data preprocessing to obtain meaningful results. We observed that biotechnology class has very few observations in the dataset and decided to drop this class. In addition to technology classification, we added time period information to our dataset ranging from t_1 to t_7 which represents 5 years periods between 1980-2015. Adding these time periods ensured that our results are timely correct.
UK.sf<-st_read("Counties_and_Unitary_Authorities_(December_2022)_UK_BUC.shp")
## Reading layer `Counties_and_Unitary_Authorities_(December_2022)_UK_BUC' from data source `/Users/zehrausta/Desktop/Patent_Spatial_Analysis/Counties_and_Unitary_Authorities_(December_2022)_UK_BUC.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 217 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -116.1928 ymin: 7054.1 xmax: 655653.8 ymax: 1220310
## Projected CRS: OSGB36 / British National Grid
UK.sf<-st_transform(UK.sf, 4326)
data <- read.csv("lks_paper_main_data.csv")
head(data)
## citation_id cited_appln_id citing_appln_id cited_lat cited_lon
## 1 1 21292583 338651101 52.57612 -1.9490919
## 2 2 21292591 21370108 52.62726 -0.5079481
## 3 3 21292591 21421412 52.62726 -0.5079481
## 4 4 21292591 21454772 52.62726 -0.5079481
## 5 5 21292609 21349235 51.39139 -0.0255888
## 6 6 21292609 21447292 51.39139 -0.0255888
## cited_ipc_class cited_date_filed cited_granted citing_appln control_appln_id
## 1 F24F 2000-01-05 0 1 338651101
## 2 A47K 2000-01-05 0 1 21370108
## 3 A47K 2000-01-05 0 1 21421412
## 4 A47K 2000-01-05 0 1 21454772
## 5 B65G 2000-03-14 0 1 21349235
## 6 B65G 2000-03-14 0 1 21447292
## control_lat control_lon control_ipc_class control_date_filed control_granted
## 1 50.94896 -0.1324262 F24F 2011-09-01 0
## 2 50.91302 -3.4681057 A47K 2002-02-05 1
## 3 53.74040 -0.4106374 A47K 2003-07-05 0
## 4 57.13030 -2.1601731 A47K 2004-08-28 0
## 5 52.92302 -1.2016132 B65G 2001-06-19 0
## 6 54.00640 -2.2269800 E04F 2004-05-25 1
## distance inter_class_sample intra_class_sample X11_digit_controls_sample
## 1 219.9102 0 1 1
## 2 278.9237 0 1 1
## 3 123.9459 0 1 1
## 4 511.7086 0 1 1
## 5 188.2525 0 1 0
## 6 326.3890 1 0 1
## early_sample late_sample granted_sample no_citations_for_class
## 1 0 1 0 182
## 2 0 1 0 402
## 3 0 1 0 402
## 4 0 1 0 402
## 5 0 1 0 189
## 6 0 1 0 189
## has_us_filing_sample has_no_us_filing_sample
## 1 0 0
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
grouped_data <- data %>%
group_by(cited_appln_id)
patents <- grouped_data %>%
summarise(
lat = first(cited_lat),
lon = first(cited_lon),
class = first(cited_ipc_class),
date = first(cited_date_filed)
)
patents <- as.data.frame(patents)
patents$general_class <- substr(patents$class, 1, 3)
data.sf <- st_as_sf(patents, coords = c("lon", "lat"),
crs = "+proj=longlat +datum=NAD83")
data.sf$date <- as.Date(data.sf$date)
data.sf$ClassCode <- substr(data.sf$class, 1, 1)
# Define IPC subclasses for high-tech and biotechnology
high_tech_classes <- c("B41J", "G06C", "G06D", "G06E", "G11C", "G06Q","G06G","G06J","G06F","G06M","B64B","B64C","B64D","B64F","B64G","C40B","C12P","C12Q","H01S","H01L","H04B","H04H","H04J","H04K","H04L","H04M","H04N","H04Q","H04R","H04S")
biotech_classes <- c("A01H", "A01H","A61K","C02F","C40B","C12N","C12P","C12Q","C12S","G01N","G01N")
# Classification function to classify each IPC class
classify_ipc <- function(ipc_class) {
if (ipc_class %in% high_tech_classes) {
return("High-Tech")
} else if (ipc_class %in% biotech_classes) {
return("Biotechnology")
} else {
return("Low-Tech")
}
}
# Apply the classification function to your dataframe
data.sf <- data.sf %>%
mutate(
HighTech_BioTech = sapply(class, classify_ipc),
)
table(data.sf$HighTech_BioTech)
##
## Biotechnology High-Tech Low-Tech
## 376 2137 24463
# Summarize the data to get counts for each category
category_counts <- data.sf %>%
group_by(HighTech_BioTech) %>%
summarise(Count = n(), .groups = 'drop')
# Create pie chart
pie_chart <- ggplot(category_counts, aes(x = "", y = Count, fill = HighTech_BioTech)) +
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") + # Use polar coordinates to make it a pie chart
scale_fill_viridis_d(option = "D", direction = -1) +
labs(fill = "Category",
x = NULL,
y = NULL) +
theme_void() + # Remove extra chart elements
theme(legend.title = element_text(size = 12),
legend.text = element_text(size = 10))
pie_chart
data.sf <- data.sf %>%
filter(HighTech_BioTech != "Biotechnology")
# Summarize data by year and HighTech_BioTech category
trends_over_time <- data.sf %>%
group_by(Year = year(date), HighTech_BioTech) %>%
summarise(Count = n(), .groups = 'drop')
# Print the summary to check the result
print(head(trends_over_time))
## Simple feature collection with 6 features and 3 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: -7.64221 ymin: 50.1926 xmax: 1.721395 ymax: 57.4622
## Geodetic CRS: +proj=longlat +datum=NAD83
## # A tibble: 6 × 4
## Year HighTech_BioTech Count geometry
## <dbl> <chr> <int> <GEOMETRY [°]>
## 1 1980 Low-Tech 1 POINT (-2.52231 53.75211)
## 2 1982 High-Tech 53 MULTIPOINT ((0.11265 52.20148), (0.0603785 52.15…
## 3 1982 Low-Tech 1024 MULTIPOINT ((1.37612 52.72473), (1.017839 52.955…
## 4 1983 High-Tech 60 MULTIPOINT ((0.4598534 52.07989), (0.14046 52.21…
## 5 1983 Low-Tech 1089 MULTIPOINT ((1.38051 52.82577), (1.11339 52.7630…
## 6 1984 High-Tech 47 MULTIPOINT ((0.1443644 52.20804), (0.1334444 52.…
# Define custom colors for the categories
custom_colors <- c("Low-Tech" = "#440154FF", "High-Tech" = "#20A486FF") # Replace with your desired colors
# Create the line chart with custom colors and larger axis labels
line_chart <- ggplot(trends_over_time, aes(x = Year, y = Count, color = HighTech_BioTech)) +
geom_line(size = 1) + # Draw lines with increased thickness
geom_point(size = 2) + # Add points at data locations with increased size
scale_color_manual(values = custom_colors) + # Apply custom colors
labs(x = "Year",
y = "Count of Patents",
color = "Category") +
theme_minimal() + # Use a minimal theme
theme(
legend.title = element_text(size = 22),
legend.text = element_text(size = 20),
axis.title.x = element_text(size = 22),
axis.title.y = element_text(size = 22),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the line chart
print(line_chart)
# Summarize data by year and HighTech_BioTech category, calculate total patents per year
yearly_data <- data.sf %>%
mutate(Year = year(date)) %>%
group_by(Year, HighTech_BioTech) %>%
summarise(Count = n(), .groups = 'drop') %>%
group_by(Year) %>%
mutate(Total = sum(Count), Proportion = Count / Total)
# Filtering to only include High-Tech
filtered_data <- yearly_data %>%
filter(HighTech_BioTech %in% c("High-Tech"))
# Plotting the proportions
proportional_chart <- ggplot(filtered_data, aes(x = Year, y = Proportion, color = HighTech_BioTech)) +
geom_line() +
geom_point() +
scale_y_continuous(labels = scales::percent) +
scale_color_manual(values = "#440154FF") +
labs(title = "",
x = "Year",
y = "Proportion (%)",
color = "Category") +
theme_minimal() +
theme(legend.title = element_text(size = 12),
legend.text = element_text(size = 10))
# Display the line chart
print(proportional_chart)
table(data.sf$HighTech_BioTech)
##
## High-Tech Low-Tech
## 2137 24463
# Step 1: Filter data for High-Tech and Low-Tech
high_tech_points <- data.sf %>%
filter(HighTech_BioTech == "High-Tech")
# Step 2: Extract coordinates from sf objects
high_tech_coords <- st_coordinates(high_tech_points) %>%
as.data.frame() %>%
rename(Longitude = X, Latitude = Y)
# Step 1: Filter data for High-Tech and Low-Tech
low_tech_points <- data.sf %>%
filter(HighTech_BioTech == "Low-Tech")
# Step 2: Extract coordinates from sf objects
low_tech_coords <- st_coordinates(low_tech_points) %>%
as.data.frame() %>%
rename(Longitude = X, Latitude = Y)
ggplot() +
geom_sf(data = UK.sf$geometry, size = 0.1) +
stat_density_2d(data = high_tech_coords, aes(x = Longitude, y = Latitude, fill =
after_stat(level), alpha = after_stat(level)),
geom = "polygon",
size = 0.1
) +
scale_fill_viridis_c(option = "D", direction = -1, limits = c(0, 0.75)) +
scale_alpha(range = c(0.75, 0.75), guide = 'none') +
theme_minimal(base_size = 14) + # Adjust base font size for legibility
theme(
legend.position = "right",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 16),
plot.background = element_rect(fill = "white", color = NA),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank()
) +
guides(fill = guide_colorbar(title = "Density"))
ggplot() +
geom_sf(data = UK.sf$geometry, size = 0.1) +
stat_density_2d(data = low_tech_coords, aes(x = Longitude, y = Latitude, fill =
after_stat(level), alpha = after_stat(level)),
geom = "polygon",
size = 0.1
) +
scale_fill_viridis_c(option = "D", direction = -1, limits = c(0, 0.75)) +
scale_alpha(range = c(0.75, 0.75), guide = 'none') +
theme_minimal(base_size = 14) + # Adjust base font size for legibility
theme(
legend.position = "right",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 16),
plot.background = element_rect(fill = "white", color = NA),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank()
) +
guides(fill = guide_colorbar(title = "Density"))
shapefile <- UK.sf
coords <- st_coordinates(data.sf$geometry)
coordinates <- data.frame(coords)
dbscan_result <- dbscan(coordinates, eps = 0.045, minPts = 5)
data.sf$cluster_label <- dbscan_result$cluster
# Filter out noise points (cluster label 0)
data.sf.clustered <- data.sf[data.sf$cluster_label != 0, ]
cluster_order <- names(sort(table(data.sf.clustered$cluster_label), decreasing = TRUE))
data.sf.clustered$cluster_label <- factor(data.sf.clustered$cluster_label, levels = cluster_order)
data.sf <- data.sf.clustered
data.sf <- data.sf %>%
group_by(cluster_label) %>%
mutate(cluster_size = n()) %>%
ungroup()
ggplot() +
# Draw the UK base map
geom_sf(data = UK.sf, fill = "white", color = "gray70") +
geom_sf(
data = data.sf,
aes(color = cluster_size),
size = 0.25
) +
scale_color_viridis_c(
option = "viridis",
direction = -1,
name = "Cluster Size"
) +
theme_minimal()
# Ensure IsHighTech is added to data.sf
proportions <- data.sf %>%
mutate(IsHighTech = ifelse(HighTech_BioTech == "High-Tech", 1, 0))
# Step 1: Calculate cluster sizes and proportions for all data
proportions <- proportions %>%
group_by(cluster_label) %>%
summarise(
geometry = st_centroid(st_union(geometry)), # Calculate cluster centroid
TotalPoints = n(), # Total points in each cluster
HighTechPoints = sum(IsHighTech), # Count of High-Tech points
ProportionHighTech = HighTechPoints / TotalPoints, # High-Tech proportion
.groups = 'drop'
)
# Step 2: Create the plot without normalizing sizes
plot <- ggplot() +
# Add the base map of the UK
geom_sf(data = UK.sf, fill = "white", color = "gray", size = 0.25) +
# Plot cluster centroids with colors for proportions and sizes for TotalPoints
geom_sf(data = proportions,
aes(size = TotalPoints, color = ProportionHighTech),
alpha = 0.8, show.legend = TRUE) +
scale_color_viridis_c(name = "High-Tech Proportion", labels = scales::percent, direction = -1) +
scale_size_continuous(name = "Cluster Size") +
labs(
x = "Longitude",
y = "Latitude"
) +
theme_minimal() +
theme(
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 16)
)
# Step 3: Display the plot
print(plot)
library(ggplot2)
library(viridis)
library(dplyr)
# Categorize cluster sizes
proportions <- proportions %>%
mutate(ClusterSizeCategory = case_when(
TotalPoints < 1000 ~ "Small",
TotalPoints >= 1000 & TotalPoints < 3000 ~ "Medium",
TotalPoints >= 3000 ~ "Big"
))
plot <- ggplot() +
# Base UK map
geom_sf(data = UK.sf, fill = "white", color = "gray", size = 0.25) +
# Cluster centroids: size by category, color by high-tech proportion
geom_sf(data = proportions,
aes(size = ClusterSizeCategory, color = ProportionHighTech),
alpha = 0.8, show.legend = TRUE) +
scale_color_gradientn(
colors = c("#3B4CC0", "#9E0142"),
name = "High-Tech Proportion",
labels = scales::percent
)+
scale_size_manual(
name = "Cluster Size",
values = c("Small" = 1, "Medium" = 3, "Big" = 5)
) +
labs(x = "Longitude", y = "Latitude") +
theme_minimal() +
theme(
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 16)
)
# Display the plot
print(plot)
set.seed(123)
min_count <- min(table(data.sf$HighTech_BioTech))
# Random under-sampling
data.sf <- data.sf %>%
group_by(HighTech_BioTech) %>%
sample_n(min_count) %>% # sample each group down to min_count
ungroup()
table(data.sf$HighTech_BioTech)
##
## High-Tech Low-Tech
## 2053 2053
data<-as.data.frame(data.sf) %>%
mutate(ClassCode = case_when(
ClassCode == "A" ~ "Human Necessities",
ClassCode == "B" ~ "Transporting",
ClassCode == "C" ~ "Chemistry; metallurgy",
ClassCode == "D" ~ "Textiles; paper",
ClassCode == "E" ~ "Fixed constructions",
ClassCode == "F" ~ "Mechanical engineering",
ClassCode == "G" ~ "Physics",
ClassCode == "H" ~ "Electricity",
# Add as many conditions as needed
TRUE ~ ClassCode
))
# Create a new column with class and time period
data <- data %>%
mutate(decade = case_when(
format(date, "%Y") >= 1980 & format(date, "%Y") < 1985 ~ "t1",
format(date, "%Y") >= 1985 & format(date, "%Y") < 1990 ~ "t2",
format(date, "%Y") >= 1990 & format(date, "%Y") < 1995 ~ "t3",
format(date, "%Y") >= 1995 & format(date, "%Y") < 2000 ~ "t4",
format(date, "%Y") >= 2000 & format(date, "%Y") < 2005 ~ "t5",
format(date, "%Y") >= 2005 & format(date, "%Y") < 2010 ~ "t6",
format(date, "%Y") >= 2010 & format(date, "%Y") < 2015 ~ "t7",
TRUE ~ "other" # Add other conditions if necessary
)) %>%
mutate(class_time_period = paste0(class, "_", decade))
data <- data %>%
mutate(class_time_period = paste0(class_time_period, "_", HighTech_BioTech))
# making association rules dataset
grouped_df <- data %>%
group_by(cluster_label) %>%
summarize(PatentList = list (class_time_period))
# Convert the grouped dataframe to a transaction format
trans<-as(grouped_df$PatentList, "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
# association rules
# apply apriori algorithm
rules<-apriori(trans, parameter=list(support = 0.02, confidence = 0.1))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.1 0.1 1 none FALSE TRUE 5 0.02 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 4
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[923 item(s), 248 transaction(s)] done [0.00s].
## sorting and recoding items ... [127 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(trans, parameter = list(support = 0.02, confidence = 0.1)):
## Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!
## done [0.00s].
## writing ... [199362 rule(s)] done [0.01s].
## creating S4 object ... done [0.02s].
#inspect(rules)
# Clean the rules
rules.clean<-rules[!is.redundant(rules)]
rules.clean<-rules.clean[is.significant(rules.clean, trans)]
rules.clean<-rules.clean[is.maximal(rules.clean)]
# Sort by lift
rules.clean <- sort(rules.clean, by = "lift", decreasing = TRUE)
inspect(head(rules.clean))
## lhs rhs support confidence coverage lift count
## [1] {E06B_t4_Low-Tech,
## H04Q_t3_High-Tech} => {H04R_t3_High-Tech} 0.02016129 1 0.02016129 49.6 5
## [2] {B60R_t4_Low-Tech,
## B65D_t3_Low-Tech} => {H01L_t6_High-Tech} 0.02016129 1 0.02016129 49.6 5
## [3] {B60R_t4_Low-Tech,
## H04Q_t5_High-Tech} => {H01L_t6_High-Tech} 0.02016129 1 0.02016129 49.6 5
## [4] {B60R_t4_Low-Tech,
## H04N_t5_High-Tech} => {H01L_t6_High-Tech} 0.02016129 1 0.02016129 49.6 5
## [5] {B60R_t4_Low-Tech,
## G06F_t3_High-Tech} => {H01L_t6_High-Tech} 0.02016129 1 0.02016129 49.6 5
## [6] {B60R_t4_Low-Tech,
## H04M_t4_High-Tech} => {H01L_t6_High-Tech} 0.02016129 1 0.02016129 49.6 5
rules <- read.csv("rules.csv", sep = ",")
rules <- rules %>%
separate(rules, into = c("lhs", "rhs"), sep = " => ", remove = FALSE)
# Step 1: Split LHS into separate columns and process RHS
processed_rules <- rules %>%
filter(!is.na(lhs) & !is.na(rhs)) %>% # Exclude rows with missing rules
mutate(
lhs_split = str_split(lhs, ","), # Split LHS into individual technologies
rhs_split = str_extract(rhs, "\\{(.*?)\\}") %>% # Extract RHS technologies
str_remove_all("\\{|\\}"), # Remove braces from RHS
rhs_class = str_extract(rhs_split, "^[A-Za-z0-9]+") # Extract class codes from RHS
) %>%
unnest_wider(lhs_split, names_sep = "_") %>% # Split LHS into multiple columns (lhs_1, lhs_2, etc.)
rename_with(~ gsub("lhs_split_", "lhs", .x)) %>% # Rename columns for better readability
mutate(across(starts_with("lhs"), ~ str_remove_all(., "\\{|\\}"))) # Remove brackets from all LHS columns
# Step 2: View results
processed_rules
## # A tibble: 1,872 × 15
## X rules lhs rhs support confidence coverage lift count lhs1 lhs2
## <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int> <chr> <chr>
## 1 1470 {G06F_… G06F… {A01… 0.0207 1 0.0207 48.2 5 G06F… G06F…
## 2 2355 {E06B_… E06B… {H01… 0.0207 1 0.0207 48.2 5 E06B… H04N…
## 3 2358 {E06B_… E06B… {H01… 0.0207 1 0.0207 48.2 5 E06B… G06F…
## 4 2364 {E06B_… E06B… {H01… 0.0207 1 0.0207 48.2 5 E06B… H04Q…
## 5 2370 {E06B_… E06B… {H01… 0.0207 1 0.0207 48.2 5 E06B… H04M…
## 6 2376 {E06B_… E06B… {H01… 0.0207 1 0.0207 48.2 5 E06B… G06F…
## 7 2391 {H04N_… H04N… {H01… 0.0207 1 0.0207 48.2 5 H04N… H04Q…
## 8 2400 {H04B_… H04B… {H01… 0.0207 1 0.0207 48.2 5 H04B… H04N…
## 9 4671 {H04M_… H04M… {H04… 0.0207 1 0.0207 48.2 5 H04M… H04M…
## 10 5075 {B65D_… B65D… {A01… 0.0207 1 0.0207 48.2 5 B65D… G06F…
## # ℹ 1,862 more rows
## # ℹ 4 more variables: lhs3 <chr>, lhs4 <chr>, rhs_split <chr>, rhs_class <chr>
df_clean <- processed_rules %>%
# 1) Extract "t4", "t6", etc. from LHS columns
mutate(
lhs1_period = str_extract(lhs1, "t\\d+"),
lhs2_period = str_extract(lhs2, "t\\d+"),
lhs3_period = str_extract(lhs3, "t\\d+"),
lhs4_period = str_extract(lhs4, "t\\d+"),
# 2) Extract "t4", "t6", etc. from the RHS column
rhs_period = str_extract(rhs_split, "t\\d+")
) %>%
# 3) Convert them to numeric
mutate(
lhs1_num = as.integer(str_remove(lhs1_period, "t")),
lhs2_num = as.integer(str_remove(lhs2_period, "t")),
lhs3_num = as.integer(str_remove(lhs3_period, "t")),
lhs4_num = as.integer(str_remove(lhs4_period, "t")),
rhs_num = as.integer(str_remove(rhs_period, "t"))
)
df_clean <- df_clean %>%
# We do rowwise so we can compute max across columns easily
rowwise() %>%
mutate(
# Gather all LHS numeric periods into a vector
lhs_periods = list(c(lhs1_num, lhs2_num, lhs3_num, lhs4_num)),
# Compute the maximum LHS period (ignoring NA)
max_lhs = max(lhs_periods, na.rm = TRUE)
) %>%
ungroup() %>%
# Keep rows where max_lhs <= rhs_num
filter(max_lhs <= rhs_num | is.na(rhs_num))
df_expanded <- df_clean %>%
# Remove braces if you still have them: {G06F_t2_High-Tech, H04N_t4_High-Tech}
mutate(lhs_clean = str_remove_all(lhs, "[{}]")) %>%
# Split by commas into multiple rows
separate_rows(lhs_clean, sep = ",") %>%
# Trim whitespace
mutate(lhs_clean = str_trim(lhs_clean)) %>%
# Possibly do the same for the RHS if it can contain multiple items
mutate(rhs_clean = str_remove_all(rhs, "[{}]")) %>%
separate_rows(rhs_clean, sep = ",") %>%
mutate(rhs_clean = str_trim(rhs_clean))
pair_counts <- df_expanded %>%
group_by(lhs_clean, rhs_clean) %>%
summarise(freq = n(), lifts = list(lift), .groups = "drop") %>%
arrange(desc(freq))
pair_counts
## # A tibble: 401 × 4
## lhs_clean rhs_clean freq lifts
## <chr> <chr> <int> <list>
## 1 G06F_t5_High-Tech H04M_t5_High-Tech 32 <dbl [32]>
## 2 H04M_t5_High-Tech H04Q_t5_High-Tech 29 <dbl [29]>
## 3 G06F_t5_High-Tech H04Q_t5_High-Tech 27 <dbl [27]>
## 4 H04M_t4_High-Tech H04N_t5_High-Tech 26 <dbl [26]>
## 5 H04M_t5_High-Tech H01L_t6_High-Tech 26 <dbl [26]>
## 6 G06F_t3_High-Tech H04M_t5_High-Tech 25 <dbl [25]>
## 7 H04M_t5_High-Tech G06F_t5_High-Tech 25 <dbl [25]>
## 8 G06F_t5_High-Tech G06F_t6_High-Tech 24 <dbl [24]>
## 9 H04M_t4_High-Tech H04M_t5_High-Tech 24 <dbl [24]>
## 10 G06F_t3_High-Tech H01L_t6_High-Tech 23 <dbl [23]>
## # ℹ 391 more rows
# 1. Pick top 10
top30 <- pair_counts %>%
slice_max(order_by = freq, n = 30)
top30
## # A tibble: 31 × 4
## lhs_clean rhs_clean freq lifts
## <chr> <chr> <int> <list>
## 1 G06F_t5_High-Tech H04M_t5_High-Tech 32 <dbl [32]>
## 2 H04M_t5_High-Tech H04Q_t5_High-Tech 29 <dbl [29]>
## 3 G06F_t5_High-Tech H04Q_t5_High-Tech 27 <dbl [27]>
## 4 H04M_t4_High-Tech H04N_t5_High-Tech 26 <dbl [26]>
## 5 H04M_t5_High-Tech H01L_t6_High-Tech 26 <dbl [26]>
## 6 G06F_t3_High-Tech H04M_t5_High-Tech 25 <dbl [25]>
## 7 H04M_t5_High-Tech G06F_t5_High-Tech 25 <dbl [25]>
## 8 G06F_t5_High-Tech G06F_t6_High-Tech 24 <dbl [24]>
## 9 H04M_t4_High-Tech H04M_t5_High-Tech 24 <dbl [24]>
## 10 G06F_t3_High-Tech H01L_t6_High-Tech 23 <dbl [23]>
## # ℹ 21 more rows
ggplot(top30, aes(x = reorder(paste(lhs_clean, "->", rhs_clean), freq), y = freq)) +
# Use geom_col for a cleaner approach (same as geom_bar(stat="identity"))
geom_col(fill = "#440154FF", width = 0.7) +
# Flip coordinates to make it a horizontal bar chart
coord_flip() +
# Expand the y-scale so labels don't get clipped (if you add geom_text)
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
# Title and axis labels
labs(
title = "Top 30 Association Rules by Frequency",
x = "Rule (LHS -> RHS)",
y = "Frequency (Count)"
) +
# A clean, minimal theme with some custom tweaks
theme_minimal(base_size = 12) +
theme(
panel.grid.minor = element_blank(), # remove minor grid lines
panel.grid.major.y = element_blank(), # remove horizontal grid lines
plot.title = element_text(face = "bold") # make title bold
)
df_expanded <- df_clean %>%
# Remove braces if present (e.g., "{G06F_t2_High-Tech, H04N_t4_High-Tech}")
mutate(lhs_clean = str_remove_all(lhs, "[{}]")) %>%
# Split multiple LHS items into separate rows by comma
separate_rows(lhs_clean, sep = ",") %>%
# Trim whitespace
mutate(lhs_clean = str_trim(lhs_clean)) %>%
# Remove the "_t<number>" part to keep just the main code + "_High-Tech"/"_Low-Tech"
mutate(lhs_clean = str_remove(lhs_clean, "_t\\d+")) %>%
# Do the same for the RHS
mutate(rhs_clean = str_remove_all(rhs, "[{}]")) %>%
separate_rows(rhs_clean, sep = ",") %>%
mutate(rhs_clean = str_trim(rhs_clean)) %>%
mutate(rhs_clean = str_remove(rhs_clean, "_t\\d+"))
pair_counts <- df_expanded %>%
group_by(lhs_clean, rhs_clean) %>%
summarise(
freq = n(),
lifts = list(lift), # each pair's list of lift values
.groups = "drop"
) %>%
arrange(desc(freq))
# Pick top 30 (or 20, 30, etc.)
top_pairs <- pair_counts %>%
slice_max(order_by = freq, n = 30)
top_pairs
## # A tibble: 30 × 4
## lhs_clean rhs_clean freq lifts
## <chr> <chr> <int> <list>
## 1 G06F_High-Tech H04M_High-Tech 118 <dbl [118]>
## 2 H04M_High-Tech G06F_High-Tech 88 <dbl [88]>
## 3 G06F_High-Tech H04Q_High-Tech 78 <dbl [78]>
## 4 H04M_High-Tech H04Q_High-Tech 73 <dbl [73]>
## 5 G06F_High-Tech G06F_High-Tech 68 <dbl [68]>
## 6 H04M_High-Tech H04M_High-Tech 67 <dbl [67]>
## 7 G06F_High-Tech H01L_High-Tech 62 <dbl [62]>
## 8 G06F_High-Tech H04N_High-Tech 59 <dbl [59]>
## 9 H04M_High-Tech H04N_High-Tech 58 <dbl [58]>
## 10 G06F_High-Tech H04B_High-Tech 56 <dbl [56]>
## # ℹ 20 more rows
#Top pairs without time period
ggplot(top_pairs,
aes(x = reorder(paste(lhs_clean, "->", rhs_clean), freq),
y = freq)) +
geom_bar(stat = "identity", fill = "#440154FF") +
coord_flip() +
theme_minimal() +
labs(
x = "Pair (LHS -> RHS)",
y = "Frequency (Count)"
)
# Step 1: Clean the data and prepare LHS and RHS
df_expanded <- df_clean %>%
# Remove braces (e.g., "{G06F_t2_High-Tech, H04N_t4_High-Tech}")
mutate(lhs_clean = str_remove_all(lhs, "[{}]")) %>%
separate_rows(lhs_clean, sep = ",") %>%
mutate(lhs_clean = str_trim(lhs_clean)) %>%
mutate(lhs_clean = str_remove(lhs_clean, "_t\\d+")) %>%
mutate(rhs_clean = str_remove_all(rhs, "[{}]")) %>%
separate_rows(rhs_clean, sep = ",") %>%
mutate(rhs_clean = str_trim(rhs_clean)) %>%
mutate(rhs_clean = str_remove(rhs_clean, "_t\\d+"))
# Step 2: Group by pairs and create a list of lift values
pair_lift_values <- df_expanded %>%
group_by(lhs_clean, rhs_clean) %>%
summarise(
freq = n(), # Frequency of the pair
lifts = list(lift), # List of lift values for each pair
.groups = "drop"
)
#data_matrix
#HIGH-TECH LOW TECH MATRIX
data_matrix <- df_clean[, c("lhs1", "lhs2", "lhs3","lhs4","rhs_split","lift")]
# Step 1: Extract "High-Tech" or "Low-Tech" from all columns
data_clean <- data_matrix %>%
mutate(
across(starts_with("lhs"), ~str_extract(., "Low-Tech|High-Tech")),
rhs_split = str_extract(rhs_split, "Low-Tech|High-Tech")
)
data_labeled <- data_clean %>%
rowwise() %>%
mutate(
mixed = any(c(lhs1, lhs2, lhs3, lhs4) == "High-Tech", na.rm = TRUE) &
any(c(lhs1, lhs2, lhs3, lhs4) == "Low-Tech", na.rm = TRUE)
) %>%
ungroup()
data_processed <- data_labeled %>%
filter(mixed != TRUE)
data_processed <- data_processed[c("lhs1","rhs_split")]
colnames(data_processed)[1] <- "lhs"
mixed_data <- data_labeled %>%
filter(mixed != FALSE)
# Select only the `rhs_split` column
mixed_data <- mixed_data %>%
select(rhs_split) %>%
mutate(lhs = "High-Tech") %>% # Add a column `lhs` with value "High-Tech"
bind_rows(
mixed_data %>%
select(rhs_split) %>% # Duplicate the data
mutate(lhs = "Low-Tech") # Add a column `lhs` with value "Low-Tech"
)
# Reorder columns
mixed_data <- mixed_data %>%
select(lhs, rhs_split, everything()) # Specify the order of the columns
# Combine datasets by rows
merged_data <- rbind(data_processed, mixed_data)
# Count combinations of lhs and rhs_split
pair_counts <- merged_data %>%
count(lhs, rhs_split) %>%
pivot_wider(names_from = rhs_split, values_from = n, values_fill = 0)
pair_counts
## # A tibble: 2 × 3
## lhs `High-Tech` `Low-Tech`
## <chr> <int> <int>
## 1 High-Tech 664 60
## 2 Low-Tech 163 39
# Format the matrix for easier interpretation
matrix_counts <- as.matrix(pair_counts[, -1]) # Exclude the lhs column
rownames(matrix_counts) <- pair_counts$lhs # Set row names to lhs values
#MOST ASYMMETRIC PAIRS.
# Create a new dataset with selected columns
cleaned_data <- df_clean %>%
select(lhs1, lhs2, lhs3, lhs4, rhs_split, lift) %>% # Select relevant columns
mutate(
lhs1 = str_remove(lhs1, "_t\\d+"), # Remove time period (_tX) from lhs1
lhs2 = str_remove(lhs2, "_t\\d+"), # Remove time period (_tX) from lhs2
lhs3 = str_remove(lhs3, "_t\\d+"), # Remove time period (_tX) from lhs3
lhs4 = str_remove(lhs4, "_t\\d+"), # Remove time period (_tX) from lhs4
rhs_split = str_remove(rhs_split, "_t\\d+") # Remove time period (_tX) from rhs_split
)
cleaned_data
## # A tibble: 766 × 6
## lhs1 lhs2 lhs3 lhs4 rhs_split lift
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 G06F_High-Tech G06F_High-Tech <NA> <NA> A01G_Low-Tech 48.2
## 2 E06B_Low-Tech H04N_High-Tech <NA> <NA> H01L_High-Tech 48.2
## 3 E06B_Low-Tech G06F_High-Tech <NA> <NA> H01L_High-Tech 48.2
## 4 E06B_Low-Tech H04Q_High-Tech <NA> <NA> H01L_High-Tech 48.2
## 5 E06B_Low-Tech H04M_High-Tech <NA> <NA> H01L_High-Tech 48.2
## 6 E06B_Low-Tech G06F_High-Tech <NA> <NA> H01L_High-Tech 48.2
## 7 H04N_High-Tech H04Q_High-Tech <NA> <NA> H01L_High-Tech 48.2
## 8 H04B_High-Tech H04N_High-Tech <NA> <NA> H01L_High-Tech 48.2
## 9 B65D_Low-Tech G06F_High-Tech H04N_High-Tech <NA> A01G_Low-Tech 48.2
## 10 G06F_High-Tech H04M_High-Tech H04N_High-Tech <NA> A01G_Low-Tech 48.2
## # ℹ 756 more rows
# Step 1: Combine all LHS columns into one and count occurrences
lhs_occurrences <- cleaned_data %>%
pivot_longer(cols = starts_with("lhs"), names_to = "lhs_position", values_to = "item") %>%
filter(!is.na(item)) %>%
group_by(item) %>%
summarise(LHS_Count = n(), .groups = "drop")
# Step 2: Count occurrences in the RHS column
rhs_occurrences <- cleaned_data %>%
group_by(rhs_split) %>%
summarise(RHS_Count = n(), .groups = "drop") %>%
rename(item = rhs_split)
# Step 3: Combine LHS and RHS counts and calculate the difference
item_differences <- lhs_occurrences %>%
full_join(rhs_occurrences, by = "item") %>%
mutate(
LHS_Count = replace_na(LHS_Count, 0),
RHS_Count = replace_na(RHS_Count, 0),
Difference = LHS_Count - RHS_Count # Calculate the difference
) %>%
arrange(desc(Difference)) # Sort by difference
occurrence_plot <- ggplot(item_differences, aes(x = reorder(item, Difference), y = Difference)) +
geom_bar(stat = "identity", fill = "#440154FF") +
coord_flip() + # Flip the coordinates for better readability
labs(
x = "Pairs",
y = "Difference (LHS - RHS)"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14)
)
# Display the plot
print(occurrence_plot)
library(dplyr)
relative_asymmetry <- lhs_occurrences %>%
full_join(rhs_occurrences, by = "item") %>%
mutate(
LHS_Count = replace_na(LHS_Count, 0),
RHS_Count = replace_na(RHS_Count, 0),
Total = LHS_Count + RHS_Count,
Asymmetry_Percent = if_else(Total > 0, (LHS_Count - RHS_Count) / Total * 100, NA_real_)
) %>%
arrange(desc(Asymmetry_Percent))
library(ggplot2)
ggplot(relative_asymmetry, aes(x = reorder(item, Asymmetry_Percent), y = Asymmetry_Percent)) +
geom_bar(stat = "identity", fill = "#440154FF") +
coord_flip() +
labs(
title = "Relative Asymmetry of Technologies (LHS vs RHS)",
x = "Technology",
y = "Asymmetry (%)"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14)
)
relative_asymmetry
## # A tibble: 29 × 5
## item LHS_Count RHS_Count Total Asymmetry_Percent
## <chr> <int> <int> <int> <dbl>
## 1 B60R_Low-Tech 18 0 18 100
## 2 E01F_Low-Tech 4 0 4 100
## 3 E04H_Low-Tech 2 0 2 100
## 4 E05B_Low-Tech 7 0 7 100
## 5 F16L_Low-Tech 4 0 4 100
## 6 G01B_Low-Tech 2 0 2 100
## 7 G08B_Low-Tech 1 0 1 100
## 8 H01R_Low-Tech 2 0 2 100
## 9 B65D_Low-Tech 138 15 153 80.4
## 10 B64C_High-Tech 9 1 10 80
## # ℹ 19 more rows
lhs_occurrences
## # A tibble: 25 × 2
## item LHS_Count
## <chr> <int>
## 1 A01K_Low-Tech 4
## 2 A47C_Low-Tech 2
## 3 A47G_Low-Tech 9
## 4 B60R_Low-Tech 18
## 5 B64C_High-Tech 9
## 6 B65D_Low-Tech 138
## 7 E01F_Low-Tech 4
## 8 E04D_Low-Tech 1
## 9 E04H_Low-Tech 2
## 10 E05B_Low-Tech 7
## # ℹ 15 more rows
# For Loop for counting occurences of High-Tech Low-Tech Pairs
count <- 0
for (i in 1:nrow(merged_data)) {
if (merged_data$lhs[i] == "High-Tech" && merged_data$rhs_split[i] == "Low-Tech") {
count <- count + 1
}
}
count
## [1] 60
# Step 1: Combine all LHS columns into one and pair with RHS
item_pairs <- cleaned_data %>%
# Combine all LHS columns into a single column
pivot_longer(cols = starts_with("lhs"), names_to = "lhs_position", values_to = "lhs_item") %>%
filter(!is.na(lhs_item)) %>% # Remove NA values
# Group by LHS-RHS pairs and summarize counts and lifts
group_by(lhs_item, rhs_split) %>%
summarise(
Pair_Count = n(), # Count occurrences
Lifts = list(lift), # Collect lift values into a list
.groups = "drop"
) %>%
# Sort by Pair_Count in descending order
arrange(desc(Pair_Count))
# Generate a heatmap from the item_pairs data
heatmap_plot <- ggplot(item_pairs, aes(x = rhs_split, y = lhs_item, fill = Pair_Count)) +
geom_tile() +
scale_fill_viridis_c(name = "Pair Count", direction = -1) + # Use a Viridis color scale for better visibility
labs(
x = "RHS",
y = "LHS"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
axis.text.y = element_text(size = 10),
legend.position = "right",
plot.title = element_text(face = "bold", size = 16)
)
# Display the heatmap
print(heatmap_plot)
library(grid)
set.seed(111)
#Network graph of patents over 20 observation
graph_data <- item_pairs %>%
filter(Pair_Count > 20) %>%
select(from = lhs_item, to = rhs_split, weight = Pair_Count)
# Step 2: Create a tidygraph object
network <- tbl_graph(edges = graph_data, directed = TRUE)
ggraph(network, layout = "fr") +
geom_edge_fan(
aes(width = weight, color = weight),
alpha = 0.8,
arrow = arrow(length = unit(3, 'mm'), type = "closed"), # Add this
end_cap = circle(3, 'mm') # Optional: makes space at node ends
) +
geom_node_point(size = 5, color = "#440154FF") +
geom_node_text(aes(label = name), repel = TRUE, size = 4, fontface = "bold") +
scale_edge_width(range = c(0.5, 3)) +
scale_edge_color_viridis(option = "D", direction = -1, name = "Pair Count") +
theme_void()
## Warning: `guide_colourbar()` cannot be used for edge_colour.
## ℹ Use one of colour, color, or fill instead.