Reproducible codes for Asymmetric Relatedness

This document shows how to reproduce the graphics and analysis that were presented in a paper on “Technological drivers of innovativeness: Asymmetric relations of patent activity in spatio-temporal clusters in the UK in 1980-2015”

Reading the data for the analysis

We use a dataset consists of detailed inventor addresses and technological classifications from the UKIPO for UK patent applications with citation and inventor data from the EPO’s PATSTAT database (MacDonald & Salmanovic, 2023). The main dataset consists of 1’517’797 patents citation information between the years 1980 - 2015. For analysing patents, we grouped data by cited patents and obtained 26’976 original patents. The number of patents decreased because we analysed only original patents (cited by others). Using original patents provides more valuable insights because they are the most impactful ones. Besides date information, we used patent types to analyse the behaviours of different technologies. In this dataset, there are 8 main categories and 313 subcategories. In this research, we focused on differences between high-tech on low-tech patents. For this classification we used EUROSTAT documents for patent classification which has three categories, high-tech, low-tech and biotechnology. We conducted some data preprocessing to obtain meaningful results. We observed that biotechnology class has very few observations in the dataset and decided to drop this class. In addition to technology classification, we added time period information to our dataset ranging from t_1 to t_7 which represents 5 years periods between 1980-2015. Adding these time periods ensured that our results are timely correct.

UK.sf<-st_read("Counties_and_Unitary_Authorities_(December_2022)_UK_BUC.shp")
## Reading layer `Counties_and_Unitary_Authorities_(December_2022)_UK_BUC' from data source `/Users/zehrausta/Desktop/Patent_Spatial_Analysis/Counties_and_Unitary_Authorities_(December_2022)_UK_BUC.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 217 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -116.1928 ymin: 7054.1 xmax: 655653.8 ymax: 1220310
## Projected CRS: OSGB36 / British National Grid
UK.sf<-st_transform(UK.sf, 4326) 

data <- read.csv("lks_paper_main_data.csv")
head(data)
##   citation_id cited_appln_id citing_appln_id cited_lat  cited_lon
## 1           1       21292583       338651101  52.57612 -1.9490919
## 2           2       21292591        21370108  52.62726 -0.5079481
## 3           3       21292591        21421412  52.62726 -0.5079481
## 4           4       21292591        21454772  52.62726 -0.5079481
## 5           5       21292609        21349235  51.39139 -0.0255888
## 6           6       21292609        21447292  51.39139 -0.0255888
##   cited_ipc_class cited_date_filed cited_granted citing_appln control_appln_id
## 1            F24F       2000-01-05             0            1        338651101
## 2            A47K       2000-01-05             0            1         21370108
## 3            A47K       2000-01-05             0            1         21421412
## 4            A47K       2000-01-05             0            1         21454772
## 5            B65G       2000-03-14             0            1         21349235
## 6            B65G       2000-03-14             0            1         21447292
##   control_lat control_lon control_ipc_class control_date_filed control_granted
## 1    50.94896  -0.1324262              F24F         2011-09-01               0
## 2    50.91302  -3.4681057              A47K         2002-02-05               1
## 3    53.74040  -0.4106374              A47K         2003-07-05               0
## 4    57.13030  -2.1601731              A47K         2004-08-28               0
## 5    52.92302  -1.2016132              B65G         2001-06-19               0
## 6    54.00640  -2.2269800              E04F         2004-05-25               1
##   distance inter_class_sample intra_class_sample X11_digit_controls_sample
## 1 219.9102                  0                  1                         1
## 2 278.9237                  0                  1                         1
## 3 123.9459                  0                  1                         1
## 4 511.7086                  0                  1                         1
## 5 188.2525                  0                  1                         0
## 6 326.3890                  1                  0                         1
##   early_sample late_sample granted_sample no_citations_for_class
## 1            0           1              0                    182
## 2            0           1              0                    402
## 3            0           1              0                    402
## 4            0           1              0                    402
## 5            0           1              0                    189
## 6            0           1              0                    189
##   has_us_filing_sample has_no_us_filing_sample
## 1                    0                       0
## 2                    0                       1
## 3                    0                       1
## 4                    0                       1
## 5                    0                       1
## 6                    0                       1

Data Preprocessing

grouped_data <- data %>% 
  group_by(cited_appln_id)

patents <- grouped_data %>%
  summarise(
    lat = first(cited_lat),
    lon = first(cited_lon),
    class = first(cited_ipc_class),
    date = first(cited_date_filed)
  )
patents <- as.data.frame(patents)

patents$general_class <- substr(patents$class, 1, 3)

data.sf <- st_as_sf(patents, coords = c("lon", "lat"), 
                    crs = "+proj=longlat +datum=NAD83")

data.sf$date <- as.Date(data.sf$date)

data.sf$ClassCode <- substr(data.sf$class, 1, 1)

# Define IPC subclasses for high-tech and biotechnology
high_tech_classes <- c("B41J", "G06C", "G06D", "G06E", "G11C", "G06Q","G06G","G06J","G06F","G06M","B64B","B64C","B64D","B64F","B64G","C40B","C12P","C12Q","H01S","H01L","H04B","H04H","H04J","H04K","H04L","H04M","H04N","H04Q","H04R","H04S","A01H", "A01H","A61K","C02F","C40B","C12N","C12P","C12Q","C12S","G01N","G01N")

# Classification function to classify each IPC class
classify_ipc <- function(ipc_class) {
   if (ipc_class %in% high_tech_classes) {
     return("High-Tech")
   } else {
     return("Low-Tech")
   }
 }

# Apply the classification function to your dataframe
data.sf <- data.sf %>%
   mutate(
     HighTech_LowTech = sapply(class, classify_ipc),
   )

table(data.sf$HighTech_LowTech)
## 
## High-Tech  Low-Tech 
##      2513     24463
# Summarize the data to get counts for each category

category_counts <- data.sf %>%
  group_by(HighTech_LowTech) %>%
  summarise(Count = n(), .groups = 'drop')

Data Visualization

# Create pie chart
pie_chart <- ggplot(category_counts, aes(x = "", y = Count, fill = HighTech_LowTech)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +  # Use polar coordinates to make it a pie chart
  scale_fill_viridis_d(option = "D", direction = -1) +
  labs(fill = "Category",
       x = NULL,
       y = NULL) +
  theme_void() +  # Remove extra chart elements
  theme(legend.title = element_text(size = 12),
        legend.text = element_text(size = 10))

pie_chart

# Summarize data by year and HighTech_BioTech category
trends_over_time <- data.sf %>%
  group_by(Year = year(date), HighTech_LowTech) %>%
  summarise(Count = n(), .groups = 'drop')

# Print the summary to check the result
print(head(trends_over_time))
## Simple feature collection with 6 features and 3 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: -7.64221 ymin: 50.1926 xmax: 1.721395 ymax: 57.4622
## Geodetic CRS:  +proj=longlat +datum=NAD83
## # A tibble: 6 × 4
##    Year HighTech_LowTech Count                                          geometry
##   <dbl> <chr>            <int>                                    <GEOMETRY [°]>
## 1  1980 Low-Tech             1                         POINT (-2.52231 53.75211)
## 2  1982 High-Tech           71 MULTIPOINT ((0.11265 52.20148), (0.0672778 52.25…
## 3  1982 Low-Tech          1024 MULTIPOINT ((1.37612 52.72473), (1.017839 52.955…
## 4  1983 High-Tech           81 MULTIPOINT ((0.4598534 52.07989), (0.14046 52.21…
## 5  1983 Low-Tech          1089 MULTIPOINT ((1.38051 52.82577), (1.11339 52.7630…
## 6  1984 High-Tech           70 MULTIPOINT ((0.3949892 52.25248), (0.1443644 52.…
# Define custom colors for the categories
custom_colors <- c("Low-Tech" = "#440154FF", "High-Tech" = "#20A486FF")  # Replace with your desired colors

# Create the line chart with custom colors and larger axis labels
line_chart <- ggplot(trends_over_time, aes(x = Year, y = Count, color = HighTech_LowTech)) +
  geom_line(size = 1) +  # Draw lines with increased thickness
  geom_point(size = 2) +  # Add points at data locations with increased size
  scale_color_manual(values = custom_colors) +  # Apply custom colors
  labs(x = "Year",
       y = "Count of Patents",
       color = "Category") +
  theme_minimal() +  # Use a minimal theme
  theme(
    legend.title = element_text(size = 22), 
    legend.text = element_text(size = 20),
    axis.title.x = element_text(size = 22),
    axis.title.y = element_text(size = 22),  
    axis.text.x = element_text(size = 12),   
    axis.text.y = element_text(size = 12)    
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the line chart
print(line_chart)

# Summarize data by year and HighTech_BioTech category, calculate total patents per year
yearly_data <- data.sf %>%
  mutate(Year = year(date)) %>%
  group_by(Year, HighTech_LowTech) %>%
  summarise(Count = n(), .groups = 'drop') %>%
  group_by(Year) %>%
  mutate(Total = sum(Count), Proportion = Count / Total)

# Filtering to only include High-Tech
filtered_data <- yearly_data %>%
  filter(HighTech_LowTech %in% c("High-Tech"))

# Plotting the proportions
proportional_chart <- ggplot(filtered_data, aes(x = Year, y = Proportion, color = HighTech_LowTech)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = scales::percent) +  
  scale_color_manual(values = "#440154FF") +  
  labs(title = "",
       x = "Year",
       y = "Proportion (%)",
       color = "Category") +
  theme_minimal() +
  theme(legend.title = element_text(size = 12),
        legend.text = element_text(size = 10))

# Display the line chart
print(proportional_chart)

### Kernel Density Estimation of High-tech and Low-tech Patents

# Step 1: Filter data for High-Tech and Low-Tech
high_tech_points <- data.sf %>%
  filter(HighTech_LowTech == "High-Tech")


# Step 2: Extract coordinates from sf objects
high_tech_coords <- st_coordinates(high_tech_points) %>%
  as.data.frame() %>%
  rename(Longitude = X, Latitude = Y)


# Step 1: Filter data for High-Tech and Low-Tech
low_tech_points <- data.sf %>%
  filter(HighTech_LowTech == "Low-Tech")


# Step 2: Extract coordinates from sf objects
low_tech_coords <- st_coordinates(low_tech_points) %>%
  as.data.frame() %>%
  rename(Longitude = X, Latitude = Y)


ggplot() +
  geom_sf(data = UK.sf$geometry, size = 0.1) +
  stat_density_2d(data = high_tech_coords, aes(x = Longitude, y = Latitude, fill =
                                      after_stat(level), alpha = after_stat(level)),
                   geom = "polygon",
                   size = 0.1
  ) +
  scale_fill_viridis_c(option = "C", direction = -1, limits = c(0, 0.75)) +
  scale_alpha(range = c(0.75, 0.75), guide = 'none') +
  theme_minimal(base_size = 14) + # Adjust base font size for legibility
  theme(
    legend.position = "right",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    plot.title = element_text(face = "bold", size = 16),
    plot.background = element_rect(fill = "white", color = NA),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank()
    
  ) +
  guides(fill = guide_colorbar(title = "Density"))

ggplot() +
  geom_sf(data = UK.sf$geometry, size = 0.1) +
  stat_density_2d(data = low_tech_coords, aes(x = Longitude, y = Latitude, fill =
                                      after_stat(level), alpha = after_stat(level)),
                   geom = "polygon",
                   size = 0.1
  ) +
  scale_fill_viridis_c(option = "C", direction = -1, limits = c(0, 0.75)) +
  scale_alpha(range = c(0.75, 0.75), guide = 'none') +
  theme_minimal(base_size = 14) + # Adjust base font size for legibility
  theme(
    legend.position = "right",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    plot.title = element_text(face = "bold", size = 16),
    plot.background = element_rect(fill = "white", color = NA),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank()
    
  ) +
  guides(fill = guide_colorbar(title = "Density"))

Density-Based Spatial Clustering of Applications with Noise (DBSCAN)

DBSCAN clustering for 26976 objects.

Parameters: eps = 0.045, minPts = 5

The clustering contains 341 cluster(s) and 1799 noise points.

shapefile <- UK.sf

coords <- st_coordinates(data.sf$geometry)

coordinates <- data.frame(coords)

dbscan_result <- dbscan(coordinates, eps = 0.045, minPts = 5)

data.sf$cluster_label <- dbscan_result$cluster

# Filter out noise points (cluster label 0)
data.sf.clustered <- data.sf[data.sf$cluster_label != 0, ]

cluster_order <- names(sort(table(data.sf.clustered$cluster_label), decreasing = TRUE))

data.sf.clustered$cluster_label <- factor(data.sf.clustered$cluster_label, levels = cluster_order)

data.sf <- data.sf.clustered

data.sf <- data.sf %>%
  group_by(cluster_label) %>%
  mutate(cluster_size = n()) %>%
  ungroup()

ggplot() +
  # Draw the UK base map
  geom_sf(data = UK.sf, fill = "white", color = "gray70") +
  
  geom_sf(
    data = data.sf, 
    aes(color = cluster_size), 
    size = 0.25
  ) +
  
  scale_color_viridis_c(
    option = "viridis", 
    direction = -1,
    name = "Cluster Size"
   ) +
  
  theme_minimal()

library(dplyr)
library(sf)

cluster_summary <- data.sf %>%
  group_by(cluster_label) %>%
  summarise(
    cluster_geom = st_convex_hull(st_union(geometry)),
    points_in_cluster = n()
  ) %>%
  mutate(
    cluster_area = as.numeric(st_area(cluster_geom))
  )

cluster_summary$area_km2 <- cluster_summary$cluster_area / 1e6

Scatterplot of Cluster Size vs Area

library(ggplot2)

ggplot(cluster_summary, aes(x = points_in_cluster, y = area_km2)) +
  geom_point(size = 3, color = "#440154FF") +
  geom_text(aes(label = cluster_label), vjust = -0.5, size = 3.5) +  # Optional: label clusters
  labs(
    x = "Number of Points in Cluster",
    y = "Cluster Area (km²)"
  ) +
  theme_minimal(base_size = 14)

Scatterplot of Cluster Size vs Area (Outliers Removed)

# IQR filtering for outlier removal
filtered_data <- cluster_summary %>%
  filter(
    between(points_in_cluster,
            quantile(points_in_cluster, 0.25) - 1.5 * IQR(points_in_cluster),
            quantile(points_in_cluster, 0.75) + 1.5 * IQR(points_in_cluster)),
    between(area_km2,
            quantile(area_km2, 0.25) - 1.5 * IQR(area_km2),
            quantile(area_km2, 0.75) + 1.5 * IQR(area_km2))
  )

ggplot(filtered_data, aes(x = points_in_cluster, y = area_km2)) +
  geom_point(size = 3, color = "#440154FF") +
  geom_text(aes(label = cluster_label), vjust = -0.5, size = 3.5) +
  labs(
    x = "Number of Points in Cluster",
    y = "Cluster Area (km²)"
  ) +
  theme_minimal(base_size = 14)

# Ensure IsHighTech is added to data.sf
proportions <- data.sf %>%
  mutate(IsHighTech = ifelse(HighTech_LowTech == "High-Tech", 1, 0))

# Step 1: Calculate cluster sizes and proportions for all data
proportions <- proportions %>%
  group_by(cluster_label) %>%
  summarise(
    geometry = st_centroid(st_union(geometry)),  # Calculate cluster centroid
    TotalPoints = n(),                           # Total points in each cluster
    HighTechPoints = sum(IsHighTech),            # Count of High-Tech points
    ProportionHighTech = HighTechPoints / TotalPoints,  # High-Tech proportion
    .groups = 'drop'
  )
proportions
## Simple feature collection with 341 features and 4 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -7.322678 ymin: 50.11518 xmax: 1.721305 ymax: 57.46559
## Geodetic CRS:  +proj=longlat +datum=NAD83
## # A tibble: 341 × 5
##    cluster_label              geometry TotalPoints HighTechPoints
##    <fct>                   <POINT [°]>       <int>          <dbl>
##  1 3             (-0.2450684 51.53052)        8371           1216
##  2 9              (-2.500207 53.46553)        2691            135
##  3 1              (-2.020032 52.48576)        1902             62
##  4 4              (-1.346618 52.71753)        1432             86
##  5 13               (-1.57341 53.6281)        1181             41
##  6 15             (-1.245748 50.91359)         736            192
##  7 7              (-2.134572 51.86484)         436             23
##  8 21             (-2.598136 51.47312)         431             78
##  9 11              (-4.22014 55.84052)         344             23
## 10 17             (-3.149633 51.56316)         319             19
## # ℹ 331 more rows
## # ℹ 1 more variable: ProportionHighTech <dbl>
# Step 2: Create the plot without normalizing sizes
plot <- ggplot() +
  # Add the base map of the UK
  geom_sf(data = UK.sf, fill = "white", color = "gray", size = 0.25) +
  # Plot cluster centroids with colors for proportions and sizes for TotalPoints
  geom_sf(data = proportions, 
          aes(size = TotalPoints, color = ProportionHighTech), 
          alpha = 0.8, show.legend = TRUE) +
  scale_color_viridis_c(option="C", name = "High-Tech Proportion", labels = scales::percent, direction = -1) +
  scale_size_continuous(name = "Cluster Size") +
  labs(
    x = "Longitude", 
    y = "Latitude"
  ) +
  theme_minimal() +
  theme(
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    plot.title = element_text(face = "bold", size = 16)
  )

# Step 3: Display the plot
print(plot)

# Step 2: Create the plot without normalizing sizes
plot <- ggplot() +
  # Add the base map of the UK
  geom_sf(data = UK.sf, fill = "white", color = "gray", size = 0.25) +
  # Plot cluster centroids with colors for proportions and sizes for TotalPoints
  geom_sf(data = proportions, 
          aes(size = TotalPoints, color = ProportionHighTech), 
          alpha = 0.8, show.legend = TRUE) +
  scale_color_viridis_c(name = "High-Tech Proportion", labels = scales::percent, direction = -1) +
  scale_size_continuous(name = "Cluster Size") +
  labs(
    x = "Longitude", 
    y = "Latitude"
  ) +
  theme_minimal() +
  theme(
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    plot.title = element_text(face = "bold", size = 16)
  )

# Step 3: Display the plot
print(plot)

library(ggplot2)
library(viridis)
library(dplyr)

# Categorize cluster sizes
proportions <- proportions %>%
  mutate(ClusterSizeCategory = case_when(
    TotalPoints < 1000 ~ "Small",
    TotalPoints >= 1000 & TotalPoints < 3000 ~ "Medium",
    TotalPoints >= 3000 ~ "Big"
  ))

plot <- ggplot() +
  # Base UK map
  geom_sf(data = UK.sf, fill = "white", color = "gray", size = 0.25) +
  
  # Cluster centroids: size by category, color by high-tech proportion
  geom_sf(data = proportions, 
          aes(size = ClusterSizeCategory, color = ProportionHighTech), 
          alpha = 0.8, show.legend = TRUE) +
  
 scale_color_gradientn(
  colors = c("#3B4CC0", "#9E0142")


,
  name = "High-Tech Proportion",
  labels = scales::percent
)+
  scale_size_manual(
    name = "Cluster Size",
    values = c("Small" = 1, "Medium" = 3, "Big" = 5)
  ) +

  labs(x = "Longitude", y = "Latitude") +
  theme_minimal() +
  theme(
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    plot.title = element_text(face = "bold", size = 16)
  )

# Display the plot
print(plot)

Association Rule Mining

set.seed(123)

min_count <- min(table(data.sf$HighTech_LowTech))

# Random under-sampling
data.sf <- data.sf %>%
  group_by(HighTech_LowTech) %>%
  sample_n(min_count) %>%    # sample each group down to min_count
  ungroup()

table(data.sf$HighTech_LowTech)
## 
## High-Tech  Low-Tech 
##      2403      2403
data<-as.data.frame(data.sf) %>%
  mutate(ClassCode = case_when(
    ClassCode == "A" ~ "Human Necessities",
    ClassCode == "B" ~ "Transporting",
    ClassCode == "C" ~ "Chemistry; metallurgy",
    ClassCode == "D" ~ "Textiles; paper",
    ClassCode == "E" ~ "Fixed constructions",
    ClassCode == "F" ~ "Mechanical engineering",
    ClassCode == "G" ~ "Physics",
    ClassCode == "H" ~ "Electricity",
    # Add as many conditions as needed
    TRUE ~ ClassCode
  ))


# Create a new column with class and time period
data <- data %>%
  mutate(decade = case_when(
    format(date, "%Y") >= 1980 & format(date, "%Y") < 1985 ~ "t1",
    format(date, "%Y") >= 1985 & format(date, "%Y") < 1990 ~ "t2",
    format(date, "%Y") >= 1990 & format(date, "%Y") < 1995 ~ "t3",
    format(date, "%Y") >= 1995 & format(date, "%Y") < 2000 ~ "t4",
    format(date, "%Y") >= 2000 & format(date, "%Y") < 2005 ~ "t5",
    format(date, "%Y") >= 2005 & format(date, "%Y") < 2010 ~ "t6",
    format(date, "%Y") >= 2010 & format(date, "%Y") < 2015 ~ "t7",
    TRUE ~ "other"  # Add other conditions if necessary
  )) %>%
  mutate(class_time_period = paste0(class, "_", decade))

data <- data %>%
  mutate(class_time_period = paste0(class_time_period, "_", HighTech_LowTech))
rules <- as(rules.clean, "data.frame")

rules <- rules %>%
  separate(rules, into = c("lhs", "rhs"), sep = " => ", remove = FALSE)

# Step 1: Split LHS into separate columns and process RHS
processed_rules <- rules %>%
  filter(!is.na(lhs) & !is.na(rhs)) %>% # Exclude rows with missing rules
  mutate(
    lhs_split = str_split(lhs, ","), # Split LHS into individual technologies
    rhs_split = str_extract(rhs, "\\{(.*?)\\}") %>% # Extract RHS technologies
      str_remove_all("\\{|\\}"), # Remove braces from RHS
    rhs_class = str_extract(rhs_split, "^[A-Za-z0-9]+") # Extract class codes from RHS
  ) %>%
  unnest_wider(lhs_split, names_sep = "_") %>% # Split LHS into multiple columns (lhs_1, lhs_2, etc.)
  rename_with(~ gsub("lhs_split_", "lhs", .x)) %>% # Rename columns for better readability
  mutate(across(starts_with("lhs"), ~ str_remove_all(., "\\{|\\}"))) # Remove brackets from all LHS columns

# Step 2: View results
processed_rules
## # A tibble: 1,207 × 14
##    rules   lhs   rhs   support confidence coverage  lift count lhs1  lhs2  lhs3 
##    <chr>   <chr> <chr>   <dbl>      <dbl>    <dbl> <dbl> <int> <chr> <chr> <chr>
##  1 {B65D_… B65D… {F16…  0.0226      0.857   0.0264  37.9     6 B65D… H04N… <NA> 
##  2 {B65D_… B65D… {F16…  0.0226      0.857   0.0264  37.9     6 B65D… H04M… <NA> 
##  3 {B65D_… B65D… {F16…  0.0226      0.857   0.0264  37.9     6 B65D… H04M… <NA> 
##  4 {E06B_… E06B… {B60…  0.0226      1       0.0226  37.9     6 E06B… H04N… <NA> 
##  5 {H04M_… H04M… {F16…  0.0226      0.857   0.0264  37.9     6 H04M… H04N… H04N…
##  6 {H04M_… H04M… {F16…  0.0226      0.857   0.0264  37.9     6 H04M… H04N… H04N…
##  7 {H04M_… H04M… {F16…  0.0226      0.857   0.0264  37.9     6 H04M… H04M… H04N…
##  8 {H04M_… H04M… {E06…  0.0226      1       0.0226  37.9     6 H04M… H04M… H04N…
##  9 {A47K_… A47K… {B60…  0.0226      1       0.0226  37.9     6 A47K… E06B… H04M…
## 10 {E06B_… E06B… {B60…  0.0226      1       0.0226  37.9     6 E06B… H04M… H04M…
## # ℹ 1,197 more rows
## # ℹ 3 more variables: lhs4 <chr>, rhs_split <chr>, rhs_class <chr>
df_clean <- processed_rules %>%
  # 1) Extract "t4", "t6", etc. from LHS columns
  mutate(
    lhs1_period = str_extract(lhs1, "t\\d+"),
    lhs2_period = str_extract(lhs2, "t\\d+"),
    lhs3_period = str_extract(lhs3, "t\\d+"),
    lhs4_period = str_extract(lhs4, "t\\d+"),
    
    # 2) Extract "t4", "t6", etc. from the RHS column
    rhs_period  = str_extract(rhs_split, "t\\d+")
  ) %>%
  
  # 3) Convert them to numeric
  mutate(
    lhs1_num = as.integer(str_remove(lhs1_period, "t")),
    lhs2_num = as.integer(str_remove(lhs2_period, "t")),
    lhs3_num = as.integer(str_remove(lhs3_period, "t")),
    lhs4_num = as.integer(str_remove(lhs4_period, "t")),
    rhs_num  = as.integer(str_remove(rhs_period, "t"))
  )

df_clean <- df_clean %>%
  # We do rowwise so we can compute max across columns easily
  rowwise() %>%
  mutate(
    # Gather all LHS numeric periods into a vector
    lhs_periods = list(c(lhs1_num, lhs2_num, lhs3_num, lhs4_num)),
    # Compute the maximum LHS period (ignoring NA)
    max_lhs     = max(lhs_periods, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  
  # Keep rows where max_lhs <= rhs_num
  filter(max_lhs <= rhs_num | is.na(rhs_num))
df_expanded <- df_clean %>%
  # Remove braces if you still have them: {G06F_t2_High-Tech, H04N_t4_High-Tech}
  mutate(lhs_clean = str_remove_all(lhs, "[{}]")) %>%
  # Split by commas into multiple rows
  separate_rows(lhs_clean, sep = ",") %>%
  # Trim whitespace
  mutate(lhs_clean = str_trim(lhs_clean)) %>%
  
  # Possibly do the same for the RHS if it can contain multiple items
  mutate(rhs_clean = str_remove_all(rhs, "[{}]")) %>%
  separate_rows(rhs_clean, sep = ",") %>%
  mutate(rhs_clean = str_trim(rhs_clean))

pair_counts <- df_expanded %>%
  group_by(lhs_clean, rhs_clean) %>%
  summarise(freq = n(), lifts  = list(lift), .groups = "drop") %>%
  arrange(desc(freq))


pair_counts
## # A tibble: 376 × 4
##    lhs_clean         rhs_clean          freq lifts     
##    <chr>             <chr>             <int> <list>    
##  1 G06F_t5_High-Tech H04M_t5_High-Tech    26 <dbl [26]>
##  2 H04M_t4_High-Tech H04N_t5_High-Tech    20 <dbl [20]>
##  3 H04M_t5_High-Tech G06F_t5_High-Tech    19 <dbl [19]>
##  4 G01N_t2_High-Tech G06F_t5_High-Tech    13 <dbl [13]>
##  5 G01N_t2_High-Tech H04B_t4_High-Tech    13 <dbl [13]>
##  6 G01N_t2_High-Tech H04Q_t5_High-Tech    12 <dbl [12]>
##  7 H04B_t4_High-Tech H04M_t5_High-Tech    12 <dbl [12]>
##  8 H04M_t4_High-Tech G06F_t5_High-Tech    12 <dbl [12]>
##  9 H04M_t4_High-Tech H04M_t5_High-Tech    12 <dbl [12]>
## 10 G06F_t3_High-Tech H04M_t5_High-Tech    11 <dbl [11]>
## # ℹ 366 more rows
# 1. Pick top 10
top30 <- pair_counts %>%
  slice_max(order_by = freq, n = 30)

top30
## # A tibble: 30 × 4
##    lhs_clean         rhs_clean          freq lifts     
##    <chr>             <chr>             <int> <list>    
##  1 G06F_t5_High-Tech H04M_t5_High-Tech    26 <dbl [26]>
##  2 H04M_t4_High-Tech H04N_t5_High-Tech    20 <dbl [20]>
##  3 H04M_t5_High-Tech G06F_t5_High-Tech    19 <dbl [19]>
##  4 G01N_t2_High-Tech G06F_t5_High-Tech    13 <dbl [13]>
##  5 G01N_t2_High-Tech H04B_t4_High-Tech    13 <dbl [13]>
##  6 G01N_t2_High-Tech H04Q_t5_High-Tech    12 <dbl [12]>
##  7 H04B_t4_High-Tech H04M_t5_High-Tech    12 <dbl [12]>
##  8 H04M_t4_High-Tech G06F_t5_High-Tech    12 <dbl [12]>
##  9 H04M_t4_High-Tech H04M_t5_High-Tech    12 <dbl [12]>
## 10 G06F_t3_High-Tech H04M_t5_High-Tech    11 <dbl [11]>
## # ℹ 20 more rows
ggplot(top30, aes(x = reorder(paste(lhs_clean, "->", rhs_clean), freq), y = freq)) +
  # Use geom_col for a cleaner approach (same as geom_bar(stat="identity"))
  geom_col(fill = "#440154FF", width = 0.7) +
  
  # Flip coordinates to make it a horizontal bar chart
  coord_flip() +
  
  # Expand the y-scale so labels don't get clipped (if you add geom_text)
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  
  # Title and axis labels
  labs(
    title = "Top 30 Association Rules by Frequency",
    x = "Rule (LHS -> RHS)",
    y = "Frequency (Count)"
  ) +
  
  # A clean, minimal theme with some custom tweaks
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor = element_blank(),        # remove minor grid lines
    panel.grid.major.y = element_blank(),      # remove horizontal grid lines
    plot.title = element_text(face = "bold")   # make title bold
  )

Absolute differences of occurances (LHS-RHS)

# Step 1: Clean the data and prepare LHS and RHS
df_expanded <- df_clean %>%
  # Remove braces (e.g., "{G06F_t2_High-Tech, H04N_t4_High-Tech}")
  mutate(lhs_clean = str_remove_all(lhs, "[{}]")) %>%
  separate_rows(lhs_clean, sep = ",") %>%
  mutate(lhs_clean = str_trim(lhs_clean)) %>%
  mutate(lhs_clean = str_remove(lhs_clean, "_t\\d+")) %>%
  mutate(rhs_clean = str_remove_all(rhs, "[{}]")) %>%
  separate_rows(rhs_clean, sep = ",") %>%
  mutate(rhs_clean = str_trim(rhs_clean)) %>%
  mutate(rhs_clean = str_remove(rhs_clean, "_t\\d+"))

# Step 2: Group by pairs and create a list of lift values
pair_lift_values <- df_expanded %>%
  group_by(lhs_clean, rhs_clean) %>%
  summarise(
    freq = n(),  # Frequency of the pair
    lifts = list(lift),  # List of lift values for each pair
    .groups = "drop"
  )

#data_matrix

#HIGH-TECH LOW TECH MATRIX

data_matrix <- df_clean[, c("lhs1", "lhs2", "lhs3","lhs4","rhs_split","lift")]

# Step 1: Extract "High-Tech" or "Low-Tech" from all columns
data_clean <- data_matrix %>%
  mutate(
    across(starts_with("lhs"), ~str_extract(., "Low-Tech|High-Tech")),
    rhs_split = str_extract(rhs_split, "Low-Tech|High-Tech")
  )

data_labeled <- data_clean %>%
  rowwise() %>%
  mutate(
    mixed = any(c(lhs1, lhs2, lhs3, lhs4) == "High-Tech", na.rm = TRUE) & 
      any(c(lhs1, lhs2, lhs3, lhs4) == "Low-Tech", na.rm = TRUE)
  ) %>%
  ungroup()

data_processed <- data_labeled %>%
  filter(mixed != TRUE)

data_processed <- data_processed[c("lhs1","rhs_split")]

colnames(data_processed)[1] <- "lhs"

mixed_data <-  data_labeled %>%
  filter(mixed != FALSE)

# Select only the `rhs_split` column
mixed_data <- mixed_data %>%
  select(rhs_split) %>%
  mutate(lhs = "High-Tech") %>%  # Add a column `lhs` with value "High-Tech"
  bind_rows(
    mixed_data %>%
      select(rhs_split) %>%       # Duplicate the data
      mutate(lhs = "Low-Tech")    # Add a column `lhs` with value "Low-Tech"
  )


# Reorder columns
mixed_data <- mixed_data %>%
  select(lhs, rhs_split, everything())  # Specify the order of the columns


# Combine datasets by rows
merged_data <- rbind(data_processed, mixed_data)

# Count combinations of lhs and rhs_split
pair_counts <- merged_data %>%
  count(lhs, rhs_split) %>%
  pivot_wider(names_from = rhs_split, values_from = n, values_fill = 0)

pair_counts
## # A tibble: 2 × 3
##   lhs       `High-Tech` `Low-Tech`
##   <chr>           <int>      <int>
## 1 High-Tech         374         45
## 2 Low-Tech          100         31
# Format the matrix for easier interpretation
matrix_counts <- as.matrix(pair_counts[, -1])  # Exclude the lhs column
rownames(matrix_counts) <- pair_counts$lhs  # Set row names to lhs values


#MOST ASYMMETRIC PAIRS. 


# Create a new dataset with selected columns
cleaned_data <- df_clean %>%
  select(lhs1, lhs2, lhs3, lhs4, rhs_split, lift) %>%  # Select relevant columns
  mutate(
    lhs1 = str_remove(lhs1, "_t\\d+"),  # Remove time period (_tX) from lhs1
    lhs2 = str_remove(lhs2, "_t\\d+"),  # Remove time period (_tX) from lhs2
    lhs3 = str_remove(lhs3, "_t\\d+"),  # Remove time period (_tX) from lhs3
    lhs4 = str_remove(lhs4, "_t\\d+"),  # Remove time period (_tX) from lhs4
    rhs_split = str_remove(rhs_split, "_t\\d+")  # Remove time period (_tX) from rhs_split
    
  )

cleaned_data
## # A tibble: 456 × 6
##    lhs1           lhs2           lhs3           lhs4  rhs_split       lift
##    <chr>          <chr>          <chr>          <chr> <chr>          <dbl>
##  1 E06B_Low-Tech  G06F_High-Tech H04M_High-Tech <NA>  E05B_Low-Tech   33.1
##  2 E06B_Low-Tech  G01N_High-Tech G06F_High-Tech <NA>  E05B_Low-Tech   33.1
##  3 H04L_High-Tech H04M_High-Tech <NA>           <NA>  G06Q_High-Tech  32.4
##  4 A47K_Low-Tech  G06F_High-Tech <NA>           <NA>  G01N_High-Tech  29.4
##  5 A47K_Low-Tech  G06F_High-Tech <NA>           <NA>  G01N_High-Tech  29.4
##  6 G01N_High-Tech H04B_High-Tech <NA>           <NA>  A47K_Low-Tech   29.4
##  7 E06B_Low-Tech  H04B_High-Tech <NA>           <NA>  A47K_Low-Tech   29.4
##  8 A47K_Low-Tech  G01N_High-Tech H04Q_High-Tech <NA>  G01N_High-Tech  29.4
##  9 A47K_Low-Tech  G01N_High-Tech H04B_High-Tech <NA>  G01N_High-Tech  29.4
## 10 A47K_Low-Tech  G01N_High-Tech G06F_High-Tech <NA>  G01N_High-Tech  29.4
## # ℹ 446 more rows
# Step 1: Combine all LHS columns into one and count occurrences
lhs_occurrences <- cleaned_data %>%
  pivot_longer(cols = starts_with("lhs"), names_to = "lhs_position", values_to = "item") %>%
  filter(!is.na(item)) %>%
  group_by(item) %>%
  summarise(LHS_Count = n(), .groups = "drop")

# Step 2: Count occurrences in the RHS column
rhs_occurrences <- cleaned_data %>%
  group_by(rhs_split) %>%
  summarise(RHS_Count = n(), .groups = "drop") %>%
  rename(item = rhs_split)

# Step 3: Combine LHS and RHS counts and calculate the difference
item_differences <- lhs_occurrences %>%
  full_join(rhs_occurrences, by = "item") %>%
  mutate(
    LHS_Count = replace_na(LHS_Count, 0),  
    RHS_Count = replace_na(RHS_Count, 0), 
    Difference = LHS_Count - RHS_Count    # Calculate the difference
  ) %>%
  arrange(desc(Difference))  # Sort by difference


occurrence_plot <- ggplot(item_differences, aes(x = reorder(item, Difference), y = Difference)) +
  geom_bar(stat = "identity", fill = "#440154FF") +
  coord_flip() +  # Flip the coordinates for better readability
  labs(
    x = "Pairs",
    y = "Difference (LHS - RHS)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )

# Display the plot
print(occurrence_plot)

Relative Asymmetry of Technologies (LHS vs RHS)

Patents that are shown in the Left of the plot are catalysts patents and patents that are shown in the right are follower patents

library(dplyr)

relative_asymmetry <- lhs_occurrences %>%
  full_join(rhs_occurrences, by = "item") %>%
  mutate(
    LHS_Count = replace_na(LHS_Count, 0),
    RHS_Count = replace_na(RHS_Count, 0),
    Total = LHS_Count + RHS_Count,
    Asymmetry_Percent = if_else(Total > 0, (LHS_Count - RHS_Count) / Total * -100, NA_real_)
  ) %>%
  arrange(desc(Asymmetry_Percent))

library(ggplot2)

ggplot(relative_asymmetry, aes(x = reorder(item, Asymmetry_Percent), y = Asymmetry_Percent)) +
  geom_bar(stat = "identity", fill = "#440154FF") +
  coord_flip() +
  labs(
    x = "Technology",
    y = "Asymmetry (%)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )

merged_data
## # A tibble: 550 × 2
##    lhs       rhs_split
##    <chr>     <chr>    
##  1 High-Tech High-Tech
##  2 High-Tech Low-Tech 
##  3 High-Tech High-Tech
##  4 High-Tech High-Tech
##  5 High-Tech High-Tech
##  6 High-Tech High-Tech
##  7 High-Tech High-Tech
##  8 Low-Tech  Low-Tech 
##  9 Low-Tech  High-Tech
## 10 Low-Tech  Low-Tech 
## # ℹ 540 more rows
# For Loop for counting occurences of High-Tech Low-Tech Pairs
count <- 0

for (i in 1:nrow(merged_data)) {
  if (merged_data$lhs[i] == "Low-Tech" && merged_data$rhs_split[i] == "High-Tech") {
    count <- count + 1
  }
}

count
## [1] 100

High-Tech Low-Tech Matrix

# Filter only relevant rows
filtered_data <- merged_data[merged_data$lhs %in% c("High-Tech", "Low-Tech") & 
                             merged_data$rhs_split %in% c("High-Tech", "Low-Tech"), ]

# Create a table of combinations
comb_counts <- table(filtered_data$lhs, filtered_data$rhs_split)

print(comb_counts)
##            
##             High-Tech Low-Tech
##   High-Tech       374       45
##   Low-Tech        100       31

Heatmap of Patent interactions

# Step 1: Combine all LHS columns into one and pair with RHS
item_pairs <- cleaned_data %>%
  # Combine all LHS columns into a single column
  pivot_longer(cols = starts_with("lhs"), names_to = "lhs_position", values_to = "lhs_item") %>%
  filter(!is.na(lhs_item)) %>%  # Remove NA values
  
  # Group by LHS-RHS pairs and summarize counts and lifts
  group_by(lhs_item, rhs_split) %>%
  summarise(
    Pair_Count = n(),  # Count occurrences
    Lifts = list(lift),  # Collect lift values into a list
    .groups = "drop"
  ) %>%
  
  # Sort by Pair_Count in descending order
  arrange(desc(Pair_Count))

# Generate a heatmap from the item_pairs data
heatmap_plot <- ggplot(item_pairs, aes(x = rhs_split, y = lhs_item, fill = Pair_Count)) +
  geom_tile() +
  scale_fill_viridis_c(name = "Pair Count", direction = -1) +  # Use a Viridis color scale for better visibility
  labs(
    x = "RHS",
    y = "LHS"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels for readability
    axis.text.y = element_text(size = 10),
    legend.position = "right",
    plot.title = element_text(face = "bold", size = 16)
  )

# Display the heatmap
print(heatmap_plot)

Network graph of Patent interactions

library(grid)

#set.seed(111)

#Network graph of patents over 20 observation

graph_data <- item_pairs %>%
  filter(Pair_Count > 20) %>%
  select(from = lhs_item, to = rhs_split, weight = Pair_Count)

# Step 2: Create a tidygraph object
network <- tbl_graph(edges = graph_data, directed = TRUE)

ggraph(network, layout = "fr") +
  geom_edge_fan(
    aes(width = weight, color = weight),
    alpha = 0.8,
    arrow = arrow(length = unit(3, 'mm'), type = "closed"),  # Add this
    end_cap = circle(3, 'mm')  # Optional: makes space at node ends
  ) +
  geom_node_point(size = 5, color = "#440154FF") +
  geom_node_text(aes(label = name), repel = TRUE, size = 4, fontface = "bold") +
  scale_edge_width(range = c(0.5, 3)) +
  scale_edge_color_viridis(option = "D", direction = -1, name = "Pair Count") +
  theme_void()
## Warning: `guide_colourbar()` cannot be used for edge_colour.
## ℹ Use one of colour, color, or fill instead.

#Network graph of patents over 10 observation

graph_data <- item_pairs %>%
  filter(Pair_Count > 10) %>%
  select(from = lhs_item, to = rhs_split, weight = Pair_Count)

# Step 2: Create a tidygraph object
network <- tbl_graph(edges = graph_data, directed = TRUE)

ggraph(network, layout = "fr") +
  geom_edge_fan(
    aes(width = weight, color = weight),
    alpha = 0.8,
    arrow = arrow(length = unit(3, 'mm'), type = "closed"),  # Add this
    end_cap = circle(3, 'mm')  # Optional: makes space at node ends
  ) +
  geom_node_point(size = 5, color = "#440154FF") +
  geom_node_text(aes(label = name), repel = TRUE, size = 4, fontface = "bold") +
  scale_edge_width(range = c(0.5, 3)) +
  scale_edge_color_viridis(option = "D", direction = -1, name = "Pair Count") +
  theme_void()
## Warning: `guide_colourbar()` cannot be used for edge_colour.
## ℹ Use one of colour, color, or fill instead.