We use a dataset consists of detailed inventor addresses and technological classifications from the UKIPO for UK patent applications with citation and inventor data from the EPO’s PATSTAT database (MacDonald & Salmanovic, 2023). The main dataset consists of 1’517’797 patents citation information between the years 1980 - 2015. For analysing patents, we grouped data by cited patents and obtained 26’976 original patents. The number of patents decreased because we analysed only original patents (cited by others). Using original patents provides more valuable insights because they are the most impactful ones. Besides date information, we used patent types to analyse the behaviours of different technologies. In this dataset, there are 8 main categories and 313 subcategories. In this research, we focused on differences between high-tech on low-tech patents. For this classification we used EUROSTAT documents for patent classification which has three categories, high-tech, low-tech and biotechnology. We conducted some data preprocessing to obtain meaningful results. We observed that biotechnology class has very few observations in the dataset and decided to drop this class. In addition to technology classification, we added time period information to our dataset ranging from t_1 to t_7 which represents 5 years periods between 1980-2015. Adding these time periods ensured that our results are timely correct.
UK.sf<-st_read("Counties_and_Unitary_Authorities_(December_2022)_UK_BUC.shp")
## Reading layer `Counties_and_Unitary_Authorities_(December_2022)_UK_BUC' from data source `/Users/zehrausta/Desktop/Patent_Spatial_Analysis/Counties_and_Unitary_Authorities_(December_2022)_UK_BUC.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 217 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -116.1928 ymin: 7054.1 xmax: 655653.8 ymax: 1220310
## Projected CRS: OSGB36 / British National Grid
UK.sf<-st_transform(UK.sf, 4326)
data <- read.csv("lks_paper_main_data.csv")
head(data)
## citation_id cited_appln_id citing_appln_id cited_lat cited_lon
## 1 1 21292583 338651101 52.57612 -1.9490919
## 2 2 21292591 21370108 52.62726 -0.5079481
## 3 3 21292591 21421412 52.62726 -0.5079481
## 4 4 21292591 21454772 52.62726 -0.5079481
## 5 5 21292609 21349235 51.39139 -0.0255888
## 6 6 21292609 21447292 51.39139 -0.0255888
## cited_ipc_class cited_date_filed cited_granted citing_appln control_appln_id
## 1 F24F 2000-01-05 0 1 338651101
## 2 A47K 2000-01-05 0 1 21370108
## 3 A47K 2000-01-05 0 1 21421412
## 4 A47K 2000-01-05 0 1 21454772
## 5 B65G 2000-03-14 0 1 21349235
## 6 B65G 2000-03-14 0 1 21447292
## control_lat control_lon control_ipc_class control_date_filed control_granted
## 1 50.94896 -0.1324262 F24F 2011-09-01 0
## 2 50.91302 -3.4681057 A47K 2002-02-05 1
## 3 53.74040 -0.4106374 A47K 2003-07-05 0
## 4 57.13030 -2.1601731 A47K 2004-08-28 0
## 5 52.92302 -1.2016132 B65G 2001-06-19 0
## 6 54.00640 -2.2269800 E04F 2004-05-25 1
## distance inter_class_sample intra_class_sample X11_digit_controls_sample
## 1 219.9102 0 1 1
## 2 278.9237 0 1 1
## 3 123.9459 0 1 1
## 4 511.7086 0 1 1
## 5 188.2525 0 1 0
## 6 326.3890 1 0 1
## early_sample late_sample granted_sample no_citations_for_class
## 1 0 1 0 182
## 2 0 1 0 402
## 3 0 1 0 402
## 4 0 1 0 402
## 5 0 1 0 189
## 6 0 1 0 189
## has_us_filing_sample has_no_us_filing_sample
## 1 0 0
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
grouped_data <- data %>%
group_by(cited_appln_id)
patents <- grouped_data %>%
summarise(
lat = first(cited_lat),
lon = first(cited_lon),
class = first(cited_ipc_class),
date = first(cited_date_filed)
)
patents <- as.data.frame(patents)
patents$general_class <- substr(patents$class, 1, 3)
data.sf <- st_as_sf(patents, coords = c("lon", "lat"),
crs = "+proj=longlat +datum=NAD83")
data.sf$date <- as.Date(data.sf$date)
data.sf$ClassCode <- substr(data.sf$class, 1, 1)
# Define IPC subclasses for high-tech and biotechnology
high_tech_classes <- c("B41J", "G06C", "G06D", "G06E", "G11C", "G06Q","G06G","G06J","G06F","G06M","B64B","B64C","B64D","B64F","B64G","C40B","C12P","C12Q","H01S","H01L","H04B","H04H","H04J","H04K","H04L","H04M","H04N","H04Q","H04R","H04S","A01H", "A01H","A61K","C02F","C40B","C12N","C12P","C12Q","C12S","G01N","G01N")
# Classification function to classify each IPC class
classify_ipc <- function(ipc_class) {
if (ipc_class %in% high_tech_classes) {
return("High-Tech")
} else {
return("Low-Tech")
}
}
# Apply the classification function to your dataframe
data.sf <- data.sf %>%
mutate(
HighTech_LowTech = sapply(class, classify_ipc),
)
table(data.sf$HighTech_LowTech)
##
## High-Tech Low-Tech
## 2513 24463
# Summarize the data to get counts for each category
category_counts <- data.sf %>%
group_by(HighTech_LowTech) %>%
summarise(Count = n(), .groups = 'drop')
# Create pie chart
pie_chart <- ggplot(category_counts, aes(x = "", y = Count, fill = HighTech_LowTech)) +
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") + # Use polar coordinates to make it a pie chart
scale_fill_viridis_d(option = "D", direction = -1) +
labs(fill = "Category",
x = NULL,
y = NULL) +
theme_void() + # Remove extra chart elements
theme(legend.title = element_text(size = 12),
legend.text = element_text(size = 10))
pie_chart
# Summarize data by year and HighTech_BioTech category
trends_over_time <- data.sf %>%
group_by(Year = year(date), HighTech_LowTech) %>%
summarise(Count = n(), .groups = 'drop')
# Print the summary to check the result
print(head(trends_over_time))
## Simple feature collection with 6 features and 3 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: -7.64221 ymin: 50.1926 xmax: 1.721395 ymax: 57.4622
## Geodetic CRS: +proj=longlat +datum=NAD83
## # A tibble: 6 × 4
## Year HighTech_LowTech Count geometry
## <dbl> <chr> <int> <GEOMETRY [°]>
## 1 1980 Low-Tech 1 POINT (-2.52231 53.75211)
## 2 1982 High-Tech 71 MULTIPOINT ((0.11265 52.20148), (0.0672778 52.25…
## 3 1982 Low-Tech 1024 MULTIPOINT ((1.37612 52.72473), (1.017839 52.955…
## 4 1983 High-Tech 81 MULTIPOINT ((0.4598534 52.07989), (0.14046 52.21…
## 5 1983 Low-Tech 1089 MULTIPOINT ((1.38051 52.82577), (1.11339 52.7630…
## 6 1984 High-Tech 70 MULTIPOINT ((0.3949892 52.25248), (0.1443644 52.…
# Define custom colors for the categories
custom_colors <- c("Low-Tech" = "#440154FF", "High-Tech" = "#20A486FF") # Replace with your desired colors
# Create the line chart with custom colors and larger axis labels
line_chart <- ggplot(trends_over_time, aes(x = Year, y = Count, color = HighTech_LowTech)) +
geom_line(size = 1) + # Draw lines with increased thickness
geom_point(size = 2) + # Add points at data locations with increased size
scale_color_manual(values = custom_colors) + # Apply custom colors
labs(x = "Year",
y = "Count of Patents",
color = "Category") +
theme_minimal() + # Use a minimal theme
theme(
legend.title = element_text(size = 22),
legend.text = element_text(size = 20),
axis.title.x = element_text(size = 22),
axis.title.y = element_text(size = 22),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the line chart
print(line_chart)
# Summarize data by year and HighTech_BioTech category, calculate total patents per year
yearly_data <- data.sf %>%
mutate(Year = year(date)) %>%
group_by(Year, HighTech_LowTech) %>%
summarise(Count = n(), .groups = 'drop') %>%
group_by(Year) %>%
mutate(Total = sum(Count), Proportion = Count / Total)
# Filtering to only include High-Tech
filtered_data <- yearly_data %>%
filter(HighTech_LowTech %in% c("High-Tech"))
# Plotting the proportions
proportional_chart <- ggplot(filtered_data, aes(x = Year, y = Proportion, color = HighTech_LowTech)) +
geom_line() +
geom_point() +
scale_y_continuous(labels = scales::percent) +
scale_color_manual(values = "#440154FF") +
labs(title = "",
x = "Year",
y = "Proportion (%)",
color = "Category") +
theme_minimal() +
theme(legend.title = element_text(size = 12),
legend.text = element_text(size = 10))
# Display the line chart
print(proportional_chart)
### Kernel Density Estimation of High-tech and Low-tech Patents
# Step 1: Filter data for High-Tech and Low-Tech
high_tech_points <- data.sf %>%
filter(HighTech_LowTech == "High-Tech")
# Step 2: Extract coordinates from sf objects
high_tech_coords <- st_coordinates(high_tech_points) %>%
as.data.frame() %>%
rename(Longitude = X, Latitude = Y)
# Step 1: Filter data for High-Tech and Low-Tech
low_tech_points <- data.sf %>%
filter(HighTech_LowTech == "Low-Tech")
# Step 2: Extract coordinates from sf objects
low_tech_coords <- st_coordinates(low_tech_points) %>%
as.data.frame() %>%
rename(Longitude = X, Latitude = Y)
ggplot() +
geom_sf(data = UK.sf$geometry, size = 0.1) +
stat_density_2d(data = high_tech_coords, aes(x = Longitude, y = Latitude, fill =
after_stat(level), alpha = after_stat(level)),
geom = "polygon",
size = 0.1
) +
scale_fill_viridis_c(option = "C", direction = -1, limits = c(0, 0.75)) +
scale_alpha(range = c(0.75, 0.75), guide = 'none') +
theme_minimal(base_size = 14) + # Adjust base font size for legibility
theme(
legend.position = "right",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 16),
plot.background = element_rect(fill = "white", color = NA),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank()
) +
guides(fill = guide_colorbar(title = "Density"))
ggplot() +
geom_sf(data = UK.sf$geometry, size = 0.1) +
stat_density_2d(data = low_tech_coords, aes(x = Longitude, y = Latitude, fill =
after_stat(level), alpha = after_stat(level)),
geom = "polygon",
size = 0.1
) +
scale_fill_viridis_c(option = "C", direction = -1, limits = c(0, 0.75)) +
scale_alpha(range = c(0.75, 0.75), guide = 'none') +
theme_minimal(base_size = 14) + # Adjust base font size for legibility
theme(
legend.position = "right",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 16),
plot.background = element_rect(fill = "white", color = NA),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank()
) +
guides(fill = guide_colorbar(title = "Density"))
DBSCAN clustering for 26976 objects.
Parameters: eps = 0.045, minPts = 5
The clustering contains 341 cluster(s) and 1799 noise points.
shapefile <- UK.sf
coords <- st_coordinates(data.sf$geometry)
coordinates <- data.frame(coords)
dbscan_result <- dbscan(coordinates, eps = 0.045, minPts = 5)
data.sf$cluster_label <- dbscan_result$cluster
# Filter out noise points (cluster label 0)
data.sf.clustered <- data.sf[data.sf$cluster_label != 0, ]
cluster_order <- names(sort(table(data.sf.clustered$cluster_label), decreasing = TRUE))
data.sf.clustered$cluster_label <- factor(data.sf.clustered$cluster_label, levels = cluster_order)
data.sf <- data.sf.clustered
data.sf <- data.sf %>%
group_by(cluster_label) %>%
mutate(cluster_size = n()) %>%
ungroup()
ggplot() +
# Draw the UK base map
geom_sf(data = UK.sf, fill = "white", color = "gray70") +
geom_sf(
data = data.sf,
aes(color = cluster_size),
size = 0.25
) +
scale_color_viridis_c(
option = "viridis",
direction = -1,
name = "Cluster Size"
) +
theme_minimal()
library(dplyr)
library(sf)
cluster_summary <- data.sf %>%
group_by(cluster_label) %>%
summarise(
cluster_geom = st_convex_hull(st_union(geometry)),
points_in_cluster = n()
) %>%
mutate(
cluster_area = as.numeric(st_area(cluster_geom))
)
cluster_summary$area_km2 <- cluster_summary$cluster_area / 1e6
Scatterplot of Cluster Size vs Area
library(ggplot2)
ggplot(cluster_summary, aes(x = points_in_cluster, y = area_km2)) +
geom_point(size = 3, color = "#440154FF") +
geom_text(aes(label = cluster_label), vjust = -0.5, size = 3.5) + # Optional: label clusters
labs(
x = "Number of Points in Cluster",
y = "Cluster Area (km²)"
) +
theme_minimal(base_size = 14)
Scatterplot of Cluster Size vs Area (Outliers Removed)
# IQR filtering for outlier removal
filtered_data <- cluster_summary %>%
filter(
between(points_in_cluster,
quantile(points_in_cluster, 0.25) - 1.5 * IQR(points_in_cluster),
quantile(points_in_cluster, 0.75) + 1.5 * IQR(points_in_cluster)),
between(area_km2,
quantile(area_km2, 0.25) - 1.5 * IQR(area_km2),
quantile(area_km2, 0.75) + 1.5 * IQR(area_km2))
)
ggplot(filtered_data, aes(x = points_in_cluster, y = area_km2)) +
geom_point(size = 3, color = "#440154FF") +
geom_text(aes(label = cluster_label), vjust = -0.5, size = 3.5) +
labs(
x = "Number of Points in Cluster",
y = "Cluster Area (km²)"
) +
theme_minimal(base_size = 14)
# Ensure IsHighTech is added to data.sf
proportions <- data.sf %>%
mutate(IsHighTech = ifelse(HighTech_LowTech == "High-Tech", 1, 0))
# Step 1: Calculate cluster sizes and proportions for all data
proportions <- proportions %>%
group_by(cluster_label) %>%
summarise(
geometry = st_centroid(st_union(geometry)), # Calculate cluster centroid
TotalPoints = n(), # Total points in each cluster
HighTechPoints = sum(IsHighTech), # Count of High-Tech points
ProportionHighTech = HighTechPoints / TotalPoints, # High-Tech proportion
.groups = 'drop'
)
proportions
## Simple feature collection with 341 features and 4 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -7.322678 ymin: 50.11518 xmax: 1.721305 ymax: 57.46559
## Geodetic CRS: +proj=longlat +datum=NAD83
## # A tibble: 341 × 5
## cluster_label geometry TotalPoints HighTechPoints
## <fct> <POINT [°]> <int> <dbl>
## 1 3 (-0.2450684 51.53052) 8371 1216
## 2 9 (-2.500207 53.46553) 2691 135
## 3 1 (-2.020032 52.48576) 1902 62
## 4 4 (-1.346618 52.71753) 1432 86
## 5 13 (-1.57341 53.6281) 1181 41
## 6 15 (-1.245748 50.91359) 736 192
## 7 7 (-2.134572 51.86484) 436 23
## 8 21 (-2.598136 51.47312) 431 78
## 9 11 (-4.22014 55.84052) 344 23
## 10 17 (-3.149633 51.56316) 319 19
## # ℹ 331 more rows
## # ℹ 1 more variable: ProportionHighTech <dbl>
# Step 2: Create the plot without normalizing sizes
plot <- ggplot() +
# Add the base map of the UK
geom_sf(data = UK.sf, fill = "white", color = "gray", size = 0.25) +
# Plot cluster centroids with colors for proportions and sizes for TotalPoints
geom_sf(data = proportions,
aes(size = TotalPoints, color = ProportionHighTech),
alpha = 0.8, show.legend = TRUE) +
scale_color_viridis_c(option="C", name = "High-Tech Proportion", labels = scales::percent, direction = -1) +
scale_size_continuous(name = "Cluster Size") +
labs(
x = "Longitude",
y = "Latitude"
) +
theme_minimal() +
theme(
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 16)
)
# Step 3: Display the plot
print(plot)
# Step 2: Create the plot without normalizing sizes
plot <- ggplot() +
# Add the base map of the UK
geom_sf(data = UK.sf, fill = "white", color = "gray", size = 0.25) +
# Plot cluster centroids with colors for proportions and sizes for TotalPoints
geom_sf(data = proportions,
aes(size = TotalPoints, color = ProportionHighTech),
alpha = 0.8, show.legend = TRUE) +
scale_color_viridis_c(name = "High-Tech Proportion", labels = scales::percent, direction = -1) +
scale_size_continuous(name = "Cluster Size") +
labs(
x = "Longitude",
y = "Latitude"
) +
theme_minimal() +
theme(
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 16)
)
# Step 3: Display the plot
print(plot)
library(ggplot2)
library(viridis)
library(dplyr)
# Categorize cluster sizes
proportions <- proportions %>%
mutate(ClusterSizeCategory = case_when(
TotalPoints < 1000 ~ "Small",
TotalPoints >= 1000 & TotalPoints < 3000 ~ "Medium",
TotalPoints >= 3000 ~ "Big"
))
plot <- ggplot() +
# Base UK map
geom_sf(data = UK.sf, fill = "white", color = "gray", size = 0.25) +
# Cluster centroids: size by category, color by high-tech proportion
geom_sf(data = proportions,
aes(size = ClusterSizeCategory, color = ProportionHighTech),
alpha = 0.8, show.legend = TRUE) +
scale_color_gradientn(
colors = c("#3B4CC0", "#9E0142")
,
name = "High-Tech Proportion",
labels = scales::percent
)+
scale_size_manual(
name = "Cluster Size",
values = c("Small" = 1, "Medium" = 3, "Big" = 5)
) +
labs(x = "Longitude", y = "Latitude") +
theme_minimal() +
theme(
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 16)
)
# Display the plot
print(plot)
set.seed(123)
min_count <- min(table(data.sf$HighTech_LowTech))
# Random under-sampling
data.sf <- data.sf %>%
group_by(HighTech_LowTech) %>%
sample_n(min_count) %>% # sample each group down to min_count
ungroup()
table(data.sf$HighTech_LowTech)
##
## High-Tech Low-Tech
## 2403 2403
data<-as.data.frame(data.sf) %>%
mutate(ClassCode = case_when(
ClassCode == "A" ~ "Human Necessities",
ClassCode == "B" ~ "Transporting",
ClassCode == "C" ~ "Chemistry; metallurgy",
ClassCode == "D" ~ "Textiles; paper",
ClassCode == "E" ~ "Fixed constructions",
ClassCode == "F" ~ "Mechanical engineering",
ClassCode == "G" ~ "Physics",
ClassCode == "H" ~ "Electricity",
# Add as many conditions as needed
TRUE ~ ClassCode
))
# Create a new column with class and time period
data <- data %>%
mutate(decade = case_when(
format(date, "%Y") >= 1980 & format(date, "%Y") < 1985 ~ "t1",
format(date, "%Y") >= 1985 & format(date, "%Y") < 1990 ~ "t2",
format(date, "%Y") >= 1990 & format(date, "%Y") < 1995 ~ "t3",
format(date, "%Y") >= 1995 & format(date, "%Y") < 2000 ~ "t4",
format(date, "%Y") >= 2000 & format(date, "%Y") < 2005 ~ "t5",
format(date, "%Y") >= 2005 & format(date, "%Y") < 2010 ~ "t6",
format(date, "%Y") >= 2010 & format(date, "%Y") < 2015 ~ "t7",
TRUE ~ "other" # Add other conditions if necessary
)) %>%
mutate(class_time_period = paste0(class, "_", decade))
data <- data %>%
mutate(class_time_period = paste0(class_time_period, "_", HighTech_LowTech))
rules <- as(rules.clean, "data.frame")
rules <- rules %>%
separate(rules, into = c("lhs", "rhs"), sep = " => ", remove = FALSE)
# Step 1: Split LHS into separate columns and process RHS
processed_rules <- rules %>%
filter(!is.na(lhs) & !is.na(rhs)) %>% # Exclude rows with missing rules
mutate(
lhs_split = str_split(lhs, ","), # Split LHS into individual technologies
rhs_split = str_extract(rhs, "\\{(.*?)\\}") %>% # Extract RHS technologies
str_remove_all("\\{|\\}"), # Remove braces from RHS
rhs_class = str_extract(rhs_split, "^[A-Za-z0-9]+") # Extract class codes from RHS
) %>%
unnest_wider(lhs_split, names_sep = "_") %>% # Split LHS into multiple columns (lhs_1, lhs_2, etc.)
rename_with(~ gsub("lhs_split_", "lhs", .x)) %>% # Rename columns for better readability
mutate(across(starts_with("lhs"), ~ str_remove_all(., "\\{|\\}"))) # Remove brackets from all LHS columns
# Step 2: View results
processed_rules
## # A tibble: 1,207 × 14
## rules lhs rhs support confidence coverage lift count lhs1 lhs2 lhs3
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int> <chr> <chr> <chr>
## 1 {B65D_… B65D… {F16… 0.0226 0.857 0.0264 37.9 6 B65D… H04N… <NA>
## 2 {B65D_… B65D… {F16… 0.0226 0.857 0.0264 37.9 6 B65D… H04M… <NA>
## 3 {B65D_… B65D… {F16… 0.0226 0.857 0.0264 37.9 6 B65D… H04M… <NA>
## 4 {E06B_… E06B… {B60… 0.0226 1 0.0226 37.9 6 E06B… H04N… <NA>
## 5 {H04M_… H04M… {F16… 0.0226 0.857 0.0264 37.9 6 H04M… H04N… H04N…
## 6 {H04M_… H04M… {F16… 0.0226 0.857 0.0264 37.9 6 H04M… H04N… H04N…
## 7 {H04M_… H04M… {F16… 0.0226 0.857 0.0264 37.9 6 H04M… H04M… H04N…
## 8 {H04M_… H04M… {E06… 0.0226 1 0.0226 37.9 6 H04M… H04M… H04N…
## 9 {A47K_… A47K… {B60… 0.0226 1 0.0226 37.9 6 A47K… E06B… H04M…
## 10 {E06B_… E06B… {B60… 0.0226 1 0.0226 37.9 6 E06B… H04M… H04M…
## # ℹ 1,197 more rows
## # ℹ 3 more variables: lhs4 <chr>, rhs_split <chr>, rhs_class <chr>
df_clean <- processed_rules %>%
# 1) Extract "t4", "t6", etc. from LHS columns
mutate(
lhs1_period = str_extract(lhs1, "t\\d+"),
lhs2_period = str_extract(lhs2, "t\\d+"),
lhs3_period = str_extract(lhs3, "t\\d+"),
lhs4_period = str_extract(lhs4, "t\\d+"),
# 2) Extract "t4", "t6", etc. from the RHS column
rhs_period = str_extract(rhs_split, "t\\d+")
) %>%
# 3) Convert them to numeric
mutate(
lhs1_num = as.integer(str_remove(lhs1_period, "t")),
lhs2_num = as.integer(str_remove(lhs2_period, "t")),
lhs3_num = as.integer(str_remove(lhs3_period, "t")),
lhs4_num = as.integer(str_remove(lhs4_period, "t")),
rhs_num = as.integer(str_remove(rhs_period, "t"))
)
df_clean <- df_clean %>%
# We do rowwise so we can compute max across columns easily
rowwise() %>%
mutate(
# Gather all LHS numeric periods into a vector
lhs_periods = list(c(lhs1_num, lhs2_num, lhs3_num, lhs4_num)),
# Compute the maximum LHS period (ignoring NA)
max_lhs = max(lhs_periods, na.rm = TRUE)
) %>%
ungroup() %>%
# Keep rows where max_lhs <= rhs_num
filter(max_lhs <= rhs_num | is.na(rhs_num))
df_expanded <- df_clean %>%
# Remove braces if you still have them: {G06F_t2_High-Tech, H04N_t4_High-Tech}
mutate(lhs_clean = str_remove_all(lhs, "[{}]")) %>%
# Split by commas into multiple rows
separate_rows(lhs_clean, sep = ",") %>%
# Trim whitespace
mutate(lhs_clean = str_trim(lhs_clean)) %>%
# Possibly do the same for the RHS if it can contain multiple items
mutate(rhs_clean = str_remove_all(rhs, "[{}]")) %>%
separate_rows(rhs_clean, sep = ",") %>%
mutate(rhs_clean = str_trim(rhs_clean))
pair_counts <- df_expanded %>%
group_by(lhs_clean, rhs_clean) %>%
summarise(freq = n(), lifts = list(lift), .groups = "drop") %>%
arrange(desc(freq))
pair_counts
## # A tibble: 376 × 4
## lhs_clean rhs_clean freq lifts
## <chr> <chr> <int> <list>
## 1 G06F_t5_High-Tech H04M_t5_High-Tech 26 <dbl [26]>
## 2 H04M_t4_High-Tech H04N_t5_High-Tech 20 <dbl [20]>
## 3 H04M_t5_High-Tech G06F_t5_High-Tech 19 <dbl [19]>
## 4 G01N_t2_High-Tech G06F_t5_High-Tech 13 <dbl [13]>
## 5 G01N_t2_High-Tech H04B_t4_High-Tech 13 <dbl [13]>
## 6 G01N_t2_High-Tech H04Q_t5_High-Tech 12 <dbl [12]>
## 7 H04B_t4_High-Tech H04M_t5_High-Tech 12 <dbl [12]>
## 8 H04M_t4_High-Tech G06F_t5_High-Tech 12 <dbl [12]>
## 9 H04M_t4_High-Tech H04M_t5_High-Tech 12 <dbl [12]>
## 10 G06F_t3_High-Tech H04M_t5_High-Tech 11 <dbl [11]>
## # ℹ 366 more rows
# 1. Pick top 10
top30 <- pair_counts %>%
slice_max(order_by = freq, n = 30)
top30
## # A tibble: 30 × 4
## lhs_clean rhs_clean freq lifts
## <chr> <chr> <int> <list>
## 1 G06F_t5_High-Tech H04M_t5_High-Tech 26 <dbl [26]>
## 2 H04M_t4_High-Tech H04N_t5_High-Tech 20 <dbl [20]>
## 3 H04M_t5_High-Tech G06F_t5_High-Tech 19 <dbl [19]>
## 4 G01N_t2_High-Tech G06F_t5_High-Tech 13 <dbl [13]>
## 5 G01N_t2_High-Tech H04B_t4_High-Tech 13 <dbl [13]>
## 6 G01N_t2_High-Tech H04Q_t5_High-Tech 12 <dbl [12]>
## 7 H04B_t4_High-Tech H04M_t5_High-Tech 12 <dbl [12]>
## 8 H04M_t4_High-Tech G06F_t5_High-Tech 12 <dbl [12]>
## 9 H04M_t4_High-Tech H04M_t5_High-Tech 12 <dbl [12]>
## 10 G06F_t3_High-Tech H04M_t5_High-Tech 11 <dbl [11]>
## # ℹ 20 more rows
ggplot(top30, aes(x = reorder(paste(lhs_clean, "->", rhs_clean), freq), y = freq)) +
# Use geom_col for a cleaner approach (same as geom_bar(stat="identity"))
geom_col(fill = "#440154FF", width = 0.7) +
# Flip coordinates to make it a horizontal bar chart
coord_flip() +
# Expand the y-scale so labels don't get clipped (if you add geom_text)
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
# Title and axis labels
labs(
title = "Top 30 Association Rules by Frequency",
x = "Rule (LHS -> RHS)",
y = "Frequency (Count)"
) +
# A clean, minimal theme with some custom tweaks
theme_minimal(base_size = 12) +
theme(
panel.grid.minor = element_blank(), # remove minor grid lines
panel.grid.major.y = element_blank(), # remove horizontal grid lines
plot.title = element_text(face = "bold") # make title bold
)
Absolute differences of occurances (LHS-RHS)
# Step 1: Clean the data and prepare LHS and RHS
df_expanded <- df_clean %>%
# Remove braces (e.g., "{G06F_t2_High-Tech, H04N_t4_High-Tech}")
mutate(lhs_clean = str_remove_all(lhs, "[{}]")) %>%
separate_rows(lhs_clean, sep = ",") %>%
mutate(lhs_clean = str_trim(lhs_clean)) %>%
mutate(lhs_clean = str_remove(lhs_clean, "_t\\d+")) %>%
mutate(rhs_clean = str_remove_all(rhs, "[{}]")) %>%
separate_rows(rhs_clean, sep = ",") %>%
mutate(rhs_clean = str_trim(rhs_clean)) %>%
mutate(rhs_clean = str_remove(rhs_clean, "_t\\d+"))
# Step 2: Group by pairs and create a list of lift values
pair_lift_values <- df_expanded %>%
group_by(lhs_clean, rhs_clean) %>%
summarise(
freq = n(), # Frequency of the pair
lifts = list(lift), # List of lift values for each pair
.groups = "drop"
)
#data_matrix
#HIGH-TECH LOW TECH MATRIX
data_matrix <- df_clean[, c("lhs1", "lhs2", "lhs3","lhs4","rhs_split","lift")]
# Step 1: Extract "High-Tech" or "Low-Tech" from all columns
data_clean <- data_matrix %>%
mutate(
across(starts_with("lhs"), ~str_extract(., "Low-Tech|High-Tech")),
rhs_split = str_extract(rhs_split, "Low-Tech|High-Tech")
)
data_labeled <- data_clean %>%
rowwise() %>%
mutate(
mixed = any(c(lhs1, lhs2, lhs3, lhs4) == "High-Tech", na.rm = TRUE) &
any(c(lhs1, lhs2, lhs3, lhs4) == "Low-Tech", na.rm = TRUE)
) %>%
ungroup()
data_processed <- data_labeled %>%
filter(mixed != TRUE)
data_processed <- data_processed[c("lhs1","rhs_split")]
colnames(data_processed)[1] <- "lhs"
mixed_data <- data_labeled %>%
filter(mixed != FALSE)
# Select only the `rhs_split` column
mixed_data <- mixed_data %>%
select(rhs_split) %>%
mutate(lhs = "High-Tech") %>% # Add a column `lhs` with value "High-Tech"
bind_rows(
mixed_data %>%
select(rhs_split) %>% # Duplicate the data
mutate(lhs = "Low-Tech") # Add a column `lhs` with value "Low-Tech"
)
# Reorder columns
mixed_data <- mixed_data %>%
select(lhs, rhs_split, everything()) # Specify the order of the columns
# Combine datasets by rows
merged_data <- rbind(data_processed, mixed_data)
# Count combinations of lhs and rhs_split
pair_counts <- merged_data %>%
count(lhs, rhs_split) %>%
pivot_wider(names_from = rhs_split, values_from = n, values_fill = 0)
pair_counts
## # A tibble: 2 × 3
## lhs `High-Tech` `Low-Tech`
## <chr> <int> <int>
## 1 High-Tech 374 45
## 2 Low-Tech 100 31
# Format the matrix for easier interpretation
matrix_counts <- as.matrix(pair_counts[, -1]) # Exclude the lhs column
rownames(matrix_counts) <- pair_counts$lhs # Set row names to lhs values
#MOST ASYMMETRIC PAIRS.
# Create a new dataset with selected columns
cleaned_data <- df_clean %>%
select(lhs1, lhs2, lhs3, lhs4, rhs_split, lift) %>% # Select relevant columns
mutate(
lhs1 = str_remove(lhs1, "_t\\d+"), # Remove time period (_tX) from lhs1
lhs2 = str_remove(lhs2, "_t\\d+"), # Remove time period (_tX) from lhs2
lhs3 = str_remove(lhs3, "_t\\d+"), # Remove time period (_tX) from lhs3
lhs4 = str_remove(lhs4, "_t\\d+"), # Remove time period (_tX) from lhs4
rhs_split = str_remove(rhs_split, "_t\\d+") # Remove time period (_tX) from rhs_split
)
cleaned_data
## # A tibble: 456 × 6
## lhs1 lhs2 lhs3 lhs4 rhs_split lift
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 E06B_Low-Tech G06F_High-Tech H04M_High-Tech <NA> E05B_Low-Tech 33.1
## 2 E06B_Low-Tech G01N_High-Tech G06F_High-Tech <NA> E05B_Low-Tech 33.1
## 3 H04L_High-Tech H04M_High-Tech <NA> <NA> G06Q_High-Tech 32.4
## 4 A47K_Low-Tech G06F_High-Tech <NA> <NA> G01N_High-Tech 29.4
## 5 A47K_Low-Tech G06F_High-Tech <NA> <NA> G01N_High-Tech 29.4
## 6 G01N_High-Tech H04B_High-Tech <NA> <NA> A47K_Low-Tech 29.4
## 7 E06B_Low-Tech H04B_High-Tech <NA> <NA> A47K_Low-Tech 29.4
## 8 A47K_Low-Tech G01N_High-Tech H04Q_High-Tech <NA> G01N_High-Tech 29.4
## 9 A47K_Low-Tech G01N_High-Tech H04B_High-Tech <NA> G01N_High-Tech 29.4
## 10 A47K_Low-Tech G01N_High-Tech G06F_High-Tech <NA> G01N_High-Tech 29.4
## # ℹ 446 more rows
# Step 1: Combine all LHS columns into one and count occurrences
lhs_occurrences <- cleaned_data %>%
pivot_longer(cols = starts_with("lhs"), names_to = "lhs_position", values_to = "item") %>%
filter(!is.na(item)) %>%
group_by(item) %>%
summarise(LHS_Count = n(), .groups = "drop")
# Step 2: Count occurrences in the RHS column
rhs_occurrences <- cleaned_data %>%
group_by(rhs_split) %>%
summarise(RHS_Count = n(), .groups = "drop") %>%
rename(item = rhs_split)
# Step 3: Combine LHS and RHS counts and calculate the difference
item_differences <- lhs_occurrences %>%
full_join(rhs_occurrences, by = "item") %>%
mutate(
LHS_Count = replace_na(LHS_Count, 0),
RHS_Count = replace_na(RHS_Count, 0),
Difference = LHS_Count - RHS_Count # Calculate the difference
) %>%
arrange(desc(Difference)) # Sort by difference
occurrence_plot <- ggplot(item_differences, aes(x = reorder(item, Difference), y = Difference)) +
geom_bar(stat = "identity", fill = "#440154FF") +
coord_flip() + # Flip the coordinates for better readability
labs(
x = "Pairs",
y = "Difference (LHS - RHS)"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14)
)
# Display the plot
print(occurrence_plot)
Relative Asymmetry of Technologies (LHS vs RHS)
Patents that are shown in the Left of the plot are catalysts patents and patents that are shown in the right are follower patents
library(dplyr)
relative_asymmetry <- lhs_occurrences %>%
full_join(rhs_occurrences, by = "item") %>%
mutate(
LHS_Count = replace_na(LHS_Count, 0),
RHS_Count = replace_na(RHS_Count, 0),
Total = LHS_Count + RHS_Count,
Asymmetry_Percent = if_else(Total > 0, (LHS_Count - RHS_Count) / Total * -100, NA_real_)
) %>%
arrange(desc(Asymmetry_Percent))
library(ggplot2)
ggplot(relative_asymmetry, aes(x = reorder(item, Asymmetry_Percent), y = Asymmetry_Percent)) +
geom_bar(stat = "identity", fill = "#440154FF") +
coord_flip() +
labs(
x = "Technology",
y = "Asymmetry (%)"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14)
)
merged_data
## # A tibble: 550 × 2
## lhs rhs_split
## <chr> <chr>
## 1 High-Tech High-Tech
## 2 High-Tech Low-Tech
## 3 High-Tech High-Tech
## 4 High-Tech High-Tech
## 5 High-Tech High-Tech
## 6 High-Tech High-Tech
## 7 High-Tech High-Tech
## 8 Low-Tech Low-Tech
## 9 Low-Tech High-Tech
## 10 Low-Tech Low-Tech
## # ℹ 540 more rows
# For Loop for counting occurences of High-Tech Low-Tech Pairs
count <- 0
for (i in 1:nrow(merged_data)) {
if (merged_data$lhs[i] == "Low-Tech" && merged_data$rhs_split[i] == "High-Tech") {
count <- count + 1
}
}
count
## [1] 100
High-Tech Low-Tech Matrix
# Filter only relevant rows
filtered_data <- merged_data[merged_data$lhs %in% c("High-Tech", "Low-Tech") &
merged_data$rhs_split %in% c("High-Tech", "Low-Tech"), ]
# Create a table of combinations
comb_counts <- table(filtered_data$lhs, filtered_data$rhs_split)
print(comb_counts)
##
## High-Tech Low-Tech
## High-Tech 374 45
## Low-Tech 100 31
Heatmap of Patent interactions
# Step 1: Combine all LHS columns into one and pair with RHS
item_pairs <- cleaned_data %>%
# Combine all LHS columns into a single column
pivot_longer(cols = starts_with("lhs"), names_to = "lhs_position", values_to = "lhs_item") %>%
filter(!is.na(lhs_item)) %>% # Remove NA values
# Group by LHS-RHS pairs and summarize counts and lifts
group_by(lhs_item, rhs_split) %>%
summarise(
Pair_Count = n(), # Count occurrences
Lifts = list(lift), # Collect lift values into a list
.groups = "drop"
) %>%
# Sort by Pair_Count in descending order
arrange(desc(Pair_Count))
# Generate a heatmap from the item_pairs data
heatmap_plot <- ggplot(item_pairs, aes(x = rhs_split, y = lhs_item, fill = Pair_Count)) +
geom_tile() +
scale_fill_viridis_c(name = "Pair Count", direction = -1) + # Use a Viridis color scale for better visibility
labs(
x = "RHS",
y = "LHS"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
axis.text.y = element_text(size = 10),
legend.position = "right",
plot.title = element_text(face = "bold", size = 16)
)
# Display the heatmap
print(heatmap_plot)
Network graph of Patent interactions
library(grid)
#set.seed(111)
#Network graph of patents over 20 observation
graph_data <- item_pairs %>%
filter(Pair_Count > 20) %>%
select(from = lhs_item, to = rhs_split, weight = Pair_Count)
# Step 2: Create a tidygraph object
network <- tbl_graph(edges = graph_data, directed = TRUE)
ggraph(network, layout = "fr") +
geom_edge_fan(
aes(width = weight, color = weight),
alpha = 0.8,
arrow = arrow(length = unit(3, 'mm'), type = "closed"), # Add this
end_cap = circle(3, 'mm') # Optional: makes space at node ends
) +
geom_node_point(size = 5, color = "#440154FF") +
geom_node_text(aes(label = name), repel = TRUE, size = 4, fontface = "bold") +
scale_edge_width(range = c(0.5, 3)) +
scale_edge_color_viridis(option = "D", direction = -1, name = "Pair Count") +
theme_void()
## Warning: `guide_colourbar()` cannot be used for edge_colour.
## ℹ Use one of colour, color, or fill instead.
#Network graph of patents over 10 observation
graph_data <- item_pairs %>%
filter(Pair_Count > 10) %>%
select(from = lhs_item, to = rhs_split, weight = Pair_Count)
# Step 2: Create a tidygraph object
network <- tbl_graph(edges = graph_data, directed = TRUE)
ggraph(network, layout = "fr") +
geom_edge_fan(
aes(width = weight, color = weight),
alpha = 0.8,
arrow = arrow(length = unit(3, 'mm'), type = "closed"), # Add this
end_cap = circle(3, 'mm') # Optional: makes space at node ends
) +
geom_node_point(size = 5, color = "#440154FF") +
geom_node_text(aes(label = name), repel = TRUE, size = 4, fontface = "bold") +
scale_edge_width(range = c(0.5, 3)) +
scale_edge_color_viridis(option = "D", direction = -1, name = "Pair Count") +
theme_void()
## Warning: `guide_colourbar()` cannot be used for edge_colour.
## ℹ Use one of colour, color, or fill instead.