# Load the aggregated indicator table (comma-separated, "." as decimal mark).
total <- read.csv(
  file = "~/Desktop/PKP/TotalAggregated.csv",
  header = TRUE,
  sep = ",",
  dec = "."
)

# Lookup table: region code as stored in the file -> display name.
region_name_mapping <- c(
  "EU" = "Europe",
  "US" = "North America",
  "JUS" = "South America",
  "ANZ" = "Oceania",
  "ASIA" = "Asia"
)

# The file names the region column "Country"; call it "Region" from here on.
names(total)[names(total) == "Country"] <- "Region"

# Translate each region code through the lookup. A code that is not a name in
# the lookup becomes NA.
total <- mutate(total, Region = region_name_mapping[Region])

# Calendar year of every observation, taken from its Date.
total$Year <- lubridate::year(total$Date)

# Collapse the data to one row per Region and Year (2010-2022) by summing every
# numeric column. Non-numeric columns other than the two keys are dropped.
#
# The former all-column aggregation (`aggregated_data`) has been removed. It
# grouped by every column except Region and Date, so its rows had no Region,
# came back from bind_rows() with Region = NA and were deleted again by the NA
# filter. Its only lasting effect was to leave `total_aggregated` grouped by
# all value columns.
# NOTE: the result is now an ungrouped tibble, so print()/head() no longer show
# a `# Groups:` header; the rows and columns are unchanged.
filtered_data <- total %>%
  filter(Year %in% 2010:2022)

# `.groups = "drop"` returns an ungrouped result and silences the summarise()
# regrouping message.
summed_data <- filtered_data %>%
  group_by(Region, Year) %>%
  dplyr::summarize(across(where(is.numeric), sum), .groups = "drop")

# Remove rows whose region code had no display name, order by region and year,
# and move the two key columns to the front.
total_aggregated <- summed_data %>%
  filter(!is.na(Region)) %>%
  arrange(Region, Year) %>%
  select(Year, Region, everything())

print(total_aggregated)
## # A tibble: 65 × 18
## # Groups:   Papers, Paper_Citations, Articles, Patents, Citations, AI_Projects,
## #   Projects, Funding, norm_Papers, norm_Paper_Citations, norm_Articles,
## #   norm_Patents, norm_Citations, norm_AI_Projects, norm_Projects, norm_Funding
## #   [65]
##     Year Region  Papers Paper_…¹ Artic…² Patents Citat…³ AI_Pr…⁴ Proje…⁵ Funding
##    <dbl> <chr>    <dbl>    <dbl>   <int>   <int>   <int>   <int>   <int>   <dbl>
##  1  2010 Asia   150193.  972822.       0   11578  127443     105       9  1.30e6
##  2  2011 Asia   154602. 1060488.       0   12055  116129     288       8  1.51e6
##  3  2012 Asia   145329. 1019775.       0   11677   97100     612       7  6.50e5
##  4  2013 Asia   144896. 1103366.       0   12777   94105    1342       6  6.57e5
##  5  2014 Asia   141779. 1203931.    3422   13023   83764    2846       4  9.52e5
##  6  2015 Asia   130611. 1339516.    7304   11537   61359    5935       5  7.10e5
##  7  2016 Asia   124234. 1432777.   33292    9110   35998   12397      20  7.09e5
##  8  2017 Asia   134931. 1629925.   63831    9034   25576   30076      11  3.41e6
##  9  2018 Asia   162308. 1818648.  129247   12084   20566   46888       9  1.55e6
## 10  2019 Asia   192570. 1888756.  281760   13356   10650   56660      15  1.96e6
## # … with 55 more rows, 8 more variables: norm_Papers <dbl>,
## #   norm_Paper_Citations <dbl>, norm_Articles <dbl>, norm_Patents <dbl>,
## #   norm_Citations <dbl>, norm_AI_Projects <dbl>, norm_Projects <dbl>,
## #   norm_Funding <dbl>, and abbreviated variable names ¹​Paper_Citations,
## #   ²​Articles, ³​Citations, ⁴​AI_Projects, ⁵​Projects
# Load the monthly patent time series and total the patent counts per country
# from January 2010 onwards, keeping the 20 countries with the most patents.
sum_patent <- read.csv("indicators_econ/Patents/CountryTimeSeries.csv",
                       header = TRUE, sep = ",", dec = ".")
sum_patent <- rename(sum_patent, Month = YearMonth,
                     Patents = X..Patents, Patents_norm = X..Patents..normalized.,
                     Citations = X..Citations, Citations_norm = X..Citations..normalized.)

# "-01" is appended so the year-month value parses as a full Date.
sum_patent$Date <- as.Date(paste0(sum_patent$Month, "-01"), format = "%Y-%m-%d")

# Entries excluded from the country ranking.
# NOTE(review): "#Type!" looks like a spreadsheet error marker -- confirm.
non_countries <- c("ALL", "G7", "G20", "OECD", "#Type!")

# TWN is counted under CHN. Relabelling before the sum gives the same total as
# adding the two per-country sums afterwards, and it also works when only one
# of the two codes is present in the data.
sum_patent$Country[sum_patent$Country %in% "TWN"] <- "CHN"

sum_patent <- sum_patent %>%
  # filter() drops rows with an NA Date instead of turning them into NA rows,
  # and `%in%` never yields NA for the exclusion test.
  filter(Date >= as.Date("2010-01-01"), !Country %in% non_countries) %>%
  group_by(Country) %>%
  dplyr::summarise(Sum_Patents = sum(Patents)) %>%
  arrange(desc(Sum_Patents))

# head() returns fewer rows when fewer than 20 countries exist, rather than
# padding the table with all-NA rows as `[1:20, ]` does.
sum_patent <- head(sum_patent, 20)

# GDP-per-capita values, one per row of `sum_patent` (20 values), listed in the
# current row order of that table and attached purely by position.
# NOTE(review): nothing ties a value to a country code, so any change in the
# patent ranking would silently misassign them -- confirm the order.
# NOTE(review): the column name `gpd_per_capita` looks like a typo of "gdp"; it
# is kept as-is because it is part of the `gdp` data frame.
gdp <- data.frame(
  gpd_per_capita = c(
    76398.6, 37149.6, 33815.3, 32254.6, 40963.8,
    12720.2, 48432.5, 45850.4, 34158.0, 92101.5,
    55873.2, 54966.5, 64491.4, 53757.9, 50536.6,
    2388.6, 55985.4, 54659.8, 82807.6, 48983.6
  )
)

# Add the values to the patent table as a new column.
sum_patent[["gdp_per_capita"]] <- gdp[["gpd_per_capita"]]

head(sum_patent)
## # A tibble: 6 × 3
##   Country Sum_Patents gdp_per_capita
##   <chr>         <int>          <dbl>
## 1 USA          180516         76399.
## 2 EU27          74500         37150.
## 3 JPN           62378         33815.
## 4 KOR           38667         32255.
## 5 FRA           32278         40964.
## 6 CHN           29863         12720.
# Yearly mean of norm_Papers over all regions; drawn below as the dashed red
# reference line.
average_by_year <- total_aggregated %>%
  group_by(Year) %>%
  dplyr::summarize(average_norm_Papers = mean(norm_Papers, na.rm = TRUE))

# Line-and-point chart of norm_Papers per region and year.
# `linewidth` replaces `size` for line geoms: ggplot2 3.4.0 deprecated `size`
# for lines, and using `linewidth` removes the deprecation warning this plot
# used to emit.
ggplot(total_aggregated, aes(x = Year, y = norm_Papers, color = Region, group = Region)) +
  geom_line(linewidth = 1) +
  # The constant colour and `group = 1` override the per-region mapping, so the
  # average is drawn as one line.
  geom_line(data = average_by_year, aes(x = Year, y = average_norm_Papers, group = 1),
            color = "red", linetype = "dashed") +
  geom_point(aes(shape = Region, col = Region), size = 3) +
  labs(title = "Number of Research Papers per million people\nfrom 2010-2022 by Region",
       x = "Year", y = "Number of Scientific Papers per\nmillion inhabitants") +
  scale_color_manual(values = c("Europe" = "black", "North America" = "grey40",
                                "South America" = "grey60", "Oceania" = "grey80",
                                "Asia" = "grey20")) +
  scale_x_continuous(breaks = seq(2010, 2022, by = 2), limits = c(2010, 2022)) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

# Yearly mean of norm_Papers over all regions. `na.rm = TRUE` matches how the
# same mean is computed for `average_by_year`, so one missing value no longer
# turns the whole year's index into NA.
average_papers_by_date <- total_aggregated %>%
  group_by(Year) %>%
  dplyr::summarize(Average_norm_Papers = mean(norm_Papers, na.rm = TRUE))

# Attach each year's mean to every region row of that year.
total_papers_aggregated <- merge(total_aggregated, average_papers_by_date, by = "Year")

# Index = region value minus that year's mean (positive means above average).
total_papers_aggregated$Index <- total_papers_aggregated$norm_Papers -
  total_papers_aggregated$Average_norm_Papers

# Bar chart of the index per year, filled by region.
# geom_col() is the direct form of geom_bar(stat = "identity"); the duplicated
# theme_minimal() call was removed.
ggplot(total_papers_aggregated, aes(x = Year, y = Index, fill = Region)) +
  geom_col() +
  labs(title = "Number of Research Papers above or below average\nby Region and Year from 2010-2022",
       x = "Year", y = "Index") +
  scale_fill_manual(values = c("Europe" = "black", "North America" = "grey40",
                                "South America" = "grey60", "Oceania" = "grey80",
                                "Asia" = "grey20")) +
  scale_x_continuous(breaks = seq(2010, 2022, by = 2)) +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

# Citations per year: divide each row's norm_Paper_Citations by the number of
# years from its Year up to and including 2022 (2010 -> 13, 2011 -> 12, ...,
# 2022 -> 1), i.e. by `2023 - Year`.
#
# This replaces the 13-entry lookup table and the filter loop. It is computed
# row by row, so it does not depend on comparing string years with the numeric
# Year column or on the filtered rows lining up positionally with the target.
citation_reference_year <- 2023
years_counted <- citation_reference_year - total_aggregated$Year

# Years outside 2010-2022 had no divisor in the lookup table and stay NA.
has_divisor <- total_aggregated$Year %in% 2010:2022

total_aggregated$norm_Paper_Citations_year <- ifelse(
  has_divisor,
  total_aggregated$norm_Paper_Citations / years_counted,
  NA_real_
)
# Line-and-point chart of norm_Paper_Citations_year per region and year.
# `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line geoms.
ggplot(total_aggregated,
       aes(x = Year, y = norm_Paper_Citations_year, color = Region, group = Region)) +
  geom_line(linewidth = 1) +
  geom_point(aes(shape = Region, col = Region), size = 3) +
  labs(title = "Number of Research Paper Citations (per Year)\nper million people from 2010-2022 by Region",
       x = "Year", y = "Number of Citations per Year") +
  scale_color_manual(values = c("Europe" = "black", "North America" = "grey40",
                                "South America" = "grey60", "Oceania" = "grey80",
                                "Asia" = "grey20")) +
  scale_x_continuous(breaks = seq(2010, 2022, by = 2), limits = c(2010, 2022)) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

# Line-and-point chart of norm_Articles per region, restricted to 2014 onwards.
# - `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line geoms.
# - The title now states 2014-2022, matching the row filter and the x-axis
#   limits (it previously said 2010-2022).
ggplot(total_aggregated[total_aggregated$Year >= 2014, ],
       aes(x = Year, y = norm_Articles, color = Region, group = Region)) +
  geom_line(linewidth = 1) +
  geom_point(aes(shape = Region, col = Region), size = 3) +
  labs(title = "Number of AI related news Articles per million\npeople from 2014-2022 by Region",
       x = "Year", y = "Number of AI news Articles\nper million inhabitants") +
  scale_color_manual(values = c("Europe" = "black", "North America" = "grey40",
                                "South America" = "grey60", "Oceania" = "grey80",
                                "Asia" = "grey20")) +
  scale_x_continuous(breaks = seq(2014, 2022, by = 2), limits = c(2014, 2022)) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

# Line-and-point chart of norm_Patents per region, restricted to years up to
# and including 2021.
# - `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line geoms.
# - The title now states 2010-2021, matching the row filter and the x-axis
#   limits (it previously said 2010-2022).
ggplot(total_aggregated[total_aggregated$Year <= 2021, ],
       aes(x = Year, y = norm_Patents, color = Region, group = Region)) +
  geom_line(linewidth = 1) +
  geom_point(aes(shape = Region, col = Region), size = 3) +
  labs(title = "Number of AI related Patents per million people\nfrom 2010-2021 by Region",
       x = "Year", y = "Number of AI Patents per\nmillion inhabitants") +
  scale_color_manual(values = c("Europe" = "black", "North America" = "grey40",
                                "South America" = "grey60", "Oceania" = "grey80",
                                "Asia" = "grey20")) +
  scale_x_continuous(breaks = seq(2010, 2021, by = 2), limits = c(2010, 2021)) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

# Replace selected country codes with display names for the plot labels.
# One lookup table replaces eight separate assignments; codes without an entry
# are left unchanged. The column is addressed by name rather than by position,
# and `%in%` keeps the replacement safe if a Country value is NA.
country_labels <- c(
  "JPN" = "Japan", "GBR" = "UK", "DEU" = "Germany", "ITA" = "Italy",
  "FRA" = "France", "KOR" = "South Korea", "CHN" = "China",
  "CHE" = "Switzerland"
)
has_label <- sum_patent$Country %in% names(country_labels)
sum_patent$Country[has_label] <-
  unname(country_labels[sum_patent$Country[has_label]])

# Labelled scatter plot of total patents against GDP per capita for all rows of
# `sum_patent`.
ggplot(sum_patent, aes(x = gdp_per_capita, y = Sum_Patents, label = Country)) +
  geom_point(shape = 16, size = 3, color = "black") +
  # Labels are offset from their points by a fixed amount in data units.
  geom_text(nudge_x = 4000, nudge_y = 200) +
  labs(title = "Scatter Plot of GDP per Capita vs. Patents",
       x = "GDP per Capita", y = "Patents") +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma, breaks = seq(0, 100000, by = 25000), limits = c(0, 100000)) +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        legend.title = element_blank(),
        legend.text = element_text(size = 12))

# Labelled scatter plot of total patents against GDP per capita for the first
# ten rows of `sum_patent`.
# - head() replaces `[1:10, ]`, which pads with all-NA rows when the table has
#   fewer than ten rows.
# - The geom_segment() layer was removed: its end point equalled its start
#   point (xend = x, yend = y), so every segment had zero length and drew
#   nothing. geom_text_repel() places the labels.
ggplot(head(sum_patent, 10),
       aes(x = gdp_per_capita, y = Sum_Patents, label = Country)) +
  geom_point(shape = 16, size = 4, color = "black") +
  geom_text_repel(nudge_x = 2000, nudge_y = 10) +
  labs(title = "Scatter Plot of GDP per Capita vs. Patents",
       x = "GDP per Capita (in USD)", y = "Number of Patents") +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma, breaks = seq(0, 100000, by = 20000), limits = c(0, 100000)) +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

# Draw one line-and-point chart of a yearly per-region indicator for 2010-2022.
#
# The three charts below differed only in the plotted column, the title and the
# y-axis label, so the shared layers, scales and theme live in this helper.
# `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line geoms.
#
# data       data frame with columns Year, Region and the column `value_col`
# value_col  name (string) of the column mapped to the y-axis
# title      plot title
# y_label    y-axis label
# Returns the ggplot object; at top level it is printed automatically.
plot_region_trend <- function(data, value_col, title, y_label) {
  ggplot(data, aes(x = Year, y = .data[[value_col]],
                   color = Region, group = Region)) +
    geom_line(linewidth = 1) +
    geom_point(aes(shape = Region, col = Region), size = 3) +
    labs(title = title, x = "Year", y = y_label) +
    scale_color_manual(values = c("Europe" = "black", "North America" = "grey40",
                                  "South America" = "grey60", "Oceania" = "grey80",
                                  "Asia" = "grey20")) +
    scale_x_continuous(breaks = seq(2010, 2022, by = 2), limits = c(2010, 2022)) +
    scale_y_continuous(labels = scales::comma) +
    theme_minimal() +
    theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
          axis.text = element_text(size = 12),
          axis.title = element_text(size = 14),
          legend.title = element_text(size = 12, face = "bold"),
          legend.text = element_text(size = 12))
}

plot_region_trend(
  total_aggregated, "norm_AI_Projects",
  title = "Number of AI software related Projects per million people\nfrom 2010-2022 by Region",
  y_label = "Number of AI software Projects\nper million inhabitants"
)

plot_region_trend(
  total_aggregated, "norm_Projects",
  title = "Number of AI related Projects per million people\nfrom 2010-2022 by Region",
  y_label = "Number of AI Projects"
)

plot_region_trend(
  total_aggregated, "norm_Funding",
  title = "Amount of AI Project Funding per million people\nfrom 2010-2022 by Region",
  y_label = "Amount of AI Project Funding"
)

# Load the monthly job-posting time series and keep observations from January
# 2010 onwards, with a Year column derived from the date.
job <- read.csv("indicators_econ/Adzuna/CountryTimeSeries.csv", header = TRUE, sep = ",", dec = ".")
job <- rename(job, Postings = X..Job.postings, Month = YearMonth)

# "-01" is appended so the year-month value parses as a full Date.
job$Date <- as.Date(paste0(job$Month, "-01"), format = "%Y-%m-%d")

# which() skips comparisons that are NA (a Date that failed to parse). Plain
# logical indexing would turn each of those into an all-NA row.
job <- job[which(job$Date >= as.Date("2010-01-01")), ]
job$Month <- NULL
job$Year <- lubridate::year(job$Date)

# Collapse the job postings to one row per Country and Year (2010-2022) by
# summing every numeric column.
#
# The former all-column aggregation (`aggregated_data`) has been removed. It
# grouped by every column except Country and Date, so its rows had no Country,
# came back from bind_rows() with Country = NA and were deleted again by the
# NA filter. Its only lasting effect was to leave `job_aggregated` grouped by
# Postings.
# NOTE: the result is now an ungrouped tibble, so print()/head() no longer show
# a `# Groups:` header; the rows and columns are unchanged.
filtered_data <- job %>%
  filter(Year %in% 2010:2022)

# `.groups = "drop"` returns an ungrouped result and silences the summarise()
# regrouping message.
summed_data <- filtered_data %>%
  dplyr::group_by(Country, Year) %>%
  dplyr::summarize(dplyr::across(where(is.numeric), sum), .groups = "drop")

# Remove rows without a country, order by country and year, and move the two
# key columns to the front.
job_aggregated <- summed_data %>%
  filter(!is.na(Country)) %>%
  arrange(Country, Year) %>%
  select(Year, Country, everything())

print(job_aggregated)
## # A tibble: 102 × 3
## # Groups:   Postings [102]
##     Year Country Postings
##    <dbl> <chr>      <int>
##  1  2017 ALL        91424
##  2  2018 ALL       173064
##  3  2019 ALL       190336
##  4  2020 ALL       201221
##  5  2021 ALL       164823
##  6  2022 ALL       322907
##  7  2017 AUS         2917
##  8  2018 AUS         5749
##  9  2019 AUS         5664
## 10  2020 AUS         3999
## # … with 92 more rows
# Area chart of yearly postings for the "ALL" rows of `job_aggregated`.
# - `linewidth` replaces `size` for the outline, which ggplot2 3.4.0 deprecated.
# - scale_fill_manual() was removed: fill is a fixed colour here, not a mapped
#   aesthetic, so the scale had nothing to act on.
ggplot(job_aggregated[job_aggregated$Country == "ALL",], aes(x = Year, y = Postings)) +
  geom_area(linewidth = 1, fill = "lightgrey", col = "black") +
  labs(title = "Number of total Job Postings from 2017-2022", x = "Year", y = "Number of Job Postings related to AI") +
  scale_x_continuous(breaks = seq(2017, 2022, by = 1), limits = c(2017, 2022)) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

# Show the first ten rows of the aggregated job table.
head(job_aggregated, 10)
## # A tibble: 10 × 3
## # Groups:   Postings [10]
##     Year Country Postings
##    <dbl> <chr>      <int>
##  1  2017 ALL        91424
##  2  2018 ALL       173064
##  3  2019 ALL       190336
##  4  2020 ALL       201221
##  5  2021 ALL       164823
##  6  2022 ALL       322907
##  7  2017 AUS         2917
##  8  2018 AUS         5749
##  9  2019 AUS         5664
## 10  2020 AUS         3999
# Ratio of norm_Paper_Citations_year to norm_Papers for every row, stored as a
# plain numeric column.
total_aggregated$norm_cpp <- with(
  total_aggregated,
  as.numeric(norm_Paper_Citations_year / norm_Papers)
)

# Show the first rows, including the newly added columns.
head(total_aggregated)
## # A tibble: 6 × 20
## # Groups:   Papers, Paper_Citations, Articles, Patents, Citations, AI_Projects,
## #   Projects, Funding, norm_Papers, norm_Paper_Citations, norm_Articles,
## #   norm_Patents, norm_Citations, norm_AI_Projects, norm_Projects, norm_Funding
## #   [6]
##    Year Region  Papers Paper_C…¹ Artic…² Patents Citat…³ AI_Pr…⁴ Proje…⁵ Funding
##   <dbl> <chr>    <dbl>     <dbl>   <int>   <int>   <int>   <int>   <int>   <dbl>
## 1  2010 Asia   150193.   972822.       0   11578  127443     105       9  1.30e6
## 2  2011 Asia   154602.  1060488.       0   12055  116129     288       8  1.51e6
## 3  2012 Asia   145329.  1019775.       0   11677   97100     612       7  6.50e5
## 4  2013 Asia   144896.  1103366.       0   12777   94105    1342       6  6.57e5
## 5  2014 Asia   141779.  1203931.    3422   13023   83764    2846       4  9.52e5
## 6  2015 Asia   130611.  1339516.    7304   11537   61359    5935       5  7.10e5
## # … with 10 more variables: norm_Papers <dbl>, norm_Paper_Citations <dbl>,
## #   norm_Articles <dbl>, norm_Patents <dbl>, norm_Citations <dbl>,
## #   norm_AI_Projects <dbl>, norm_Projects <dbl>, norm_Funding <dbl>,
## #   norm_Paper_Citations_year <dbl>, norm_cpp <dbl>, and abbreviated variable
## #   names ¹​Paper_Citations, ²​Articles, ³​Citations, ⁴​AI_Projects, ⁵​Projects
# Line-and-point chart of norm_cpp per region and year.
# `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line geoms.
ggplot(total_aggregated,
       aes(x = Year, y = norm_cpp, color = Region, group = Region)) +
  geom_line(linewidth = 1) +
  geom_point(aes(shape = Region, col = Region), size = 3) +
  labs(title = "Number of Annual Citations per Scientific Paper\n from 2010-2022 by Region",
       x = "Year", y = "Number of Annual Citations\nper Scientific paper") +
  scale_color_manual(values = c("Europe" = "black", "North America" = "grey40",
                                "South America" = "grey60", "Oceania" = "grey80",
                                "Asia" = "grey20")) +
  scale_x_continuous(breaks = seq(2010, 2022, by = 2), limits = c(2010, 2022)) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5, lineheight = 1.2),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))