# PCA ----

# Load the aggregated data set (comma-separated, "." as decimal mark, i.e. the
# read.csv() defaults) and derive the calendar year from the Date column.
total <- read.csv("TotalAggregatedSLO.csv")
total$Year <- lubridate::year(total$Date)

# Sum every numeric column within each Country-Year cell. Spelling out
# `.groups = "drop_last"` keeps the default regrouping (by Country) while
# silencing the summarise() message about grouped output.
summed_data <- total %>%
  dplyr::group_by(Country, Year) %>%
  dplyr::summarize(across(where(is.numeric), sum), .groups = "drop_last")

# Rows for HRV are excluded from everything that follows.
summed_data <- summed_data[summed_data$Country != "HRV", ]

# Select the analysis columns by name rather than by position, so the subsets
# stay correct if the column order of the CSV ever changes.
id_cols <- c("Country", "Year")
indicator_cols <- c(
  "norm_Papers", "norm_Articles", "norm_AI_Projects", "norm_Funding"
)
is_2014 <- summed_data$Year == 2014
is_2021 <- summed_data$Year == 2021

# total_b / total_e: indicator columns only, for 2014 / 2021.
# total_bb / total_ee: the same rows including Country and Year.
total_b <- summed_data[is_2014, indicator_cols]
total_bb <- summed_data[is_2014, c(id_cols, indicator_cols)]
total_ee <- summed_data[is_2021, c(id_cols, indicator_cols)]
total_e <- summed_data[is_2021, indicator_cols]

# Descriptive statistics of the 2014 indicators (without the basic counts),
# rounded to two decimals.
total_b %>%
  stat.desc(basic = FALSE) %>%
  round(2)

##              norm_Papers norm_Articles norm_AI_Projects norm_Funding
## median            186.01         17.28             8.43 1.986106e+05
## mean              194.52         15.22            12.18 2.773213e+05
## SE.mean            24.91          3.39             3.54 1.062001e+05
## CI.mean.0.95       56.34          7.66             8.02 2.402414e+05
## var              6202.66        114.60           125.56 1.127847e+11
## std.dev            78.76         10.70            11.21 3.358343e+05
## coef.var            0.40          0.70             0.92 1.210000e+00

# Pearson correlation matrix of the 2014 indicators. Rule of thumb used here:
# correlations should reach at least 0.3.
R <- cor(total_b, method = "pearson")
round(R, digits = 3)

##                  norm_Papers norm_Articles norm_AI_Projects norm_Funding
## norm_Papers            1.000         0.240            0.960        0.914
## norm_Articles          0.240         1.000            0.337        0.328
## norm_AI_Projects       0.960         0.337            1.000        0.937
## norm_Funding           0.914         0.328            0.937        1.000

library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:Hmisc':
## 
##     describe

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

# Bartlett's test on the 2014 correlation matrix, using the number of 2014
# observations as the sample size.
cortest.bartlett(R, n = nrow(total_b))

## $chisq
## [1] 33.47428
## 
## $p.value
## [1] 8.497053e-06
## 
## $df
## [1] 6

# Determinant of the 2014 correlation matrix.
det(R)

## [1] 0.007456458

# Kaiser-Meyer-Olkin measure of sampling adequacy, overall and per indicator.
library(psych)
KMO(R)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA =  0.74
## MSA for each item = 
##      norm_Papers    norm_Articles norm_AI_Projects     norm_Funding 
##             0.72             0.58             0.69             0.88

library(FactoMineR)
library(factoextra)

# PCA on the 2014 indicators, scaled to unit variance, without the automatic
# plots.
components <- PCA(total_b, scale.unit = TRUE, graph = FALSE)

# Eigenvalues with the share of variance they explain.
get_eigenvalue(components)

##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.01013193       75.2532982                    75.25330
## Dim.2 0.87074822       21.7687055                    97.02200
## Dim.3 0.08606590        2.1516476                    99.17365
## Dim.4 0.03305395        0.8263486                   100.00000

# Scree plot of the eigenvalues, with value labels.
fviz_eig(
  components,
  choice = "eigenvalue",
  addlabels = TRUE,
  main = "Scree plot",
  xlab = "Principal component",
  ylab = "Eigenvalue"
)

library(psych)

# Parallel analysis of the 2014 indicators, reporting principal components
# only (fa = "pc"). `FALSE` is spelled out because `F` is an ordinary variable
# that can be reassigned.
fa.parallel(total_b, sim = FALSE, fa = "pc")

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  1

library(FactoMineR)

# Re-fit the 2014 PCA keeping only the first component (ncp = 1).
components <- PCA(total_b, scale.unit = TRUE, ncp = 1, graph = FALSE)

# Print the overview of the result object.
components

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 10 individuals, described by 4 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

# Correlations between the 2014 indicators and the retained component.
print(components[["var"]][["cor"]])

##                       Dim.1
## norm_Papers      -0.9602830
## norm_Articles    -0.4369144
## norm_AI_Projects -0.9819344
## norm_Funding     -0.9658670

# Contribution of each 2014 indicator to the retained component.
print(components[["var"]][["contrib"]])

##                      Dim.1
## norm_Papers      30.634653
## norm_Articles     6.341722
## norm_AI_Projects 32.031661
## norm_Funding     30.991965

# Descriptive statistics of the 2021 indicators (without the basic counts),
# rounded to two decimals.
total_e %>%
  stat.desc(basic = FALSE) %>%
  round(2)

##              norm_Papers norm_Articles norm_AI_Projects norm_Funding
## median            235.56        370.28            64.42 1.905343e+06
## mean              273.96        511.72            77.27 2.123846e+06
## SE.mean            40.99        132.62            18.21 6.310766e+05
## CI.mean.0.95       92.73        300.00            41.20 1.427594e+06
## var             16802.02     175872.67          3317.80 3.982576e+12
## std.dev           129.62        419.37            57.60 1.995639e+06
## coef.var            0.47          0.82             0.75 9.400000e-01

# Pearson correlation matrix of the 2021 indicators. Rule of thumb used here:
# correlations should reach at least 0.3.
R_e <- cor(total_e, method = "pearson")
round(R_e, digits = 3)

##                  norm_Papers norm_Articles norm_AI_Projects norm_Funding
## norm_Papers            1.000         0.860            0.912        0.919
## norm_Articles          0.860         1.000            0.806        0.680
## norm_AI_Projects       0.912         0.806            1.000        0.882
## norm_Funding           0.919         0.680            0.882        1.000

library(psych)

# Bartlett's test on the 2021 correlation matrix. The sample size must come
# from the 2021 data (total_e); the earlier version passed nrow(total_b), the
# 2014 sample. Both years hold 10 observations, so the recorded output below
# is unaffected.
cortest.bartlett(R_e, n = nrow(total_e))

## $chisq
## [1] 37.74658
## 
## $p.value
## [1] 1.259047e-06
## 
## $df
## [1] 6

# Determinant of the 2021 correlation matrix.
det(R_e)

## [1] 0.003990296

# Kaiser-Meyer-Olkin measure of sampling adequacy, overall and per indicator.
KMO(R_e)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R_e)
## Overall MSA =  0.71
## MSA for each item = 
##      norm_Papers    norm_Articles norm_AI_Projects     norm_Funding 
##             0.68             0.65             0.89             0.66

library(FactoMineR)
library(factoextra)

# PCA on the 2021 indicators, scaled to unit variance, without the automatic
# plots.
components_e <- PCA(total_e, scale.unit = TRUE, graph = FALSE)

# Eigenvalues with the share of variance they explain.
get_eigenvalue(components_e)

##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.53428331       88.3570829                    88.35708
## Dim.2 0.32821872        8.2054680                    96.56255
## Dim.3 0.10461771        2.6154427                    99.17799
## Dim.4 0.03288026        0.8220064                   100.00000

# Scree plot of the eigenvalues, with value labels.
fviz_eig(
  components_e,
  choice = "eigenvalue",
  addlabels = TRUE,
  main = "Scree plot",
  xlab = "Principal component",
  ylab = "Eigenvalue"
)

library(psych)

# Parallel analysis of the 2021 indicators, reporting principal components
# only (fa = "pc"). `FALSE` is spelled out because `F` is an ordinary variable
# that can be reassigned.
fa.parallel(total_e, sim = FALSE, fa = "pc")

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  1

library(FactoMineR)

# Re-fit the 2021 PCA keeping only the first component (ncp = 1).
components_e <- PCA(total_e, scale.unit = TRUE, ncp = 1, graph = FALSE)

# Print the overview of the result object.
components_e

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 10 individuals, described by 4 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

# Correlations between the 2021 indicators and the retained component.
print(components_e[["var"]][["cor"]])

##                       Dim.1
## norm_Papers      -0.9827915
## norm_Articles    -0.8873341
## norm_AI_Projects -0.9589102
## norm_Funding     -0.9281882

# Contribution of each 2021 indicator to the retained component.
print(components_e[["var"]][["contrib"]])

##                     Dim.1
## norm_Papers      27.32886
## norm_Articles    22.27784
## norm_AI_Projects 26.01684
## norm_Funding     24.37647

# Store the first-component scores of the 2014 PCA as the composite indicator.
# The scores are negated because Dim.1 correlates negatively with every
# indicator, so that larger values go with larger indicator values.
total_bb$Indicators_2014 <- -components$ind$coord[, 1]


head(total_bb, n = 3)

## # A tibble: 3 × 7
## # Groups:   Country [3]
##   Country  Year norm_Papers norm_Articles norm_AI_Projects norm_Funding Indica…¹
##   <chr>   <dbl>       <dbl>         <dbl>            <dbl>        <dbl>    <dbl>
## 1 AUT      2014        223.          29.6            14.0       494341.    1.05 
## 2 CHE      2014        372.          20.1            40.2      1112193.    4.39 
## 3 CZE      2014        191.          20.5             9.41       33320.   -0.466
## # … with abbreviated variable name ¹Indicators_2014

library(ggplot2)

# Bar chart of the 2014 composite indicator per country, with vertical
# country labels on the x axis.
ggplot(total_bb, aes(x = Country, y = Indicators_2014)) +
  geom_bar(stat = "identity") +
  theme_linedraw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

# Negated first-component scores of the 2021 PCA, stored on the 2021 table.
scores_2021 <- -components_e$ind$coord[, 1]
total_ee$PCA2 <- scores_2021

# Attach the 2021 score to the 2014 table by matching on Country instead of
# relying on both tables having the same rows in the same order. A country
# without a 2021 row gets NA rather than another country's score.
total_bb$Indicators_2021 <-
  total_ee$PCA2[match(total_bb$Country, total_ee$Country)]

head(total_ee, n = 3)

## # A tibble: 3 × 7
## # Groups:   Country [3]
##   Country  Year norm_Papers norm_Articles norm_AI_Projects norm_Funding   PCA2
##   <chr>   <dbl>       <dbl>         <dbl>            <dbl>        <dbl>  <dbl>
## 1 AUT      2021        334.          344.             75.9     4473403.  0.656
## 2 CHE      2021        583.         1446.            226.      6539250.  4.97 
## 3 CZE      2021        217.          292.             47.4      602850. -1.18

# Bar chart of the 2021 composite indicator per country, with vertical
# country labels on the x axis.
ggplot(total_ee, aes(x = Country, y = PCA2)) +
  geom_bar(stat = "identity") +
  theme_linedraw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

# Scatter plot of the 2014 score (x) against the 2021 score (y), one labelled
# point per country, both axes limited to [-6, 6], with reference lines
# through zero.
ggplot(
  total_bb,
  aes(x = Indicators_2014, y = Indicators_2021, label = Country)
) +
  geom_point(shape = 16, size = 4, color = "black") +
  # NOTE(review): start and end point coincide, so this segment has zero
  # length - confirm what it was meant to connect.
  geom_segment(
    aes(xend = Indicators_2014, yend = Indicators_2021),
    color = "grey80"
  ) +
  geom_text_repel(nudge_x = 0, nudge_y = 0) +
  geom_vline(xintercept = 0, color = "black") +
  geom_hline(yintercept = 0, color = "black") +
  scale_x_continuous(labels = scales::comma, limits = c(-6, 6)) +
  scale_y_continuous(labels = scales::comma, limits = c(-6, 6)) +
  labs(title = "PCA scatterplot", x = "2014", y = "2021") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      size = 16, face = "bold", hjust = 0.5, lineheight = 1.2
    ),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 12)
  )

# Relabel Switzerland so it matches the code used in the custom order below.
total_bb[total_bb$Country == "CHE", "Country"] <- "CH"

# Long format: one row per country and score column (pivot_longer() replaces
# the superseded gather()).
total_long <- tidyr::pivot_longer(
  total_bb,
  cols = c(Indicators_2014, Indicators_2021),
  names_to = "Variable",
  values_to = "Value"
)

# Define the custom order of countries
custom_order <- c(
  "CH", "AUT", "DEU", "FRA", "ITA", "SVN", "CZE", "HUN", "SVK", "POL"
)  # Replace with your desired order

# Reorder the "Country" variable as a factor with the custom order
total_long$Country <- factor(total_long$Country, levels = custom_order)

# Axis position of SVN. The dashed lines are placed half a step on either side
# of it. The earlier version derived the positions from row indices of HUN in
# the long data, which only bracketed SVN by coincidence and added two stray
# lines beyond the last country.
svn_pos <- match("SVN", custom_order)

ggplot(total_long, aes(x = Country, y = Value, fill = Variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  # Vertical lines before and after SVN
  geom_vline(xintercept = svn_pos + c(-0.5, 0.5), linetype = "dashed") +
  scale_fill_manual(
    values = c("Indicators_2014" = "gray30", "Indicators_2021" = "gray70")
  ) +
  labs(fill = "Variable") +  # Legend title
  ylab("Combined AI Indicators") +
  # The complete theme must come first: adding theme_linedraw() after theme()
  # would discard the axis-label rotation.
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))