Activity 3.2 - PC concepts

SUBMISSION INSTRUCTIONS

Render to html
Publish your html to RPubs
Submit a link to your published solutions

Question 1

The data set we will analyze for this question are on the 10 events in the Men’s 2024 Olympic Decathlon in Paris.

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

decathlon <- read.csv('C:/Users/lr7273ow/OneDrive - Minnesota State/Documents/Github/DSCI_415/Activities/Data/mens_decathlon_paris2024.csv')
head(decathlon)

         Athlete  Medal      Nation Overall X100m LongJump ShotPut HighJump
1   Markus Rooth   Gold      Norway    8796 10.71     7.80   15.25     1.99
2 Leo Neugebauer Silver     Germany    8748 10.67     7.98   16.55     2.05
3  Lindon Victor Bronze     Grenada    8711 10.56     7.48   15.71     2.02
4    Sven Roosen   None Netherlands    8607 10.52     7.56   15.10     1.87
5  Janek Õiglane   None     Estonia    8572 10.89     7.25   14.58     1.99
6   Johannes Erm   None     Estonia    8569 10.64     7.66   14.61     2.08
  X400m X110mHurdle Discus PoleVault Javelin X1500m
1 47.69       14.25  49.80       5.3   66.87  279.6
2 47.70       14.51  53.33       5.0   56.64  284.7
3 47.84       14.62  53.91       4.9   68.22  283.5
4 46.40       13.99  46.88       4.7   63.72  258.5
5 48.02       14.45  43.49       5.3   71.89  265.6
6 47.19       14.35  46.29       4.6   59.58  259.7

For the purposes of this question, assume we have 10-dimensional data - that is, ignore the Overall column.

A)

Explain why we need to scale this data set before performing PCA.

#We need to scale the data set before it becomes PCA #is because the data itself is sensitive to the 
#disparities between certain columns in their sheer magnitude
#The values of the columns range from incredibly small values 
#
#
#
#we need to make sure the data retains #explained variability before actually translating the data to a Principal Component Analysis. 
#
#

B)

Use svd() to find the first 2 principal component scores and their loadings. Full credit will only be granted if you use the svd() ingredients u, d, and v. What percent of the overall variability do the first two PCs explain?

#First we gotta take the numeric columns

decathlon_select <- decathlon %>%
select(X100m, LongJump, ShotPut, HighJump, X400m, X110mHurdle, Discus, PoleVault, Javelin, X1500m) 

#Then we decide to take the scale of those numeric columns 
(decathlon_scaled <- scale(decathlon_select))

           X100m     LongJump    ShotPut   HighJump       X400m X110mHurdle
 [1,] -0.1448846  1.172765295  0.4466776 -0.1091089 -0.34369762 -0.30326285
 [2,] -0.3036622  1.672996937  1.9471451  0.7637626 -0.33551434  0.38838926
 [3,] -0.7403006  0.283464597  0.9776123  0.3273268 -0.22094847  0.68101131
 [4,] -0.8990782  0.505789772  0.2735468 -1.8548521 -1.39934030 -0.99491496
 [5,]  0.5696147 -0.355720279 -0.3266402 -0.1091089 -0.07364949  0.22877724
 [6,] -0.4227454  0.783696240 -0.2920141  1.2001984 -0.75286145 -0.03724281
 [7,] -0.5021342  0.116720717  0.9199020 -0.5455447 -1.14565873 -0.22345684
 [8,]  2.3558627 -0.800370628 -0.7190702  0.3273268  0.83469422  0.44159327
 [9,] -1.5738830  0.783696240  0.3543412  0.3273268 -1.58755567 -0.72889492
[10,]  0.6490035  0.005558129 -0.4420608  2.5095057  0.76104473 -0.86190494
[11,]  1.0459475 -0.605836100  1.1738273 -0.1091089  0.45826349  0.78741933
[12,] -0.1051902 -0.772579981  2.0510236 -0.1091089 -0.51554643 -1.07472098
[13,] -0.5815230 -0.050023164 -0.7767805  0.3273268  1.08019252 -1.76637309
[14,] -0.3433566 -0.383510926 -1.0307057 -0.9819805  0.54827953 -0.96831296
[15,] -0.8990782  0.644743006 -1.0422478 -0.5455447 -0.12274915  0.01596120
[16,] -0.4227454 -0.300138985 -0.6844440 -0.9819805 -0.31096451  1.10664338
[17,] -0.5815230 -1.439555504  0.1927524 -0.5455447  2.19311814  1.98450953
[18,]  0.1329762  1.811950171 -0.6382758  1.6366342 -0.89197715 -0.56928289
[19,]  2.2764739 -2.162112321 -1.1230422 -0.9819805  1.84942052  2.09091754
[20,]  0.4902259 -0.911533215 -1.2615469 -0.5455447 -0.02454983 -0.19685483
           Discus  PoleVault    Javelin      X1500m
 [1,]  0.96204598  0.6246390  0.7380189  0.30891685
 [2,]  1.88387613  0.4252861 -0.7019836  0.72848434
 [3,]  2.03533831  0.3588352  0.9280485  0.62976257
 [4,]  0.19951225  0.2259332  0.2946163 -1.42694080
 [5,] -0.68575807  0.6246390  1.4446476 -0.84283704
 [6,]  0.04543865  0.1594823 -0.2881413 -1.32821904
 [7,]  0.20734650  0.4917371 -1.4719556 -1.33644586
 [8,]  0.04282724  0.2923842  2.2737400 -1.71487928
 [9,] -0.71970649  0.2923842 -1.4719556  0.37473136
[10,] -0.64136398  0.2259332  0.8393680  0.39941180
[11,]  1.04822273  0.2259332  0.3565519 -0.77702254
[12,] -0.02245819  0.2259332 -1.2115446 -0.02015569
[13,] -0.01201252  0.2923842 -0.2529506  1.64166064
[14,] -0.85027733  0.2923842  1.2602485  1.13982501
[15,] -0.92078559  0.2259332 -0.3500768  0.78607203
[16,] -1.77733032  0.2259332 -0.6400480 -0.90865155
[17,]  1.03516565  0.4252861 -0.6442709  1.42776349
[18,] -0.09035502 -2.8972616 -0.2585811  0.13615376
[19,] -0.38283371  0.1594823 -0.3233319  0.25955597
[20,] -1.35689221 -2.8972616 -0.5203997  0.52281400
attr(,"scaled:center")
      X100m    LongJump     ShotPut    HighJump       X400m X110mHurdle 
    10.7465      7.3780     14.8630      1.9975     48.1100     14.3640 
     Discus   PoleVault     Javelin      X1500m 
    46.1160      4.3600     61.6270    275.8450 
attr(,"scaled:scale")
      X100m    LongJump     ShotPut    HighJump       X400m X110mHurdle 
 0.25192470  0.35983329  0.86639665  0.06873864  1.22200439  0.37591152 
     Discus   PoleVault     Javelin      X1500m 
 3.82933882  1.50486929  7.10415453 12.15537460

cov(decathlon_scaled)

                  X100m    LongJump     ShotPut    HighJump       X400m
X100m        1.00000000 -0.53608784 -0.26403959  0.09368668  0.53405515
LongJump    -0.53608784  1.00000000  0.25570073  0.42387078 -0.69575713
ShotPut     -0.26403959  0.25570073  1.00000000  0.07860944 -0.33429492
HighJump     0.09368668  0.42387078  0.07860944  1.00000000 -0.02988760
X400m        0.53405515 -0.69575713 -0.33429492 -0.02988760  1.00000000
X110mHurdle  0.39213561 -0.46424883 -0.02566870 -0.23464556  0.48497097
Discus      -0.06673324  0.24837677  0.69403174  0.16765819  0.05957716
PoleVault   -0.12200199 -0.12155272  0.35432088 -0.18774703  0.14304448
Javelin      0.48449401 -0.10152416 -0.29600780  0.10164074  0.30079750
X1500m      -0.29008444 -0.01529645 -0.10308894  0.10728909  0.42597294
            X110mHurdle      Discus  PoleVault     Javelin      X1500m
X100m        0.39213561 -0.06673324 -0.1220020  0.48449401 -0.29008444
LongJump    -0.46424883  0.24837677 -0.1215527 -0.10152416 -0.01529645
ShotPut     -0.02566870  0.69403174  0.3543209 -0.29600780 -0.10308894
HighJump    -0.23464556  0.16765819 -0.1877470  0.10164074  0.10728909
X400m        0.48497097  0.05957716  0.1430445  0.30079750  0.42597294
X110mHurdle  1.00000000  0.20019920  0.1331565  0.01350292 -0.08276666
Discus       0.20019920  1.00000000  0.2872561  0.08544044  0.10805321
PoleVault    0.13315648  0.28725615  1.0000000  0.15408984 -0.10241323
Javelin      0.01350292  0.08544044  0.1540898  1.00000000 -0.17571943
X1500m      -0.08276666  0.10805321 -0.1024132 -0.17571943  1.00000000

#Getting the SVD values 

svd_decathlon_scaled <- svd(decathlon_scaled)

U <- svd_decathlon_scaled$u
D <- svd_decathlon_scaled$d
V <- svd_decathlon_scaled$v

head(U[,1] * D[1])

[1] -0.9696335 -2.1338950 -0.7323012 -1.4369409  1.0587944 -1.1274914

#We are finding the first 2 principal component, so we know that k=2  

PCs <- U %*% D

head(PCs , 2)

           [,1]
[1,] -0.3509077
[2,]  1.4366112

PCs <- decathlon_scaled %*% V



#These seem to have more PC values so lets go with this one
head(PCs,2)

           [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
[1,] -0.9696335 0.9075397 0.7205553 -0.1637611 -0.7854975 -0.7506910
[2,] -2.1338950 2.2401416 0.4899618 -1.0902256  0.7897618 -0.2423639
             [,7]      [,8]      [,9]       [,10]
[1,] 6.474979e-05 0.4802080 0.2288616 -0.01855389
[2,] 1.877641e-01 0.6018405 0.4109560  0.18266992

(PCs 
%>% data.frame()
%>% summarize(across(everything(),var))
)

        X1       X2      X3       X4       X5        X6        X7        X8
1 2.913494 1.928006 1.46543 1.404117 0.952215 0.5535841 0.4806271 0.1403932
         X9        X10
1 0.1222556 0.03987768

#     X1       X2      X3       X4       X5        X6    X7        X8
# 2.913494 1.928006 1.46543 1.404117 0.952215 0.5535841 0.4806271 .1403932
 
#         X9        X10
# 0.1222556 0.03987768
 
#Total adds to around 11. 

#PC1 = 2.913  and PC2 = 1.93
#Rechange this
# Total is 4.84, which is about 50% of the total variability explained by #these 2 PCs.   
#

C)

Find and print the loadings. Based on the loadings alone, if the first two PCs are plotted in a 2D plane as shown below, which of the four quadrants will the medalists be in? Explain your reasoning.

loadings <- V

rownames(loadings) <- colnames(decathlon_scaled)
loadings[,1:2]

                    [,1]        [,2]
X100m        0.446612410  0.01678592
LongJump    -0.493037935 -0.06565628
ShotPut     -0.309431542  0.53134100
HighJump    -0.150386291 -0.05665757
X400m        0.485973869  0.14847674
X110mHurdle  0.345270647  0.31685991
Discus      -0.139145881  0.59570522
PoleVault    0.019829831  0.48083261
Javelin      0.252821246  0.03137029
X1500m       0.005588625 -0.01948870

#Based on the PC1 and PC2 values from the loadings we can say
#that the PC2 would bring the medalist points into the third quandrant  #because of the abundance of negative values that it has (it has 7 #negatives). The PC1 would then bring the medalist points into the  
#fourth quadrant. So we'd be in the fourth quadrant but be very close
#to the PC2 axis line.    
#
#

D)

Add the PCs to the decathlon data set and create a scatterplot of these PCs, with the points labeled by the athletes’ names. Color-code the points on whether or not the athlete was a medalist. Use the ggrepel package for better labeling. Verify that your intuition from C) is correct.

library(ggrepel)

Loading required package: ggplot2

#PCs converting to a df. 

PCs_df <- as.data.frame(PCs)

decathlon_with_PCs <- decathlon %>%
  mutate(PC1 = PCs_df[,1],
         PC2 = PCs_df[,2])

#scatterplot of the decathlon athlets and their PCs.   

scatter_decathlon <- ggplot(decathlon_with_PCs, aes(x = PC1, y = PC2, color = Medal, label = Athlete)) +
  geom_point(size = 3) +
  geom_text_repel(show.legend = FALSE, max.overlaps = 15) +
  labs(
    title = "Principal Components of Decathlon Results (Paris 2024)",
    x = "Principal Component 1",
    y = "Principal Component 2",
    color = "Medal Status"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "right"
  )


scatter_decathlon

E)

Canadian Damian Warner won the gold medal in the decathlon in the 2020 Tokyo games. He began the 2024 decathlon but bowed out after three straight missed pole vault attempts.

These are his results in the 10 events in 2020:

warner <- c(10.12, 8.24, 14.8, 2.02, 47.48, 13.46, 48.67, 4.9, 63.44, 271.08)

Would this have won a medal if it had happened in 2024? To answer this, we will compute his PCs with respect to the 2024 athletes and add it to the plot to see where his 2020 gold-medal performance compares to the 2024 athletes. To do this:

Find the mean vector from the 2024 athletes. Call it mean_vec_24.
Find the standard deviation vector from the 2024 athletes. Call it sd_vec_24.
Standardize Warner’s 2020 results with respect to the 2024 athletes: (warner-mean_vec_24)/sd_vec_24
Find Warner’s PC coordinates using the 2024 loadings.

Add his point to the scatterplot.

#The mean vector of the 2024 athletes: 

mean_vec_24 <- attr(decathlon_scaled, "scaled:center")

mean_vec_24

      X100m    LongJump     ShotPut    HighJump       X400m X110mHurdle 
    10.7465      7.3780     14.8630      1.9975     48.1100     14.3640 
     Discus   PoleVault     Javelin      X1500m 
    46.1160      4.3600     61.6270    275.8450

#The Std. Dev. vector of the 2024 athletes: 

decathlon_scaled

           X100m     LongJump    ShotPut   HighJump       X400m X110mHurdle
 [1,] -0.1448846  1.172765295  0.4466776 -0.1091089 -0.34369762 -0.30326285
 [2,] -0.3036622  1.672996937  1.9471451  0.7637626 -0.33551434  0.38838926
 [3,] -0.7403006  0.283464597  0.9776123  0.3273268 -0.22094847  0.68101131
 [4,] -0.8990782  0.505789772  0.2735468 -1.8548521 -1.39934030 -0.99491496
 [5,]  0.5696147 -0.355720279 -0.3266402 -0.1091089 -0.07364949  0.22877724
 [6,] -0.4227454  0.783696240 -0.2920141  1.2001984 -0.75286145 -0.03724281
 [7,] -0.5021342  0.116720717  0.9199020 -0.5455447 -1.14565873 -0.22345684
 [8,]  2.3558627 -0.800370628 -0.7190702  0.3273268  0.83469422  0.44159327
 [9,] -1.5738830  0.783696240  0.3543412  0.3273268 -1.58755567 -0.72889492
[10,]  0.6490035  0.005558129 -0.4420608  2.5095057  0.76104473 -0.86190494
[11,]  1.0459475 -0.605836100  1.1738273 -0.1091089  0.45826349  0.78741933
[12,] -0.1051902 -0.772579981  2.0510236 -0.1091089 -0.51554643 -1.07472098
[13,] -0.5815230 -0.050023164 -0.7767805  0.3273268  1.08019252 -1.76637309
[14,] -0.3433566 -0.383510926 -1.0307057 -0.9819805  0.54827953 -0.96831296
[15,] -0.8990782  0.644743006 -1.0422478 -0.5455447 -0.12274915  0.01596120
[16,] -0.4227454 -0.300138985 -0.6844440 -0.9819805 -0.31096451  1.10664338
[17,] -0.5815230 -1.439555504  0.1927524 -0.5455447  2.19311814  1.98450953
[18,]  0.1329762  1.811950171 -0.6382758  1.6366342 -0.89197715 -0.56928289
[19,]  2.2764739 -2.162112321 -1.1230422 -0.9819805  1.84942052  2.09091754
[20,]  0.4902259 -0.911533215 -1.2615469 -0.5455447 -0.02454983 -0.19685483
           Discus  PoleVault    Javelin      X1500m
 [1,]  0.96204598  0.6246390  0.7380189  0.30891685
 [2,]  1.88387613  0.4252861 -0.7019836  0.72848434
 [3,]  2.03533831  0.3588352  0.9280485  0.62976257
 [4,]  0.19951225  0.2259332  0.2946163 -1.42694080
 [5,] -0.68575807  0.6246390  1.4446476 -0.84283704
 [6,]  0.04543865  0.1594823 -0.2881413 -1.32821904
 [7,]  0.20734650  0.4917371 -1.4719556 -1.33644586
 [8,]  0.04282724  0.2923842  2.2737400 -1.71487928
 [9,] -0.71970649  0.2923842 -1.4719556  0.37473136
[10,] -0.64136398  0.2259332  0.8393680  0.39941180
[11,]  1.04822273  0.2259332  0.3565519 -0.77702254
[12,] -0.02245819  0.2259332 -1.2115446 -0.02015569
[13,] -0.01201252  0.2923842 -0.2529506  1.64166064
[14,] -0.85027733  0.2923842  1.2602485  1.13982501
[15,] -0.92078559  0.2259332 -0.3500768  0.78607203
[16,] -1.77733032  0.2259332 -0.6400480 -0.90865155
[17,]  1.03516565  0.4252861 -0.6442709  1.42776349
[18,] -0.09035502 -2.8972616 -0.2585811  0.13615376
[19,] -0.38283371  0.1594823 -0.3233319  0.25955597
[20,] -1.35689221 -2.8972616 -0.5203997  0.52281400
attr(,"scaled:center")
      X100m    LongJump     ShotPut    HighJump       X400m X110mHurdle 
    10.7465      7.3780     14.8630      1.9975     48.1100     14.3640 
     Discus   PoleVault     Javelin      X1500m 
    46.1160      4.3600     61.6270    275.8450 
attr(,"scaled:scale")
      X100m    LongJump     ShotPut    HighJump       X400m X110mHurdle 
 0.25192470  0.35983329  0.86639665  0.06873864  1.22200439  0.37591152 
     Discus   PoleVault     Javelin      X1500m 
 3.82933882  1.50486929  7.10415453 12.15537460

std_dev_vec_24 <- attr(decathlon_scaled, "scaled:scale")
std_dev_vec_24

      X100m    LongJump     ShotPut    HighJump       X400m X110mHurdle 
 0.25192470  0.35983329  0.86639665  0.06873864  1.22200439  0.37591152 
     Discus   PoleVault     Javelin      X1500m 
 3.82933882  1.50486929  7.10415453 12.15537460

compare_2020_2024 <- (warner-mean_vec_24)/std_dev_vec_24
compare_2020_2024

      X100m    LongJump     ShotPut    HighJump       X400m X110mHurdle 
-2.48685424  2.39555375 -0.07271496  0.32732684 -0.51554643 -2.40482120 
     Discus   PoleVault     Javelin      X1500m 
 0.66695587  0.35883515  0.25520278 -0.39200766

#Then we will multiply his standardized 2020 results (w/ respect to 2024) by the loadings 

loadings[,1:2]

                    [,1]        [,2]
X100m        0.446612410  0.01678592
LongJump    -0.493037935 -0.06565628
ShotPut     -0.309431542  0.53134100
HighJump    -0.150386291 -0.05665757
X400m        0.485973869  0.14847674
X110mHurdle  0.345270647  0.31685991
Discus      -0.139145881  0.59570522
PoleVault    0.019829831  0.48083261
Javelin      0.252821246  0.03137029
X1500m       0.005588625 -0.01948870

compare_2020_2024 %*% loadings[,1:2]

          [,1]       [,2]
[1,] -3.422699 -0.5092531

#His coordinates are -3.422699 -0.5092531

#Adding the point to the scatterplot

warner_pc <- as.data.frame(compare_2020_2024 %*% loadings[,1:2])

colnames(warner_pc) <- c("PC1", "PC2")


warner_pc$Athlete <- "Damian Warner (2020)"


warner_pc$Medal  <- "Gold (2020)"

scatter_decathlon + 
  geom_point(data = warner_pc, aes(x = PC1, y = PC2),
  color = "black", fill = "gold", size = 5, shape=21, stroke = 1.2) + 
  geom_text_repel(data = warner_pc, aes(x = PC1, y = PC2),
  color = "black", fill = "gold", size = 5, shape = 21)

Warning in geom_text_repel(data = warner_pc, aes(x = PC1, y = PC2), color =
"black", : Ignoring unknown parameters: `fill` and `shape`

#His point has been added as a gold filled circle with black outline

Do you think his 2020 performance would have won a medal if it had happened in 2024?

#Given how far in proximity Damian Warner is from the actual medalists of the 
#olympics - Leo Neugebaur, Lindon Victor and Markus Rooth- I would imagine that # Warner still would not have gotten a medal for his performance in the #Decathlon. Although it seems he would've gotten quite a bit closer to doing so #when compared to his 2024 performance of the Decathlon.

Question 2

Below is a screenshot of a conversation between me and chatbot Claude:

After looking at the graphs, I grew skeptical. So I said:

Behold, Claude’s three data sets which I’ve called claudeA, claudeB, and claudeC:

claudeA <- read.csv('C:/Users/lr7273ow/OneDrive - Minnesota State/Documents/Github/DSCI_415/Activities/Data/claude_dataA.csv')
claudeB <- read.csv('C:/Users/lr7273ow/OneDrive - Minnesota State/Documents/Github/DSCI_415/Activities/Data/claude_dataB.csv')
claudeC <- read.csv('C:/Users/lr7273ow/OneDrive - Minnesota State/Documents/Github/DSCI_415/Activities/Data/claude_dataC.csv')

Each data set has an X and a Y column which represent 2-dimensional variables that we need to rotate.

A)

Scale each data set and plot them side-by-side using the patchwork package. Make sure the aspect ratio of each graph is 1 (i.e., make the height and width of each graph equal). At this point, explain why you think I was skeptical. Specifically, do you think the percent variability explained by the first PC of each data set appears to exceed or fall short of the variability I asked it to?

library(patchwork)

(claudeA_scaled <- scale(claudeA))

                X           Y
 [1,] -1.57226981  1.66630497
 [2,] -1.41659953  1.29907523
 [3,] -1.26092925  1.48269010
 [4,] -1.10525898  0.93184548
 [5,] -0.94958870  1.11546035
 [6,] -0.79391842  0.56461573
 [7,] -0.63824814  0.74823061
 [8,] -0.48257786  0.19738599
 [9,] -0.32690758  0.38100086
[10,] -0.17123731 -0.16984376
[11,] -0.01556703  0.01377112
[12,]  0.14010325 -0.53707350
[13,]  0.29577353 -0.35345863
[14,]  0.45144381 -0.90430325
[15,]  0.60711409 -0.72068838
[16,]  0.76278436 -1.08791812
[17,] -1.51000170  1.84991984
[18,] -1.32319737  1.11546035
[19,] -1.19866114  1.66630497
[20,] -1.01185681  0.74823061
[21,] -0.88732059  1.29907523
[22,] -0.70051625  0.38100086
[23,] -0.57598003  0.93184548
[24,] -0.38917570  0.01377112
[25,] -0.26463947  0.56461573
[26,] -0.07783514 -0.35345863
[27,]  0.04670108  0.19738599
[28,]  0.23350542 -0.72068838
[29,]  0.35804164 -0.16984376
[30,]  0.54484597 -0.90430325
[31,]  0.66938220 -0.53707350
[32,]  0.85618653 -1.27153300
[33,]  0.91845464 -0.90430325
[34,]  1.07412492 -1.08791812
[35,]  1.22979520 -0.72068838
[36,]  1.38546548 -1.27153300
[37,]  1.54113576 -1.08791812
[38,]  1.69680603 -1.45514787
[39,]  1.85247631 -1.27153300
[40,]  2.00814659 -1.63876274
attr(,"scaled:center")
     X      Y 
90.500 69.625 
attr(,"scaled:scale")
       X        Y 
32.11917 27.23091

(claudeB_scaled <- scale(claudeB))

                X           Y
 [1,] -1.57226981 -1.28334200
 [2,] -1.41659953 -0.99430101
 [3,] -1.26092925 -1.39895839
 [4,] -1.10525898 -0.70526002
 [5,] -0.94958870 -1.10991740
 [6,] -0.79391842 -0.41621903
 [7,] -0.63824814 -0.82087641
 [8,] -0.48257786 -0.12717804
 [9,] -0.32690758 -0.53183542
[10,] -0.17123731  0.16186295
[11,] -0.01556703 -0.24279443
[12,]  0.14010325  0.45090395
[13,]  0.29577353  0.04624656
[14,]  0.45144381  0.73994494
[15,]  0.60711409  0.33528755
[16,]  0.76278436  1.02898593
[17,] -1.51000170 -1.57238299
[18,] -1.32319737 -0.82087641
[19,] -1.19866114 -1.68799939
[20,] -1.01185681 -0.53183542
[21,] -0.88732059 -1.39895839
[22,] -0.70051625 -0.24279443
[23,] -0.57598003 -1.10991740
[24,] -0.38917570  0.04624656
[25,] -0.26463947 -0.82087641
[26,] -0.07783514  0.33528755
[27,]  0.04670108 -0.53183542
[28,]  0.23350542  0.62432854
[29,]  0.35804164 -0.24279443
[30,]  0.54484597  0.91336953
[31,]  0.66938220  0.04624656
[32,]  0.85618653  1.20241052
[33,]  0.91845464  0.91336953
[34,]  1.07412492  1.31802692
[35,]  1.22979520  0.62432854
[36,]  1.38546548  1.60706791
[37,]  1.54113576  0.91336953
[38,]  1.69680603  1.89610890
[39,]  1.85247631  1.20241052
[40,]  2.00814659  2.18514989
attr(,"scaled:center")
   X    Y 
90.5 72.2 
attr(,"scaled:scale")
       X        Y 
32.11917 17.29858

(claudeC_scaled <- scale(claudeC))

                X           Y
 [1,] -1.57226981  1.63878255
 [2,] -1.41659953  1.31808146
 [3,] -1.26092925  1.44636190
 [4,] -1.10525898  0.99738038
 [5,] -0.94958870  1.12566081
 [6,] -0.79391842  0.67667929
 [7,] -0.63824814  0.80495972
 [8,] -0.48257786  0.35597820
 [9,] -0.32690758  0.48425864
[10,] -0.17123731  0.03527712
[11,] -0.01556703  0.16355755
[12,]  0.14010325 -0.28542397
[13,]  0.29577353 -0.15714353
[14,]  0.45144381 -0.60612505
[15,]  0.60711409 -0.47784462
[16,]  0.76278436 -0.92682614
[17,] -1.51000170  1.51050211
[18,] -1.32319737  1.18980103
[19,] -1.19866114  1.31808146
[20,] -1.01185681  0.86909994
[21,] -0.88732059  0.99738038
[22,] -0.70051625  0.54839886
[23,] -0.57598003  0.67667929
[24,] -0.38917570  0.22769777
[25,] -0.26463947  0.35597820
[26,] -0.07783514 -0.09300331
[27,]  0.04670108  0.03527712
[28,]  0.23350542 -0.41370440
[29,]  0.35804164 -0.28542397
[30,]  0.54484597 -0.73440549
[31,]  0.66938220 -0.60612505
[32,]  0.85618653 -1.05510657
[33,]  0.91845464 -0.79854570
[34,]  1.07412492 -1.11924679
[35,]  1.22979520 -0.99096635
[36,]  1.38546548 -1.43994787
[37,]  1.54113576 -1.31166744
[38,]  1.69680603 -1.76064896
[39,]  1.85247631 -1.63236852
[40,]  2.00814659 -2.08135004
attr(,"scaled:center")
    X     Y 
90.50 94.45 
attr(,"scaled:scale")
       X        Y 
32.11917 15.59084

claudeA_scaled <- as.data.frame(scale(claudeA))


claudeB_scaled <- as.data.frame(scale(claudeB))

claudeC_scaled <- as.data.frame(scale(claudeC))


library(ggplot2)

plotA <- ggplot(claudeA_scaled, aes(x = X, y = Y)) +
  geom_point(color = "steelblue", size = 2) +
  coord_equal() +  # ensures 1:1 aspect ratio
  labs(title = "Claude A (Scaled)") +
  theme_minimal()

plotA

plotB <- ggplot(claudeB_scaled, aes(x = X, y = Y)) +
  geom_point(color = "darkgreen", size = 2) +
  coord_equal() +
  labs(title = "Claude B (Scaled)") +
  theme_minimal()

plotB

plotC <- ggplot(claudeC_scaled, aes(x = X, y = Y)) +
  geom_point(color = "firebrick", size = 2) +
  coord_equal() +
  labs(title = "Claude C (Scaled)") +
  theme_minimal()

plotC

plotA + plotB + plotC

#I think the reason you were skeptical was because of the fact that there is
#we can see that with the scaling of the dataset there is indeed a whole lot #of varibility within each of the claudes. This is pontificated by the fact 
#that the Claude A, Claude B and Claude C are so diagonal. PCA identifies
#the direction of greatest variance, which means the greatest variance sits #across a axis line. So the first principal component should explain 100%  
#of the variance which is a lot of variance, more than 50%, 75% and even 90% #of the variance wanted for each dataset. This would be right to be skeptical 
#about

B)

Use SVD to find the first PC for each data set, and find the actual percent of total variability explained by each PC using aggregation methods.

svd_claudeA <- svd(claudeA_scaled)

svd_claudeB <- svd(claudeB_scaled)

svd_claudeC <- svd(claudeC_scaled)


var_explained_A <- svd_claudeA$d^2 / sum(svd_claudeA$d^2)


var_explained_B <- svd_claudeB$d^2 / sum(svd_claudeB$d^2)


var_explained_C <- svd_claudeC$d^2 / sum(svd_claudeC$d^2)

#displaying the variance values as percentages

data.frame(
  Dataset = c("Claude A", "Claude B", "Claude C"),
  PC1 = c(var_explained_A[1], var_explained_B[1], var_explained_C[1])* 100 ,
  PC2 = c(var_explained_A[2], var_explained_B[2], var_explained_C[2]) * 100 
)

   Dataset      PC1       PC2
1 Claude A 97.56201 2.4379913
2 Claude B 95.91807 4.0819323
3 Claude C 99.49096 0.5090442

#The actual PC1s and PC2s for each claude dataset
#   Dataset      PC1       PC2
#  Claude A 97.56    2.44
#  Claude B 95.92    4.08
#  Claude C 99.50    0.51

#The actual values show that the PC1s explain nearly all of the variance
#and the PC2s explain almost none of the variance (very low values). 
#far above the  required threshold of 50%,75% or 90%. Hence the linear 
#shape of the dots.