Petr Pajdla & Peter Tkáč
AES_707: Statistics seminar for archaeologists
28. 4. 2022
Today:
Imagine you have radiocarbon data from the whole Czech republic and you want to know from which regions are most of the neolithic data comming. In other words, which districts have most neolithic dates.
Lasoles_14C_database.csv from ZENODO depositary (https://doi.org/10.5281/zenodo.5728242) read.csv).Tkáč, P. and Kolář, J., 2021. Towards New Demography Proxies and Regional Chronologies: Radiocarbon Dates from Archaeological Contexts Located in the Czech Republic Covering the Period Between 10,000 BC and AD 1250. Journal of Open Archaeology Data, 9, p.9. DOI: http://doi.org/10.5334/joad.85
here::here(), ncol(), nrow(), head(), str().dplyr package, filter data from the Neolithic period and then count number of observation per district. Put the districts in order, from highest to lowest number of the observations. str() check the numeric variables in your dataframe. If they are a factor, you have to change it to numeric by:
dataframe$variable <- as.numeric(as.character(dataframe$variable))
names(), str(), head(), filter(), summarise(), %>%, arrange(), group_by().Activity_CZ). In other words, is there any difference between number of dates coming from graveyards (poh) and from settlements (sid)? The curse of higher dimensions…
Goal: Find low-dimensional representation of the observations that explain a good fraction of the variation.
data("DartPoints", package = "archdata")
dp <- DartPoints %>%
select(Length, Width, Thickness, Weight) %>% # select columns
as.matrix() # format as a matrix
# set row names with IDs
rownames(dp) <- DartPoints$Catalog
head(dp, 4)
Length Width Thickness Weight
41-0322 42.8 15.8 5.8 3.6
35-2946 40.5 17.4 5.8 4.5
35-2921 37.5 16.3 6.1 3.6
36-3487 40.3 16.1 6.3 4.0
plot(dp)
dp_pca <- prcomp(dp)
dp_pca
Standard deviations (1, .., p=4):
[1] 13.937973 3.238177 1.633327 1.192596
Rotation (n x k) = (4 x 4):
PC1 PC2 PC3 PC4
Length 0.90926230 0.3813173 0.1663782 0.01255058
Width 0.30499443 -0.8831253 0.3562343 0.01285657
Thickness 0.06682732 -0.0564513 -0.1616860 -0.98295728
Weight 0.27523551 -0.2673951 -0.9051371 0.18295404
summary(dp_pca)
Importance of components:
PC1 PC2 PC3 PC4
Standard deviation 13.9380 3.23818 1.63333 1.19260
Proportion of Variance 0.9302 0.05021 0.01277 0.00681
Cumulative Proportion 0.9302 0.98042 0.99319 1.00000
plot(dp_pca)
Scree plot
biplot(dp_pca)
ggbiplot::ggbiplot(dp_pca)
# structure of the prcomp object
str(dp_pca)
List of 5
$ sdev : num [1:4] 13.94 3.24 1.63 1.19
$ rotation: num [1:4, 1:4] 0.9093 0.305 0.0668 0.2752 0.3813 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:4] "Length" "Width" "Thickness" "Weight"
.. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
$ center : Named num [1:4] 49.33 22.08 7.27 7.64
..- attr(*, "names")= chr [1:4] "Length" "Width" "Thickness" "Weight"
$ scale : logi FALSE
$ x : num [1:91, 1:4] -9.06 -10.42 -13.71 -11.1 -20.24 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:91] "41-0322" "35-2946" "35-2921" "36-3487" ...
.. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
- attr(*, "class")= chr "prcomp"
# extract the coordinates
dp_pca_x <- dp_pca$x
head(dp_pca_x)
PC1 PC2 PC3 PC4
41-0322 -9.063678 4.217119 0.57461646 0.54402974
35-2946 -10.419278 1.686433 -0.05270198 0.70039255
35-2921 -13.710223 1.737639 -0.17757672 0.18905277
36-3487 -11.101828 2.863704 -0.17735660 0.09821324
36-3321 -20.238281 -1.133789 0.47561971 1.93910904
35-2959 -9.946694 3.209080 1.58242087 2.10559068
class(dp_pca_x)
[1] "matrix" "array"
dp_pca_x <- as.data.frame(dp_pca_x)
class(dp_pca_x)
[1] "data.frame"
dp_pca_x %>%
bind_cols(
type = DartPoints$Name
) %>%
ggplot(
aes(PC1, PC2,
color = type)
) +
geom_point()
(James et al. 2013)
(James et al. 2013)
data(Fibulae, package = "archdata")
# create matrix
fib <- Fibulae %>%
select(
tot.len = Length,
bow.hei = BH,
foot.len = FL
) %>%
as.matrix()
head(fib)
tot.len bow.hei foot.len
[1,] 114 24 93
[2,] 35 7 21
[3,] 60 15 33
[4,] 74 26 23
[5,] 68 23 20
[6,] 55 15 27
plot(fib)
# scale + PCA
fib_pca <- prcomp(scale(fib))
# summary of a PCA
summary(fib_pca)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.4691 0.8941 0.20585
Proportion of Variance 0.7194 0.2665 0.01412
Cumulative Proportion 0.7194 0.9859 1.00000
plot(fib_pca$x)
factoextra.factoextra::fviz_nbclust(fib_pca$x, kmeans)
fib_km <- kmeans(fib_pca$x, centers = 2)
fib_km
K-means clustering with 2 clusters of sizes 24, 6
Cluster means:
PC1 PC2 PC3
1 0.6099434 -0.007314937 -0.0001198279
2 -2.4397737 0.029259748 0.0004793116
Clustering vector:
[1] 2 1 1 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2
Within cluster sum of squares by cluster:
[1] 22.13457 20.21529
(between_SS / total_SS = 51.3 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
# cluster assignment
fib_km$cluster
[1] 2 1 1 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2
# create a data frame with clusters
fib_df <- fib_pca$x %>%
as.data.frame() %>%
bind_cols(
clust = fib_km$cluster)
head(fib_df, 4)
PC1 PC2 PC3 clust
1 -4.22727376 0.03044856 0.60841190 2
2 1.76360759 1.78652460 -0.05839999 1
3 -0.08164798 0.43974905 -0.05057241 1
4 -1.28463323 -2.07589875 -0.24087960 2
fib_df %>%
ggplot(
aes(PC1, PC2,
color = factor(clust))) +
geom_point(size = 4) +
theme_minimal()
Output is a dendrogram - a hierarchical structure visualizing the cluster growth.
Starts with a distance matrix.
# data set same as in k-means
head(fib)
tot.len bow.hei foot.len
[1,] 114 24 93
[2,] 35 7 21
[3,] 60 15 33
[4,] 74 26 23
[5,] 68 23 20
[6,] 55 15 27
# create distance matrix
fib_dist <- dist(fib)
class(fib_dist)
[1] "dist"
as.matrix(fib_dist)[1:6, 1:4]
1 2 3 4
1 0.00000 108.23123 81.22192 80.647381
2 108.23123 0.00000 28.86174 43.428102
3 81.22192 28.86174 0.00000 20.420578
4 80.64738 43.42810 20.42058 0.000000
5 86.29021 36.68787 17.23369 7.348469
6 88.98314 22.36068 7.81025 22.315914
fib_hclust <- hclust(fib_dist)
fib_hclust
Call:
hclust(d = fib_dist)
Cluster method : complete
Distance : euclidean
Number of objects: 30
plot(fib_hclust)
cutree(x, k, h) function:
h35 <- cutree(fib_hclust, h = 35)
h35
[1] 1 2 3 4 4 3 2 2 3 2 4 4 3 2 2 2 2 2 2 3 1 3 2 2 2 3 3 2 2 1
k3 <- cutree(fib_hclust, k = 3)
k3
[1] 1 2 3 3 3 3 2 2 3 2 3 3 3 2 2 2 2 2 2 3 1 3 2 2 2 3 3 2 2 1
plot(fib, col = h35)
plot(fib, col = k3)
method = "complete"method = "single"method = "average"method = "ward.D2"